quicktime/ffmpeg/libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22
  23 /**
  24  * @file dsputil.c
  25  * DSP utils
  26  */
  27
  28 #include "avcodec.h"
  29 #include "dsputil.h"
  30 #include "mpegvideo.h"
  31 #include "simple_idct.h"
  32 #include "faandct.h"
  33
  34 /* snow.c */
  35 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  36
  37 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  38 uint32_t squareTbl[512] = {0, };
  39
  40 const uint8_t ff_zigzag_direct[64] = {
  41     0,   1,  8, 16,  9,  2,  3, 10,
  42     17, 24, 32, 25, 18, 11,  4,  5,
  43     12, 19, 26, 33, 40, 48, 41, 34,
  44     27, 20, 13,  6,  7, 14, 21, 28,
  45     35, 42, 49, 56, 57, 50, 43, 36,
  46     29, 22, 15, 23, 30, 37, 44, 51,
  47     58, 59, 52, 45, 38, 31, 39, 46,
  48     53, 60, 61, 54, 47, 55, 62, 63
  49 };
  50
  51 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  52    specification, we interleave the fields */
  53 const uint8_t ff_zigzag248_direct[64] = {
  54      0,  8,  1,  9, 16, 24,  2, 10,
  55     17, 25, 32, 40, 48, 56, 33, 41,
  56     18, 26,  3, 11,  4, 12, 19, 27,
  57     34, 42, 49, 57, 50, 58, 35, 43,
  58     20, 28,  5, 13,  6, 14, 21, 29,
  59     36, 44, 51, 59, 52, 60, 37, 45,
  60     22, 30,  7, 15, 23, 31, 38, 46,
  61     53, 61, 54, 62, 39, 47, 55, 63,
  62 };
  63
  64 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  65 uint16_t __align8 inv_zigzag_direct16[64] = {0, };
  66
  67 const uint8_t ff_alternate_horizontal_scan[64] = {
  68     0,  1,   2,  3,  8,  9, 16, 17,
  69     10, 11,  4,  5,  6,  7, 15, 14,
  70     13, 12, 19, 18, 24, 25, 32, 33,
  71     26, 27, 20, 21, 22, 23, 28, 29,
  72     30, 31, 34, 35, 40, 41, 48, 49,
  73     42, 43, 36, 37, 38, 39, 44, 45,
  74     46, 47, 50, 51, 56, 57, 58, 59,
  75     52, 53, 54, 55, 60, 61, 62, 63,
  76 };
  77
  78 const uint8_t ff_alternate_vertical_scan[64] = {
  79     0,  8,  16, 24,  1,  9,  2, 10,
  80     17, 25, 32, 40, 48, 56, 57, 49,
  81     41, 33, 26, 18,  3, 11,  4, 12,
  82     19, 27, 34, 42, 50, 58, 35, 43,
  83     51, 59, 20, 28,  5, 13,  6, 14,
  84     21, 29, 36, 44, 52, 60, 37, 45,
  85     53, 61, 22, 30,  7, 15, 23, 31,
  86     38, 46, 54, 62, 39, 47, 55, 63,
  87 };
  88
  89 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
  90 const uint32_t inverse[256]={
  91          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  92  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  93  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  94  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  95  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  96  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  97   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  98   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  99   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 100   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 101   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 102   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 103   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 104   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 105   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 106   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 107   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 108   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 109   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 110   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 111   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 112   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 113   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 114   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 115   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 116   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 117   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 118   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 119   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 120   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 121   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 122   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 123 };
 124
 125 /* Input permutation for the simple_idct_mmx */
 126 static const uint8_t simple_mmx_permutation[64]={
 127         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 128         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 129         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 130         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 131         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 132         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 133         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 134         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 135 };
 136
 137 static int pix_sum_c(uint8_t * pix, int line_size)
 138 {
 139     int s, i, j;
 140
 141     s = 0;
 142     for (i = 0; i < 16; i++) {
 143         for (j = 0; j < 16; j += 8) {
 144             s += pix[0];
 145             s += pix[1];
 146             s += pix[2];
 147             s += pix[3];
 148             s += pix[4];
 149             s += pix[5];
 150             s += pix[6];
 151             s += pix[7];
 152             pix += 8;
 153         }
 154         pix += line_size - 16;
 155     }
 156     return s;
 157 }
 158
 159 static int pix_norm1_c(uint8_t * pix, int line_size)
 160 {
 161     int s, i, j;
 162     uint32_t *sq = squareTbl + 256;
 163
 164     s = 0;
 165     for (i = 0; i < 16; i++) {
 166         for (j = 0; j < 16; j += 8) {
 167 #if 0
 168             s += sq[pix[0]];
 169             s += sq[pix[1]];
 170             s += sq[pix[2]];
 171             s += sq[pix[3]];
 172             s += sq[pix[4]];
 173             s += sq[pix[5]];
 174             s += sq[pix[6]];
 175             s += sq[pix[7]];
 176 #else
 177 #if LONG_MAX > 2147483647
 178             register uint64_t x=*(uint64_t*)pix;
 179             s += sq[x&0xff];
 180             s += sq[(x>>8)&0xff];
 181             s += sq[(x>>16)&0xff];
 182             s += sq[(x>>24)&0xff];
 183             s += sq[(x>>32)&0xff];
 184             s += sq[(x>>40)&0xff];
 185             s += sq[(x>>48)&0xff];
 186             s += sq[(x>>56)&0xff];
 187 #else
 188             register uint32_t x=*(uint32_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             x=*(uint32_t*)(pix+4);
 194             s += sq[x&0xff];
 195             s += sq[(x>>8)&0xff];
 196             s += sq[(x>>16)&0xff];
 197             s += sq[(x>>24)&0xff];
 198 #endif
 199 #endif
 200             pix += 8;
 201         }
 202         pix += line_size - 16;
 203     }
 204     return s;
 205 }
 206
 207 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 208     int i;
 209
 210     for(i=0; i+8<=w; i+=8){
 211         dst[i+0]= bswap_32(src[i+0]);
 212         dst[i+1]= bswap_32(src[i+1]);
 213         dst[i+2]= bswap_32(src[i+2]);
 214         dst[i+3]= bswap_32(src[i+3]);
 215         dst[i+4]= bswap_32(src[i+4]);
 216         dst[i+5]= bswap_32(src[i+5]);
 217         dst[i+6]= bswap_32(src[i+6]);
 218         dst[i+7]= bswap_32(src[i+7]);
 219     }
 220     for(;i<w; i++){
 221         dst[i+0]= bswap_32(src[i+0]);
 222     }
 223 }
 224
 225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 226 {
 227     int s, i;
 228     uint32_t *sq = squareTbl + 256;
 229
 230     s = 0;
 231     for (i = 0; i < h; i++) {
 232         s += sq[pix1[0] - pix2[0]];
 233         s += sq[pix1[1] - pix2[1]];
 234         s += sq[pix1[2] - pix2[2]];
 235         s += sq[pix1[3] - pix2[3]];
 236         pix1 += line_size;
 237         pix2 += line_size;
 238     }
 239     return s;
 240 }
 241
 242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 243 {
 244     int s, i;
 245     uint32_t *sq = squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < h; i++) {
 249         s += sq[pix1[0] - pix2[0]];
 250         s += sq[pix1[1] - pix2[1]];
 251         s += sq[pix1[2] - pix2[2]];
 252         s += sq[pix1[3] - pix2[3]];
 253         s += sq[pix1[4] - pix2[4]];
 254         s += sq[pix1[5] - pix2[5]];
 255         s += sq[pix1[6] - pix2[6]];
 256         s += sq[pix1[7] - pix2[7]];
 257         pix1 += line_size;
 258         pix2 += line_size;
 259     }
 260     return s;
 261 }
 262
 263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 264 {
 265     int s, i;
 266     uint32_t *sq = squareTbl + 256;
 267
 268     s = 0;
 269     for (i = 0; i < h; i++) {
 270         s += sq[pix1[ 0] - pix2[ 0]];
 271         s += sq[pix1[ 1] - pix2[ 1]];
 272         s += sq[pix1[ 2] - pix2[ 2]];
 273         s += sq[pix1[ 3] - pix2[ 3]];
 274         s += sq[pix1[ 4] - pix2[ 4]];
 275         s += sq[pix1[ 5] - pix2[ 5]];
 276         s += sq[pix1[ 6] - pix2[ 6]];
 277         s += sq[pix1[ 7] - pix2[ 7]];
 278         s += sq[pix1[ 8] - pix2[ 8]];
 279         s += sq[pix1[ 9] - pix2[ 9]];
 280         s += sq[pix1[10] - pix2[10]];
 281         s += sq[pix1[11] - pix2[11]];
 282         s += sq[pix1[12] - pix2[12]];
 283         s += sq[pix1[13] - pix2[13]];
 284         s += sq[pix1[14] - pix2[14]];
 285         s += sq[pix1[15] - pix2[15]];
 286
 287         pix1 += line_size;
 288         pix2 += line_size;
 289     }
 290     return s;
 291 }
 292
 293
 294 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 295 #ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
 296     int s, i, j;
 297     const int dec_count= w==8 ? 3 : 4;
 298     int tmp[16*16];
 299 #if 0
 300     int level, ori;
 301     static const int scale[2][2][4][4]={
 302       {
 303         {
 304             //8x8 dec=3
 305             {268, 239, 239, 213},
 306             {  0, 224, 224, 152},
 307             {  0, 135, 135, 110},
 308         },{
 309             //16x16 dec=4
 310             {344, 310, 310, 280},
 311             {  0, 320, 320, 228},
 312             {  0, 175, 175, 136},
 313             {  0, 129, 129, 102},
 314         }
 315       },{
 316         {//FIXME 5/3
 317             //8x8 dec=3
 318             {275, 245, 245, 218},
 319             {  0, 230, 230, 156},
 320             {  0, 138, 138, 113},
 321         },{
 322             //16x16 dec=4
 323             {352, 317, 317, 286},
 324             {  0, 328, 328, 233},
 325             {  0, 180, 180, 140},
 326             {  0, 132, 132, 105},
 327         }
 328       }
 329     };
 330 #endif
 331
 332     for (i = 0; i < h; i++) {
 333         for (j = 0; j < w; j+=4) {
 334             tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 335             tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 336             tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 337             tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 338         }
 339         pix1 += line_size;
 340         pix2 += line_size;
 341     }
 342
 343     ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
 344
 345     s=0;
 346 #if 0
 347     for(level=0; level<dec_count; level++){
 348         for(ori= level ? 1 : 0; ori<4; ori++){
 349             int sx= (ori&1) ? 1<<level: 0;
 350             int stride= 16<<(dec_count-level);
 351             int sy= (ori&2) ? stride>>1 : 0;
 352             int size= 1<<level;
 353
 354             for(i=0; i<size; i++){
 355                 for(j=0; j<size; j++){
 356                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 357                     s += ABS(v);
 358                 }
 359             }
 360         }
 361     }
 362 #endif
 363     for (i = 0; i < h; i++) {
 364         for (j = 0; j < w; j+=4) {
 365             s+= ABS(tmp[16*i+j+0]);
 366             s+= ABS(tmp[16*i+j+1]);
 367             s+= ABS(tmp[16*i+j+2]);
 368             s+= ABS(tmp[16*i+j+3]);
 369         }
 370     }
 371     assert(s>=0);
 372
 373     return s>>2;
 374 #endif
 375 }
 376
 377 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 378     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 379 }
 380
 381 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 382     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 383 }
 384
 385 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 386     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 387 }
 388
 389 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 390     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 391 }
 392
 393 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 394 {
 395     int i;
 396
 397     /* read the pixels */
 398     for(i=0;i<8;i++) {
 399         block[0] = pixels[0];
 400         block[1] = pixels[1];
 401         block[2] = pixels[2];
 402         block[3] = pixels[3];
 403         block[4] = pixels[4];
 404         block[5] = pixels[5];
 405         block[6] = pixels[6];
 406         block[7] = pixels[7];
 407         pixels += line_size;
 408         block += 8;
 409     }
 410 }
 411
 412 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 413                           const uint8_t *s2, int stride){
 414     int i;
 415
 416     /* read the pixels */
 417     for(i=0;i<8;i++) {
 418         block[0] = s1[0] - s2[0];
 419         block[1] = s1[1] - s2[1];
 420         block[2] = s1[2] - s2[2];
 421         block[3] = s1[3] - s2[3];
 422         block[4] = s1[4] - s2[4];
 423         block[5] = s1[5] - s2[5];
 424         block[6] = s1[6] - s2[6];
 425         block[7] = s1[7] - s2[7];
 426         s1 += stride;
 427         s2 += stride;
 428         block += 8;
 429     }
 430 }
 431
 432
 433 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 434                                  int line_size)
 435 {
 436     int i;
 437     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 438
 439     /* read the pixels */
 440     for(i=0;i<8;i++) {
 441         pixels[0] = cm[block[0]];
 442         pixels[1] = cm[block[1]];
 443         pixels[2] = cm[block[2]];
 444         pixels[3] = cm[block[3]];
 445         pixels[4] = cm[block[4]];
 446         pixels[5] = cm[block[5]];
 447         pixels[6] = cm[block[6]];
 448         pixels[7] = cm[block[7]];
 449
 450         pixels += line_size;
 451         block += 8;
 452     }
 453 }
 454
 455 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 456                                  int line_size)
 457 {
 458     int i;
 459     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 460
 461     /* read the pixels */
 462     for(i=0;i<4;i++) {
 463         pixels[0] = cm[block[0]];
 464         pixels[1] = cm[block[1]];
 465         pixels[2] = cm[block[2]];
 466         pixels[3] = cm[block[3]];
 467
 468         pixels += line_size;
 469         block += 8;
 470     }
 471 }
 472
 473 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 474                                  int line_size)
 475 {
 476     int i;
 477     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 478
 479     /* read the pixels */
 480     for(i=0;i<2;i++) {
 481         pixels[0] = cm[block[0]];
 482         pixels[1] = cm[block[1]];
 483
 484         pixels += line_size;
 485         block += 8;
 486     }
 487 }
 488
 489 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 490                                         uint8_t *restrict pixels,
 491                                         int line_size)
 492 {
 493     int i, j;
 494
 495     for (i = 0; i < 8; i++) {
 496         for (j = 0; j < 8; j++) {
 497             if (*block < -128)
 498                 *pixels = 0;
 499             else if (*block > 127)
 500                 *pixels = 255;
 501             else
 502                 *pixels = (uint8_t)(*block + 128);
 503             block++;
 504             pixels++;
 505         }
 506         pixels += (line_size - 8);
 507     }
 508 }
 509
 510 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 511                           int line_size)
 512 {
 513     int i;
 514     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 515
 516     /* read the pixels */
 517     for(i=0;i<8;i++) {
 518         pixels[0] = cm[pixels[0] + block[0]];
 519         pixels[1] = cm[pixels[1] + block[1]];
 520         pixels[2] = cm[pixels[2] + block[2]];
 521         pixels[3] = cm[pixels[3] + block[3]];
 522         pixels[4] = cm[pixels[4] + block[4]];
 523         pixels[5] = cm[pixels[5] + block[5]];
 524         pixels[6] = cm[pixels[6] + block[6]];
 525         pixels[7] = cm[pixels[7] + block[7]];
 526         pixels += line_size;
 527         block += 8;
 528     }
 529 }
 530
 531 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 532                           int line_size)
 533 {
 534     int i;
 535     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 536
 537     /* read the pixels */
 538     for(i=0;i<4;i++) {
 539         pixels[0] = cm[pixels[0] + block[0]];
 540         pixels[1] = cm[pixels[1] + block[1]];
 541         pixels[2] = cm[pixels[2] + block[2]];
 542         pixels[3] = cm[pixels[3] + block[3]];
 543         pixels += line_size;
 544         block += 8;
 545     }
 546 }
 547
 548 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 549                           int line_size)
 550 {
 551     int i;
 552     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 553
 554     /* read the pixels */
 555     for(i=0;i<2;i++) {
 556         pixels[0] = cm[pixels[0] + block[0]];
 557         pixels[1] = cm[pixels[1] + block[1]];
 558         pixels += line_size;
 559         block += 8;
 560     }
 561 }
 562
 563 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 564 {
 565     int i;
 566     for(i=0;i<8;i++) {
 567         pixels[0] += block[0];
 568         pixels[1] += block[1];
 569         pixels[2] += block[2];
 570         pixels[3] += block[3];
 571         pixels[4] += block[4];
 572         pixels[5] += block[5];
 573         pixels[6] += block[6];
 574         pixels[7] += block[7];
 575         pixels += line_size;
 576         block += 8;
 577     }
 578 }
 579
 580 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 581 {
 582     int i;
 583     for(i=0;i<4;i++) {
 584         pixels[0] += block[0];
 585         pixels[1] += block[1];
 586         pixels[2] += block[2];
 587         pixels[3] += block[3];
 588         pixels += line_size;
 589         block += 4;
 590     }
 591 }
 592
 593 #if 0
 594
 595 #define PIXOP2(OPNAME, OP) \
 596 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 597 {\
 598     int i;\
 599     for(i=0; i<h; i++){\
 600         OP(*((uint64_t*)block), LD64(pixels));\
 601         pixels+=line_size;\
 602         block +=line_size;\
 603     }\
 604 }\
 605 \
 606 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 607 {\
 608     int i;\
 609     for(i=0; i<h; i++){\
 610         const uint64_t a= LD64(pixels  );\
 611         const uint64_t b= LD64(pixels+1);\
 612         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 613         pixels+=line_size;\
 614         block +=line_size;\
 615     }\
 616 }\
 617 \
 618 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 619 {\
 620     int i;\
 621     for(i=0; i<h; i++){\
 622         const uint64_t a= LD64(pixels  );\
 623         const uint64_t b= LD64(pixels+1);\
 624         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 625         pixels+=line_size;\
 626         block +=line_size;\
 627     }\
 628 }\
 629 \
 630 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 631 {\
 632     int i;\
 633     for(i=0; i<h; i++){\
 634         const uint64_t a= LD64(pixels          );\
 635         const uint64_t b= LD64(pixels+line_size);\
 636         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 637         pixels+=line_size;\
 638         block +=line_size;\
 639     }\
 640 }\
 641 \
 642 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 643 {\
 644     int i;\
 645     for(i=0; i<h; i++){\
 646         const uint64_t a= LD64(pixels          );\
 647         const uint64_t b= LD64(pixels+line_size);\
 648         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 649         pixels+=line_size;\
 650         block +=line_size;\
 651     }\
 652 }\
 653 \
 654 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 655 {\
 656         int i;\
 657         const uint64_t a= LD64(pixels  );\
 658         const uint64_t b= LD64(pixels+1);\
 659         uint64_t l0=  (a&0x0303030303030303ULL)\
 660                     + (b&0x0303030303030303ULL)\
 661                     + 0x0202020202020202ULL;\
 662         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 663                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 664         uint64_t l1,h1;\
 665 \
 666         pixels+=line_size;\
 667         for(i=0; i<h; i+=2){\
 668             uint64_t a= LD64(pixels  );\
 669             uint64_t b= LD64(pixels+1);\
 670             l1=  (a&0x0303030303030303ULL)\
 671                + (b&0x0303030303030303ULL);\
 672             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 673               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 674             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 675             pixels+=line_size;\
 676             block +=line_size;\
 677             a= LD64(pixels  );\
 678             b= LD64(pixels+1);\
 679             l0=  (a&0x0303030303030303ULL)\
 680                + (b&0x0303030303030303ULL)\
 681                + 0x0202020202020202ULL;\
 682             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 683               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 684             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 685             pixels+=line_size;\
 686             block +=line_size;\
 687         }\
 688 }\
 689 \
 690 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 691 {\
 692         int i;\
 693         const uint64_t a= LD64(pixels  );\
 694         const uint64_t b= LD64(pixels+1);\
 695         uint64_t l0=  (a&0x0303030303030303ULL)\
 696                     + (b&0x0303030303030303ULL)\
 697                     + 0x0101010101010101ULL;\
 698         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 699                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 700         uint64_t l1,h1;\
 701 \
 702         pixels+=line_size;\
 703         for(i=0; i<h; i+=2){\
 704             uint64_t a= LD64(pixels  );\
 705             uint64_t b= LD64(pixels+1);\
 706             l1=  (a&0x0303030303030303ULL)\
 707                + (b&0x0303030303030303ULL);\
 708             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 709               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 710             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 711             pixels+=line_size;\
 712             block +=line_size;\
 713             a= LD64(pixels  );\
 714             b= LD64(pixels+1);\
 715             l0=  (a&0x0303030303030303ULL)\
 716                + (b&0x0303030303030303ULL)\
 717                + 0x0101010101010101ULL;\
 718             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 719               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 720             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 721             pixels+=line_size;\
 722             block +=line_size;\
 723         }\
 724 }\
 725 \
 726 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 727 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 728 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 729 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 730 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 731 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 733
 734 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 735 #else // 64 bit variant
 736
 737 #define PIXOP2(OPNAME, OP) \
 738 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 739     int i;\
 740     for(i=0; i<h; i++){\
 741         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 742         pixels+=line_size;\
 743         block +=line_size;\
 744     }\
 745 }\
 746 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 747     int i;\
 748     for(i=0; i<h; i++){\
 749         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 750         pixels+=line_size;\
 751         block +=line_size;\
 752     }\
 753 }\
 754 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 755     int i;\
 756     for(i=0; i<h; i++){\
 757         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 758         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 759         pixels+=line_size;\
 760         block +=line_size;\
 761     }\
 762 }\
 763 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 764     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 765 }\
 766 \
 767 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 768                                                 int src_stride1, int src_stride2, int h){\
 769     int i;\
 770     for(i=0; i<h; i++){\
 771         uint32_t a,b;\
 772         a= LD32(&src1[i*src_stride1  ]);\
 773         b= LD32(&src2[i*src_stride2  ]);\
 774         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 775         a= LD32(&src1[i*src_stride1+4]);\
 776         b= LD32(&src2[i*src_stride2+4]);\
 777         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 778     }\
 779 }\
 780 \
 781 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 782                                                 int src_stride1, int src_stride2, int h){\
 783     int i;\
 784     for(i=0; i<h; i++){\
 785         uint32_t a,b;\
 786         a= LD32(&src1[i*src_stride1  ]);\
 787         b= LD32(&src2[i*src_stride2  ]);\
 788         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 789         a= LD32(&src1[i*src_stride1+4]);\
 790         b= LD32(&src2[i*src_stride2+4]);\
 791         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 792     }\
 793 }\
 794 \
 795 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 796                                                 int src_stride1, int src_stride2, int h){\
 797     int i;\
 798     for(i=0; i<h; i++){\
 799         uint32_t a,b;\
 800         a= LD32(&src1[i*src_stride1  ]);\
 801         b= LD32(&src2[i*src_stride2  ]);\
 802         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 803     }\
 804 }\
 805 \
 806 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 807                                                 int src_stride1, int src_stride2, int h){\
 808     int i;\
 809     for(i=0; i<h; i++){\
 810         uint32_t a,b;\
 811         a= LD16(&src1[i*src_stride1  ]);\
 812         b= LD16(&src2[i*src_stride2  ]);\
 813         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 814     }\
 815 }\
 816 \
 817 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 818                                                 int src_stride1, int src_stride2, int h){\
 819     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 820     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 821 }\
 822 \
 823 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 824                                                 int src_stride1, int src_stride2, int h){\
 825     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 826     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 827 }\
 828 \
 829 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 830     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 831 }\
 832 \
 833 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 834     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 835 }\
 836 \
 837 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 838     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 839 }\
 840 \
 841 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 842     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 843 }\
 844 \
 845 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 846                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 847     int i;\
 848     for(i=0; i<h; i++){\
 849         uint32_t a, b, c, d, l0, l1, h0, h1;\
 850         a= LD32(&src1[i*src_stride1]);\
 851         b= LD32(&src2[i*src_stride2]);\
 852         c= LD32(&src3[i*src_stride3]);\
 853         d= LD32(&src4[i*src_stride4]);\
 854         l0=  (a&0x03030303UL)\
 855            + (b&0x03030303UL)\
 856            + 0x02020202UL;\
 857         h0= ((a&0xFCFCFCFCUL)>>2)\
 858           + ((b&0xFCFCFCFCUL)>>2);\
 859         l1=  (c&0x03030303UL)\
 860            + (d&0x03030303UL);\
 861         h1= ((c&0xFCFCFCFCUL)>>2)\
 862           + ((d&0xFCFCFCFCUL)>>2);\
 863         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 864         a= LD32(&src1[i*src_stride1+4]);\
 865         b= LD32(&src2[i*src_stride2+4]);\
 866         c= LD32(&src3[i*src_stride3+4]);\
 867         d= LD32(&src4[i*src_stride4+4]);\
 868         l0=  (a&0x03030303UL)\
 869            + (b&0x03030303UL)\
 870            + 0x02020202UL;\
 871         h0= ((a&0xFCFCFCFCUL)>>2)\
 872           + ((b&0xFCFCFCFCUL)>>2);\
 873         l1=  (c&0x03030303UL)\
 874            + (d&0x03030303UL);\
 875         h1= ((c&0xFCFCFCFCUL)>>2)\
 876           + ((d&0xFCFCFCFCUL)>>2);\
 877         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 878     }\
 879 }\
 880 \
 881 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 882     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 883 }\
 884 \
 885 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 886     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 887 }\
 888 \
 889 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 890     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 891 }\
 892 \
 893 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 894     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 895 }\
 896 \
 897 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 898                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 899     int i;\
 900     for(i=0; i<h; i++){\
 901         uint32_t a, b, c, d, l0, l1, h0, h1;\
 902         a= LD32(&src1[i*src_stride1]);\
 903         b= LD32(&src2[i*src_stride2]);\
 904         c= LD32(&src3[i*src_stride3]);\
 905         d= LD32(&src4[i*src_stride4]);\
 906         l0=  (a&0x03030303UL)\
 907            + (b&0x03030303UL)\
 908            + 0x01010101UL;\
 909         h0= ((a&0xFCFCFCFCUL)>>2)\
 910           + ((b&0xFCFCFCFCUL)>>2);\
 911         l1=  (c&0x03030303UL)\
 912            + (d&0x03030303UL);\
 913         h1= ((c&0xFCFCFCFCUL)>>2)\
 914           + ((d&0xFCFCFCFCUL)>>2);\
 915         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 916         a= LD32(&src1[i*src_stride1+4]);\
 917         b= LD32(&src2[i*src_stride2+4]);\
 918         c= LD32(&src3[i*src_stride3+4]);\
 919         d= LD32(&src4[i*src_stride4+4]);\
 920         l0=  (a&0x03030303UL)\
 921            + (b&0x03030303UL)\
 922            + 0x01010101UL;\
 923         h0= ((a&0xFCFCFCFCUL)>>2)\
 924           + ((b&0xFCFCFCFCUL)>>2);\
 925         l1=  (c&0x03030303UL)\
 926            + (d&0x03030303UL);\
 927         h1= ((c&0xFCFCFCFCUL)>>2)\
 928           + ((d&0xFCFCFCFCUL)>>2);\
 929         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 930     }\
 931 }\
 932 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 933                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 934     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 935     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 936 }\
 937 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 938                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 939     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 940     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 941 }\
 942 \
 943 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 944 {\
 945         int i, a0, b0, a1, b1;\
 946         a0= pixels[0];\
 947         b0= pixels[1] + 2;\
 948         a0 += b0;\
 949         b0 += pixels[2];\
 950 \
 951         pixels+=line_size;\
 952         for(i=0; i<h; i+=2){\
 953             a1= pixels[0];\
 954             b1= pixels[1];\
 955             a1 += b1;\
 956             b1 += pixels[2];\
 957 \
 958             block[0]= (a1+a0)>>2; /* FIXME non put */\
 959             block[1]= (b1+b0)>>2;\
 960 \
 961             pixels+=line_size;\
 962             block +=line_size;\
 963 \
 964             a0= pixels[0];\
 965             b0= pixels[1] + 2;\
 966             a0 += b0;\
 967             b0 += pixels[2];\
 968 \
 969             block[0]= (a1+a0)>>2;\
 970             block[1]= (b1+b0)>>2;\
 971             pixels+=line_size;\
 972             block +=line_size;\
 973         }\
 974 }\
 975 \
 976 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 977 {\
 978         int i;\
 979         const uint32_t a= LD32(pixels  );\
 980         const uint32_t b= LD32(pixels+1);\
 981         uint32_t l0=  (a&0x03030303UL)\
 982                     + (b&0x03030303UL)\
 983                     + 0x02020202UL;\
 984         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 985                    + ((b&0xFCFCFCFCUL)>>2);\
 986         uint32_t l1,h1;\
 987 \
 988         pixels+=line_size;\
 989         for(i=0; i<h; i+=2){\
 990             uint32_t a= LD32(pixels  );\
 991             uint32_t b= LD32(pixels+1);\
 992             l1=  (a&0x03030303UL)\
 993                + (b&0x03030303UL);\
 994             h1= ((a&0xFCFCFCFCUL)>>2)\
 995               + ((b&0xFCFCFCFCUL)>>2);\
 996             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 997             pixels+=line_size;\
 998             block +=line_size;\
 999             a= LD32(pixels  );\
1000             b= LD32(pixels+1);\
1001             l0=  (a&0x03030303UL)\
1002                + (b&0x03030303UL)\
1003                + 0x02020202UL;\
1004             h0= ((a&0xFCFCFCFCUL)>>2)\
1005               + ((b&0xFCFCFCFCUL)>>2);\
1006             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1007             pixels+=line_size;\
1008             block +=line_size;\
1009         }\
1010 }\
1011 \
1012 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1013 {\
1014     int j;\
1015     for(j=0; j<2; j++){\
1016         int i;\
1017         const uint32_t a= LD32(pixels  );\
1018         const uint32_t b= LD32(pixels+1);\
1019         uint32_t l0=  (a&0x03030303UL)\
1020                     + (b&0x03030303UL)\
1021                     + 0x02020202UL;\
1022         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1023                    + ((b&0xFCFCFCFCUL)>>2);\
1024         uint32_t l1,h1;\
1025 \
1026         pixels+=line_size;\
1027         for(i=0; i<h; i+=2){\
1028             uint32_t a= LD32(pixels  );\
1029             uint32_t b= LD32(pixels+1);\
1030             l1=  (a&0x03030303UL)\
1031                + (b&0x03030303UL);\
1032             h1= ((a&0xFCFCFCFCUL)>>2)\
1033               + ((b&0xFCFCFCFCUL)>>2);\
1034             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1035             pixels+=line_size;\
1036             block +=line_size;\
1037             a= LD32(pixels  );\
1038             b= LD32(pixels+1);\
1039             l0=  (a&0x03030303UL)\
1040                + (b&0x03030303UL)\
1041                + 0x02020202UL;\
1042             h0= ((a&0xFCFCFCFCUL)>>2)\
1043               + ((b&0xFCFCFCFCUL)>>2);\
1044             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045             pixels+=line_size;\
1046             block +=line_size;\
1047         }\
1048         pixels+=4-line_size*(h+1);\
1049         block +=4-line_size*h;\
1050     }\
1051 }\
1052 \
1053 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1054 {\
1055     int j;\
1056     for(j=0; j<2; j++){\
1057         int i;\
1058         const uint32_t a= LD32(pixels  );\
1059         const uint32_t b= LD32(pixels+1);\
1060         uint32_t l0=  (a&0x03030303UL)\
1061                     + (b&0x03030303UL)\
1062                     + 0x01010101UL;\
1063         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1064                    + ((b&0xFCFCFCFCUL)>>2);\
1065         uint32_t l1,h1;\
1066 \
1067         pixels+=line_size;\
1068         for(i=0; i<h; i+=2){\
1069             uint32_t a= LD32(pixels  );\
1070             uint32_t b= LD32(pixels+1);\
1071             l1=  (a&0x03030303UL)\
1072                + (b&0x03030303UL);\
1073             h1= ((a&0xFCFCFCFCUL)>>2)\
1074               + ((b&0xFCFCFCFCUL)>>2);\
1075             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1076             pixels+=line_size;\
1077             block +=line_size;\
1078             a= LD32(pixels  );\
1079             b= LD32(pixels+1);\
1080             l0=  (a&0x03030303UL)\
1081                + (b&0x03030303UL)\
1082                + 0x01010101UL;\
1083             h0= ((a&0xFCFCFCFCUL)>>2)\
1084               + ((b&0xFCFCFCFCUL)>>2);\
1085             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086             pixels+=line_size;\
1087             block +=line_size;\
1088         }\
1089         pixels+=4-line_size*(h+1);\
1090         block +=4-line_size*h;\
1091     }\
1092 }\
1093 \
1094 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1095 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1098 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1099 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1102
1103 #define op_avg(a, b) a = rnd_avg32(a, b)
1104 #endif
1105 #define op_put(a, b) a = b
1106
1107 PIXOP2(avg, op_avg)
1108 PIXOP2(put, op_put)
1109 #undef op_avg
1110 #undef op_put
1111
1112 #define avg2(a,b) ((a+b+1)>>1)
1113 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1114
1115 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1116     put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1117 }
1118
1119 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1120     put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1121 }
1122
1123 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1124 {
1125     const int A=(16-x16)*(16-y16);
1126     const int B=(   x16)*(16-y16);
1127     const int C=(16-x16)*(   y16);
1128     const int D=(   x16)*(   y16);
1129     int i;
1130
1131     for(i=0; i<h; i++)
1132     {
1133         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1134         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1135         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1136         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1137         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1138         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1139         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1140         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1141         dst+= stride;
1142         src+= stride;
1143     }
1144 }
1145
1146 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1147                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1148 {
1149     int y, vx, vy;
1150     const int s= 1<<shift;
1151
1152     width--;
1153     height--;
1154
1155     for(y=0; y<h; y++){
1156         int x;
1157
1158         vx= ox;
1159         vy= oy;
1160         for(x=0; x<8; x++){ //XXX FIXME optimize
1161             int src_x, src_y, frac_x, frac_y, index;
1162
1163             src_x= vx>>16;
1164             src_y= vy>>16;
1165             frac_x= src_x&(s-1);
1166             frac_y= src_y&(s-1);
1167             src_x>>=shift;
1168             src_y>>=shift;
1169
1170             if((unsigned)src_x < width){
1171                 if((unsigned)src_y < height){
1172                     index= src_x + src_y*stride;
1173                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
1174                                            + src[index       +1]*   frac_x )*(s-frac_y)
1175                                         + (  src[index+stride  ]*(s-frac_x)
1176                                            + src[index+stride+1]*   frac_x )*   frac_y
1177                                         + r)>>(shift*2);
1178                 }else{
1179                     index= src_x + clip(src_y, 0, height)*stride;
1180                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1181                                           + src[index       +1]*   frac_x )*s
1182                                         + r)>>(shift*2);
1183                 }
1184             }else{
1185                 if((unsigned)src_y < height){
1186                     index= clip(src_x, 0, width) + src_y*stride;
1187                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
1188                                            + src[index+stride  ]*   frac_y )*s
1189                                         + r)>>(shift*2);
1190                 }else{
1191                     index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1192                     dst[y*stride + x]=    src[index         ];
1193                 }
1194             }
1195
1196             vx+= dxx;
1197             vy+= dyx;
1198         }
1199         ox += dxy;
1200         oy += dyy;
1201     }
1202 }
1203
1204 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1205     switch(width){
1206     case 2: put_pixels2_c (dst, src, stride, height); break;
1207     case 4: put_pixels4_c (dst, src, stride, height); break;
1208     case 8: put_pixels8_c (dst, src, stride, height); break;
1209     case 16:put_pixels16_c(dst, src, stride, height); break;
1210     }
1211 }
1212
1213 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1214     int i,j;
1215     for (i=0; i < height; i++) {
1216       for (j=0; j < width; j++) {
1217         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1218       }
1219       src += stride;
1220       dst += stride;
1221     }
1222 }
1223
1224 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1225     int i,j;
1226     for (i=0; i < height; i++) {
1227       for (j=0; j < width; j++) {
1228         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1229       }
1230       src += stride;
1231       dst += stride;
1232     }
1233 }
1234
1235 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1236     int i,j;
1237     for (i=0; i < height; i++) {
1238       for (j=0; j < width; j++) {
1239         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1240       }
1241       src += stride;
1242       dst += stride;
1243     }
1244 }
1245
1246 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1247     int i,j;
1248     for (i=0; i < height; i++) {
1249       for (j=0; j < width; j++) {
1250         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1251       }
1252       src += stride;
1253       dst += stride;
1254     }
1255 }
1256
1257 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1258     int i,j;
1259     for (i=0; i < height; i++) {
1260       for (j=0; j < width; j++) {
1261         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1262       }
1263       src += stride;
1264       dst += stride;
1265     }
1266 }
1267
1268 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1269     int i,j;
1270     for (i=0; i < height; i++) {
1271       for (j=0; j < width; j++) {
1272         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1273       }
1274       src += stride;
1275       dst += stride;
1276     }
1277 }
1278
1279 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1280     int i,j;
1281     for (i=0; i < height; i++) {
1282       for (j=0; j < width; j++) {
1283         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1284       }
1285       src += stride;
1286       dst += stride;
1287     }
1288 }
1289
1290 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1291     int i,j;
1292     for (i=0; i < height; i++) {
1293       for (j=0; j < width; j++) {
1294         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1295       }
1296       src += stride;
1297       dst += stride;
1298     }
1299 }
1300
1301 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1302     switch(width){
1303     case 2: avg_pixels2_c (dst, src, stride, height); break;
1304     case 4: avg_pixels4_c (dst, src, stride, height); break;
1305     case 8: avg_pixels8_c (dst, src, stride, height); break;
1306     case 16:avg_pixels16_c(dst, src, stride, height); break;
1307     }
1308 }
1309
1310 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1311     int i,j;
1312     for (i=0; i < height; i++) {
1313       for (j=0; j < width; j++) {
1314         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1315       }
1316       src += stride;
1317       dst += stride;
1318     }
1319 }
1320
1321 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1322     int i,j;
1323     for (i=0; i < height; i++) {
1324       for (j=0; j < width; j++) {
1325         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1326       }
1327       src += stride;
1328       dst += stride;
1329     }
1330 }
1331
1332 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1333     int i,j;
1334     for (i=0; i < height; i++) {
1335       for (j=0; j < width; j++) {
1336         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1337       }
1338       src += stride;
1339       dst += stride;
1340     }
1341 }
1342
1343 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1344     int i,j;
1345     for (i=0; i < height; i++) {
1346       for (j=0; j < width; j++) {
1347         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1348       }
1349       src += stride;
1350       dst += stride;
1351     }
1352 }
1353
1354 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1355     int i,j;
1356     for (i=0; i < height; i++) {
1357       for (j=0; j < width; j++) {
1358         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1359       }
1360       src += stride;
1361       dst += stride;
1362     }
1363 }
1364
1365 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1366     int i,j;
1367     for (i=0; i < height; i++) {
1368       for (j=0; j < width; j++) {
1369         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1370       }
1371       src += stride;
1372       dst += stride;
1373     }
1374 }
1375
1376 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1377     int i,j;
1378     for (i=0; i < height; i++) {
1379       for (j=0; j < width; j++) {
1380         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1381       }
1382       src += stride;
1383       dst += stride;
1384     }
1385 }
1386
1387 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1388     int i,j;
1389     for (i=0; i < height; i++) {
1390       for (j=0; j < width; j++) {
1391         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1392       }
1393       src += stride;
1394       dst += stride;
1395     }
1396 }
1397 #if 0
1398 #define TPEL_WIDTH(width)\
1399 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1400     void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1401 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1402     void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1403 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1404     void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1405 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1406     void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1407 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1408     void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1409 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1410     void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1411 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1412     void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1413 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1414     void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1415 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1416     void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1417 #endif
1418
1419 #define H264_CHROMA_MC(OPNAME, OP)\
1420 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1421     const int A=(8-x)*(8-y);\
1422     const int B=(  x)*(8-y);\
1423     const int C=(8-x)*(  y);\
1424     const int D=(  x)*(  y);\
1425     int i;\
1426     \
1427     assert(x<8 && y<8 && x>=0 && y>=0);\
1428 \
1429     for(i=0; i<h; i++)\
1430     {\
1431         OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1432         OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1433         dst+= stride;\
1434         src+= stride;\
1435     }\
1436 }\
1437 \
1438 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1439     const int A=(8-x)*(8-y);\
1440     const int B=(  x)*(8-y);\
1441     const int C=(8-x)*(  y);\
1442     const int D=(  x)*(  y);\
1443     int i;\
1444     \
1445     assert(x<8 && y<8 && x>=0 && y>=0);\
1446 \
1447     for(i=0; i<h; i++)\
1448     {\
1449         OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1450         OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1451         OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1452         OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1453         dst+= stride;\
1454         src+= stride;\
1455     }\
1456 }\
1457 \
1458 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1459     const int A=(8-x)*(8-y);\
1460     const int B=(  x)*(8-y);\
1461     const int C=(8-x)*(  y);\
1462     const int D=(  x)*(  y);\
1463     int i;\
1464     \
1465     assert(x<8 && y<8 && x>=0 && y>=0);\
1466 \
1467     for(i=0; i<h; i++)\
1468     {\
1469         OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1470         OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1471         OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1472         OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1473         OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1474         OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1475         OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1476         OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1477         dst+= stride;\
1478         src+= stride;\
1479     }\
1480 }
1481
1482 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1483 #define op_put(a, b) a = (((b) + 32)>>6)
1484
1485 H264_CHROMA_MC(put_       , op_put)
1486 H264_CHROMA_MC(avg_       , op_avg)
1487 #undef op_avg
1488 #undef op_put
1489
1490 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1491 {
1492     int i;
1493     for(i=0; i<h; i++)
1494     {
1495         ST32(dst   , LD32(src   ));
1496         dst+=dstStride;
1497         src+=srcStride;
1498     }
1499 }
1500
1501 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1502 {
1503     int i;
1504     for(i=0; i<h; i++)
1505     {
1506         ST32(dst   , LD32(src   ));
1507         ST32(dst+4 , LD32(src+4 ));
1508         dst+=dstStride;
1509         src+=srcStride;
1510     }
1511 }
1512
1513 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1514 {
1515     int i;
1516     for(i=0; i<h; i++)
1517     {
1518         ST32(dst   , LD32(src   ));
1519         ST32(dst+4 , LD32(src+4 ));
1520         ST32(dst+8 , LD32(src+8 ));
1521         ST32(dst+12, LD32(src+12));
1522         dst+=dstStride;
1523         src+=srcStride;
1524     }
1525 }
1526
1527 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1528 {
1529     int i;
1530     for(i=0; i<h; i++)
1531     {
1532         ST32(dst   , LD32(src   ));
1533         ST32(dst+4 , LD32(src+4 ));
1534         ST32(dst+8 , LD32(src+8 ));
1535         ST32(dst+12, LD32(src+12));
1536         dst[16]= src[16];
1537         dst+=dstStride;
1538         src+=srcStride;
1539     }
1540 }
1541
1542 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1543 {
1544     int i;
1545     for(i=0; i<h; i++)
1546     {
1547         ST32(dst   , LD32(src   ));
1548         ST32(dst+4 , LD32(src+4 ));
1549         dst[8]= src[8];
1550         dst+=dstStride;
1551         src+=srcStride;
1552     }
1553 }
1554
1555
1556 #define QPEL_MC(r, OPNAME, RND, OP) \
1557 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1558     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1559     int i;\
1560     for(i=0; i<h; i++)\
1561     {\
1562         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1563         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1564         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1565         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1566         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1567         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1568         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1569         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1570         dst+=dstStride;\
1571         src+=srcStride;\
1572     }\
1573 }\
1574 \
1575 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1576     const int w=8;\
1577     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1578     int i;\
1579     for(i=0; i<w; i++)\
1580     {\
1581         const int src0= src[0*srcStride];\
1582         const int src1= src[1*srcStride];\
1583         const int src2= src[2*srcStride];\
1584         const int src3= src[3*srcStride];\
1585         const int src4= src[4*srcStride];\
1586         const int src5= src[5*srcStride];\
1587         const int src6= src[6*srcStride];\
1588         const int src7= src[7*srcStride];\
1589         const int src8= src[8*srcStride];\
1590         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1591         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1592         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1593         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1594         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1595         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1596         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1597         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1598         dst++;\
1599         src++;\
1600     }\
1601 }\
1602 \
1603 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1604     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1605     int i;\
1606     \
1607     for(i=0; i<h; i++)\
1608     {\
1609         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1610         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1611         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1612         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1613         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1614         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1615         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1616         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1617         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1618         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1619         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1620         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1621         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1622         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1623         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1624         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1625         dst+=dstStride;\
1626         src+=srcStride;\
1627     }\
1628 }\
1629 \
1630 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1631     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1632     int i;\
1633     const int w=16;\
1634     for(i=0; i<w; i++)\
1635     {\
1636         const int src0= src[0*srcStride];\
1637         const int src1= src[1*srcStride];\
1638         const int src2= src[2*srcStride];\
1639         const int src3= src[3*srcStride];\
1640         const int src4= src[4*srcStride];\
1641         const int src5= src[5*srcStride];\
1642         const int src6= src[6*srcStride];\
1643         const int src7= src[7*srcStride];\
1644         const int src8= src[8*srcStride];\
1645         const int src9= src[9*srcStride];\
1646         const int src10= src[10*srcStride];\
1647         const int src11= src[11*srcStride];\
1648         const int src12= src[12*srcStride];\
1649         const int src13= src[13*srcStride];\
1650         const int src14= src[14*srcStride];\
1651         const int src15= src[15*srcStride];\
1652         const int src16= src[16*srcStride];\
1653         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1654         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1655         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1656         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1657         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1658         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1659         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1660         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1661         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1662         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1663         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1664         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1665         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1666         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1667         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1668         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1669         dst++;\
1670         src++;\
1671     }\
1672 }\
1673 \
1674 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1675     OPNAME ## pixels8_c(dst, src, stride, 8);\
1676 }\
1677 \
1678 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1679     uint8_t half[64];\
1680     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1681     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1682 }\
1683 \
1684 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1685     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1686 }\
1687 \
1688 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1689     uint8_t half[64];\
1690     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1691     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1692 }\
1693 \
1694 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1695     uint8_t full[16*9];\
1696     uint8_t half[64];\
1697     copy_block9(full, src, 16, stride, 9);\
1698     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1699     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1700 }\
1701 \
1702 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1703     uint8_t full[16*9];\
1704     copy_block9(full, src, 16, stride, 9);\
1705     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1706 }\
1707 \
1708 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1709     uint8_t full[16*9];\
1710     uint8_t half[64];\
1711     copy_block9(full, src, 16, stride, 9);\
1712     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1713     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1714 }\
1715 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1716     uint8_t full[16*9];\
1717     uint8_t halfH[72];\
1718     uint8_t halfV[64];\
1719     uint8_t halfHV[64];\
1720     copy_block9(full, src, 16, stride, 9);\
1721     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1722     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1723     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1724     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1725 }\
1726 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1727     uint8_t full[16*9];\
1728     uint8_t halfH[72];\
1729     uint8_t halfHV[64];\
1730     copy_block9(full, src, 16, stride, 9);\
1731     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1732     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1733     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1734     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1735 }\
1736 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1737     uint8_t full[16*9];\
1738     uint8_t halfH[72];\
1739     uint8_t halfV[64];\
1740     uint8_t halfHV[64];\
1741     copy_block9(full, src, 16, stride, 9);\
1742     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1744     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1746 }\
1747 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1748     uint8_t full[16*9];\
1749     uint8_t halfH[72];\
1750     uint8_t halfHV[64];\
1751     copy_block9(full, src, 16, stride, 9);\
1752     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1753     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1754     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1755     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1756 }\
1757 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1758     uint8_t full[16*9];\
1759     uint8_t halfH[72];\
1760     uint8_t halfV[64];\
1761     uint8_t halfHV[64];\
1762     copy_block9(full, src, 16, stride, 9);\
1763     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1765     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1767 }\
1768 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1769     uint8_t full[16*9];\
1770     uint8_t halfH[72];\
1771     uint8_t halfHV[64];\
1772     copy_block9(full, src, 16, stride, 9);\
1773     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1774     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1775     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1776     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1777 }\
1778 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1779     uint8_t full[16*9];\
1780     uint8_t halfH[72];\
1781     uint8_t halfV[64];\
1782     uint8_t halfHV[64];\
1783     copy_block9(full, src, 16, stride, 9);\
1784     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1785     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1786     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1787     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1788 }\
1789 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1790     uint8_t full[16*9];\
1791     uint8_t halfH[72];\
1792     uint8_t halfHV[64];\
1793     copy_block9(full, src, 16, stride, 9);\
1794     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1795     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1796     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1797     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1798 }\
1799 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1800     uint8_t halfH[72];\
1801     uint8_t halfHV[64];\
1802     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1803     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1804     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1805 }\
1806 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1807     uint8_t halfH[72];\
1808     uint8_t halfHV[64];\
1809     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1810     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1811     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1812 }\
1813 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1814     uint8_t full[16*9];\
1815     uint8_t halfH[72];\
1816     uint8_t halfV[64];\
1817     uint8_t halfHV[64];\
1818     copy_block9(full, src, 16, stride, 9);\
1819     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1820     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1821     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1822     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1823 }\
1824 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1825     uint8_t full[16*9];\
1826     uint8_t halfH[72];\
1827     copy_block9(full, src, 16, stride, 9);\
1828     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1829     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1830     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1831 }\
1832 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1833     uint8_t full[16*9];\
1834     uint8_t halfH[72];\
1835     uint8_t halfV[64];\
1836     uint8_t halfHV[64];\
1837     copy_block9(full, src, 16, stride, 9);\
1838     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1839     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1840     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1842 }\
1843 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1844     uint8_t full[16*9];\
1845     uint8_t halfH[72];\
1846     copy_block9(full, src, 16, stride, 9);\
1847     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1848     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1849     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1850 }\
1851 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1852     uint8_t halfH[72];\
1853     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1854     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1855 }\
1856 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1857     OPNAME ## pixels16_c(dst, src, stride, 16);\
1858 }\
1859 \
1860 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1861     uint8_t half[256];\
1862     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1863     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1864 }\
1865 \
1866 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1867     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1868 }\
1869 \
1870 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1871     uint8_t half[256];\
1872     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1873     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1874 }\
1875 \
1876 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1877     uint8_t full[24*17];\
1878     uint8_t half[256];\
1879     copy_block17(full, src, 24, stride, 17);\
1880     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1881     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1882 }\
1883 \
1884 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1885     uint8_t full[24*17];\
1886     copy_block17(full, src, 24, stride, 17);\
1887     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1888 }\
1889 \
1890 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1891     uint8_t full[24*17];\
1892     uint8_t half[256];\
1893     copy_block17(full, src, 24, stride, 17);\
1894     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1895     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1896 }\
1897 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1898     uint8_t full[24*17];\
1899     uint8_t halfH[272];\
1900     uint8_t halfV[256];\
1901     uint8_t halfHV[256];\
1902     copy_block17(full, src, 24, stride, 17);\
1903     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1904     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1905     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1906     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1907 }\
1908 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1909     uint8_t full[24*17];\
1910     uint8_t halfH[272];\
1911     uint8_t halfHV[256];\
1912     copy_block17(full, src, 24, stride, 17);\
1913     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1914     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1915     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1916     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1917 }\
1918 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1919     uint8_t full[24*17];\
1920     uint8_t halfH[272];\
1921     uint8_t halfV[256];\
1922     uint8_t halfHV[256];\
1923     copy_block17(full, src, 24, stride, 17);\
1924     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1926     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1928 }\
1929 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1930     uint8_t full[24*17];\
1931     uint8_t halfH[272];\
1932     uint8_t halfHV[256];\
1933     copy_block17(full, src, 24, stride, 17);\
1934     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1935     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1936     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1937     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1938 }\
1939 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1940     uint8_t full[24*17];\
1941     uint8_t halfH[272];\
1942     uint8_t halfV[256];\
1943     uint8_t halfHV[256];\
1944     copy_block17(full, src, 24, stride, 17);\
1945     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1947     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1949 }\
1950 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1951     uint8_t full[24*17];\
1952     uint8_t halfH[272];\
1953     uint8_t halfHV[256];\
1954     copy_block17(full, src, 24, stride, 17);\
1955     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1956     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1957     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1958     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1959 }\
1960 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1961     uint8_t full[24*17];\
1962     uint8_t halfH[272];\
1963     uint8_t halfV[256];\
1964     uint8_t halfHV[256];\
1965     copy_block17(full, src, 24, stride, 17);\
1966     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1967     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1968     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1969     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1970 }\
1971 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1972     uint8_t full[24*17];\
1973     uint8_t halfH[272];\
1974     uint8_t halfHV[256];\
1975     copy_block17(full, src, 24, stride, 17);\
1976     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1977     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1978     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1979     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1980 }\
1981 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1982     uint8_t halfH[272];\
1983     uint8_t halfHV[256];\
1984     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1985     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1986     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1987 }\
1988 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1989     uint8_t halfH[272];\
1990     uint8_t halfHV[256];\
1991     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1992     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1993     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1994 }\
1995 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1996     uint8_t full[24*17];\
1997     uint8_t halfH[272];\
1998     uint8_t halfV[256];\
1999     uint8_t halfHV[256];\
2000     copy_block17(full, src, 24, stride, 17);\
2001     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2002     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2003     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2004     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2005 }\
2006 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2007     uint8_t full[24*17];\
2008     uint8_t halfH[272];\
2009     copy_block17(full, src, 24, stride, 17);\
2010     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2011     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2012     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2013 }\
2014 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2015     uint8_t full[24*17];\
2016     uint8_t halfH[272];\
2017     uint8_t halfV[256];\
2018     uint8_t halfHV[256];\
2019     copy_block17(full, src, 24, stride, 17);\
2020     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2021     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2022     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2023     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2024 }\
2025 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2026     uint8_t full[24*17];\
2027     uint8_t halfH[272];\
2028     copy_block17(full, src, 24, stride, 17);\
2029     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2030     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2031     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2032 }\
2033 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2034     uint8_t halfH[272];\
2035     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2036     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2037 }
2038
2039 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2040 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2041 #define op_put(a, b) a = cm[((b) + 16)>>5]
2042 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2043
2044 QPEL_MC(0, put_       , _       , op_put)
2045 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2046 QPEL_MC(0, avg_       , _       , op_avg)
2047 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2048 #undef op_avg
2049 #undef op_avg_no_rnd
2050 #undef op_put
2051 #undef op_put_no_rnd
2052
2053 #if 1
2054 #define H264_LOWPASS(OPNAME, OP, OP2) \
2055 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2056     const int h=4;\
2057     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2058     int i;\
2059     for(i=0; i<h; i++)\
2060     {\
2061         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2062         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2063         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2064         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2065         dst+=dstStride;\
2066         src+=srcStride;\
2067     }\
2068 }\
2069 \
2070 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2071     const int w=4;\
2072     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2073     int i;\
2074     for(i=0; i<w; i++)\
2075     {\
2076         const int srcB= src[-2*srcStride];\
2077         const int srcA= src[-1*srcStride];\
2078         const int src0= src[0 *srcStride];\
2079         const int src1= src[1 *srcStride];\
2080         const int src2= src[2 *srcStride];\
2081         const int src3= src[3 *srcStride];\
2082         const int src4= src[4 *srcStride];\
2083         const int src5= src[5 *srcStride];\
2084         const int src6= src[6 *srcStride];\
2085         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2086         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2087         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2088         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2089         dst++;\
2090         src++;\
2091     }\
2092 }\
2093 \
2094 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2095     const int h=4;\
2096     const int w=4;\
2097     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2098     int i;\
2099     src -= 2*srcStride;\
2100     for(i=0; i<h+5; i++)\
2101     {\
2102         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2103         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2104         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2105         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2106         tmp+=tmpStride;\
2107         src+=srcStride;\
2108     }\
2109     tmp -= tmpStride*(h+5-2);\
2110     for(i=0; i<w; i++)\
2111     {\
2112         const int tmpB= tmp[-2*tmpStride];\
2113         const int tmpA= tmp[-1*tmpStride];\
2114         const int tmp0= tmp[0 *tmpStride];\
2115         const int tmp1= tmp[1 *tmpStride];\
2116         const int tmp2= tmp[2 *tmpStride];\
2117         const int tmp3= tmp[3 *tmpStride];\
2118         const int tmp4= tmp[4 *tmpStride];\
2119         const int tmp5= tmp[5 *tmpStride];\
2120         const int tmp6= tmp[6 *tmpStride];\
2121         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2122         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2123         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2124         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2125         dst++;\
2126         tmp++;\
2127     }\
2128 }\
2129 \
2130 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2131     const int h=8;\
2132     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2133     int i;\
2134     for(i=0; i<h; i++)\
2135     {\
2136         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2137         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2138         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2139         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2140         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2141         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2142         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2143         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2144         dst+=dstStride;\
2145         src+=srcStride;\
2146     }\
2147 }\
2148 \
2149 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2150     const int w=8;\
2151     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2152     int i;\
2153     for(i=0; i<w; i++)\
2154     {\
2155         const int srcB= src[-2*srcStride];\
2156         const int srcA= src[-1*srcStride];\
2157         const int src0= src[0 *srcStride];\
2158         const int src1= src[1 *srcStride];\
2159         const int src2= src[2 *srcStride];\
2160         const int src3= src[3 *srcStride];\
2161         const int src4= src[4 *srcStride];\
2162         const int src5= src[5 *srcStride];\
2163         const int src6= src[6 *srcStride];\
2164         const int src7= src[7 *srcStride];\
2165         const int src8= src[8 *srcStride];\
2166         const int src9= src[9 *srcStride];\
2167         const int src10=src[10*srcStride];\
2168         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2169         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2170         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2171         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2172         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2173         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2174         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2175         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2176         dst++;\
2177         src++;\
2178     }\
2179 }\
2180 \
2181 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2182     const int h=8;\
2183     const int w=8;\
2184     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2185     int i;\
2186     src -= 2*srcStride;\
2187     for(i=0; i<h+5; i++)\
2188     {\
2189         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2190         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2191         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2192         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2193         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2194         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2195         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2196         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2197         tmp+=tmpStride;\
2198         src+=srcStride;\
2199     }\
2200     tmp -= tmpStride*(h+5-2);\
2201     for(i=0; i<w; i++)\
2202     {\
2203         const int tmpB= tmp[-2*tmpStride];\
2204         const int tmpA= tmp[-1*tmpStride];\
2205         const int tmp0= tmp[0 *tmpStride];\
2206         const int tmp1= tmp[1 *tmpStride];\
2207         const int tmp2= tmp[2 *tmpStride];\
2208         const int tmp3= tmp[3 *tmpStride];\
2209         const int tmp4= tmp[4 *tmpStride];\
2210         const int tmp5= tmp[5 *tmpStride];\
2211         const int tmp6= tmp[6 *tmpStride];\
2212         const int tmp7= tmp[7 *tmpStride];\
2213         const int tmp8= tmp[8 *tmpStride];\
2214         const int tmp9= tmp[9 *tmpStride];\
2215         const int tmp10=tmp[10*tmpStride];\
2216         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2217         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2218         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2219         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2220         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2221         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2222         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2223         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2224         dst++;\
2225         tmp++;\
2226     }\
2227 }\
2228 \
2229 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2230     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2231     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2232     src += 8*srcStride;\
2233     dst += 8*dstStride;\
2234     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2235     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2236 }\
2237 \
2238 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2239     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2240     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2241     src += 8*srcStride;\
2242     dst += 8*dstStride;\
2243     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2244     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2245 }\
2246 \
2247 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2248     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2249     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2250     src += 8*srcStride;\
2251     dst += 8*dstStride;\
2252     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2253     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2254 }\
2255
2256 #define H264_MC(OPNAME, SIZE) \
2257 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2258     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2259 }\
2260 \
2261 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2262     uint8_t half[SIZE*SIZE];\
2263     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2264     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2265 }\
2266 \
2267 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2268     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2269 }\
2270 \
2271 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2272     uint8_t half[SIZE*SIZE];\
2273     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2274     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2275 }\
2276 \
2277 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2278     uint8_t full[SIZE*(SIZE+5)];\
2279     uint8_t * const full_mid= full + SIZE*2;\
2280     uint8_t half[SIZE*SIZE];\
2281     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2282     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2283     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2284 }\
2285 \
2286 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2287     uint8_t full[SIZE*(SIZE+5)];\
2288     uint8_t * const full_mid= full + SIZE*2;\
2289     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2290     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2291 }\
2292 \
2293 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2294     uint8_t full[SIZE*(SIZE+5)];\
2295     uint8_t * const full_mid= full + SIZE*2;\
2296     uint8_t half[SIZE*SIZE];\
2297     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2298     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2299     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2300 }\
2301 \
2302 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2303     uint8_t full[SIZE*(SIZE+5)];\
2304     uint8_t * const full_mid= full + SIZE*2;\
2305     uint8_t halfH[SIZE*SIZE];\
2306     uint8_t halfV[SIZE*SIZE];\
2307     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2308     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2309     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2310     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2311 }\
2312 \
2313 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2314     uint8_t full[SIZE*(SIZE+5)];\
2315     uint8_t * const full_mid= full + SIZE*2;\
2316     uint8_t halfH[SIZE*SIZE];\
2317     uint8_t halfV[SIZE*SIZE];\
2318     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2319     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2320     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2321     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2322 }\
2323 \
2324 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2325     uint8_t full[SIZE*(SIZE+5)];\
2326     uint8_t * const full_mid= full + SIZE*2;\
2327     uint8_t halfH[SIZE*SIZE];\
2328     uint8_t halfV[SIZE*SIZE];\
2329     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2330     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2331     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2332     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2333 }\
2334 \
2335 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2336     uint8_t full[SIZE*(SIZE+5)];\
2337     uint8_t * const full_mid= full + SIZE*2;\
2338     uint8_t halfH[SIZE*SIZE];\
2339     uint8_t halfV[SIZE*SIZE];\
2340     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2341     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2342     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2343     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2344 }\
2345 \
2346 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2347     int16_t tmp[SIZE*(SIZE+5)];\
2348     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2349 }\
2350 \
2351 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2352     int16_t tmp[SIZE*(SIZE+5)];\
2353     uint8_t halfH[SIZE*SIZE];\
2354     uint8_t halfHV[SIZE*SIZE];\
2355     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2356     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2357     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2358 }\
2359 \
2360 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2361     int16_t tmp[SIZE*(SIZE+5)];\
2362     uint8_t halfH[SIZE*SIZE];\
2363     uint8_t halfHV[SIZE*SIZE];\
2364     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2365     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2366     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2367 }\
2368 \
2369 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2370     uint8_t full[SIZE*(SIZE+5)];\
2371     uint8_t * const full_mid= full + SIZE*2;\
2372     int16_t tmp[SIZE*(SIZE+5)];\
2373     uint8_t halfV[SIZE*SIZE];\
2374     uint8_t halfHV[SIZE*SIZE];\
2375     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2376     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2377     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2378     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2379 }\
2380 \
2381 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2382     uint8_t full[SIZE*(SIZE+5)];\
2383     uint8_t * const full_mid= full + SIZE*2;\
2384     int16_t tmp[SIZE*(SIZE+5)];\
2385     uint8_t halfV[SIZE*SIZE];\
2386     uint8_t halfHV[SIZE*SIZE];\
2387     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2388     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2389     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2390     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2391 }\
2392
2393 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2394 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2395 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2396 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2397 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2398
2399 H264_LOWPASS(put_       , op_put, op2_put)
2400 H264_LOWPASS(avg_       , op_avg, op2_avg)
2401 H264_MC(put_, 4)
2402 H264_MC(put_, 8)
2403 H264_MC(put_, 16)
2404 H264_MC(avg_, 4)
2405 H264_MC(avg_, 8)
2406 H264_MC(avg_, 16)
2407
2408 #undef op_avg
2409 #undef op_put
2410 #undef op2_avg
2411 #undef op2_put
2412 #endif
2413
2414 #define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
2415 #define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2416 #define H264_WEIGHT(W,H) \
2417 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2418     int attribute_unused x, y; \
2419     offset <<= log2_denom; \
2420     if(log2_denom) offset += 1<<(log2_denom-1); \
2421     for(y=0; y<H; y++, block += stride){ \
2422         op_scale1(0); \
2423         op_scale1(1); \
2424         if(W==2) continue; \
2425         op_scale1(2); \
2426         op_scale1(3); \
2427         if(W==4) continue; \
2428         op_scale1(4); \
2429         op_scale1(5); \
2430         op_scale1(6); \
2431         op_scale1(7); \
2432         if(W==8) continue; \
2433         op_scale1(8); \
2434         op_scale1(9); \
2435         op_scale1(10); \
2436         op_scale1(11); \
2437         op_scale1(12); \
2438         op_scale1(13); \
2439         op_scale1(14); \
2440         op_scale1(15); \
2441     } \
2442 } \
2443 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
2444     int attribute_unused x, y; \
2445     int offset = (offsets + offsetd + 1) >> 1; \
2446     offset = ((offset << 1) + 1) << log2_denom; \
2447     for(y=0; y<H; y++, dst += stride, src += stride){ \
2448         op_scale2(0); \
2449         op_scale2(1); \
2450         if(W==2) continue; \
2451         op_scale2(2); \
2452         op_scale2(3); \
2453         if(W==4) continue; \
2454         op_scale2(4); \
2455         op_scale2(5); \
2456         op_scale2(6); \
2457         op_scale2(7); \
2458         if(W==8) continue; \
2459         op_scale2(8); \
2460         op_scale2(9); \
2461         op_scale2(10); \
2462         op_scale2(11); \
2463         op_scale2(12); \
2464         op_scale2(13); \
2465         op_scale2(14); \
2466         op_scale2(15); \
2467     } \
2468 }
2469
2470 H264_WEIGHT(16,16)
2471 H264_WEIGHT(16,8)
2472 H264_WEIGHT(8,16)
2473 H264_WEIGHT(8,8)
2474 H264_WEIGHT(8,4)
2475 H264_WEIGHT(4,8)
2476 H264_WEIGHT(4,4)
2477 H264_WEIGHT(4,2)
2478 H264_WEIGHT(2,4)
2479 H264_WEIGHT(2,2)
2480
2481 #undef op_scale1
2482 #undef op_scale2
2483 #undef H264_WEIGHT
2484
2485 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2486     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2487     int i;
2488
2489     for(i=0; i<h; i++){
2490         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2491         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2492         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2493         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2494         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2495         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2496         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2497         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2498         dst+=dstStride;
2499         src+=srcStride;
2500     }
2501 }
2502
2503 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2504     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2505     int i;
2506
2507     for(i=0; i<w; i++){
2508         const int src_1= src[ -srcStride];
2509         const int src0 = src[0          ];
2510         const int src1 = src[  srcStride];
2511         const int src2 = src[2*srcStride];
2512         const int src3 = src[3*srcStride];
2513         const int src4 = src[4*srcStride];
2514         const int src5 = src[5*srcStride];
2515         const int src6 = src[6*srcStride];
2516         const int src7 = src[7*srcStride];
2517         const int src8 = src[8*srcStride];
2518         const int src9 = src[9*srcStride];
2519         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2520         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2521         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2522         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2523         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2524         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2525         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2526         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2527         src++;
2528         dst++;
2529     }
2530 }
2531
2532 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2533     put_pixels8_c(dst, src, stride, 8);
2534 }
2535
2536 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2537     uint8_t half[64];
2538     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2539     put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2540 }
2541
2542 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2543     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2544 }
2545
2546 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2547     uint8_t half[64];
2548     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2549     put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2550 }
2551
2552 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2553     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2554 }
2555
2556 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2557     uint8_t halfH[88];
2558     uint8_t halfV[64];
2559     uint8_t halfHV[64];
2560     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2561     wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2562     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2563     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2564 }
2565 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2566     uint8_t halfH[88];
2567     uint8_t halfV[64];
2568     uint8_t halfHV[64];
2569     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2570     wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2571     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2572     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2573 }
2574 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2575     uint8_t halfH[88];
2576     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2577     wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2578 }
2579
2580 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2581     int x;
2582     const int strength= ff_h263_loop_filter_strength[qscale];
2583
2584     for(x=0; x<8; x++){
2585         int d1, d2, ad1;
2586         int p0= src[x-2*stride];
2587         int p1= src[x-1*stride];
2588         int p2= src[x+0*stride];
2589         int p3= src[x+1*stride];
2590         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2591
2592         if     (d<-2*strength) d1= 0;
2593         else if(d<-  strength) d1=-2*strength - d;
2594         else if(d<   strength) d1= d;
2595         else if(d< 2*strength) d1= 2*strength - d;
2596         else                   d1= 0;
2597
2598         p1 += d1;
2599         p2 -= d1;
2600         if(p1&256) p1= ~(p1>>31);
2601         if(p2&256) p2= ~(p2>>31);
2602
2603         src[x-1*stride] = p1;
2604         src[x+0*stride] = p2;
2605
2606         ad1= ABS(d1)>>1;
2607
2608         d2= clip((p0-p3)/4, -ad1, ad1);
2609
2610         src[x-2*stride] = p0 - d2;
2611         src[x+  stride] = p3 + d2;
2612     }
2613 }
2614
2615 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2616     int y;
2617     const int strength= ff_h263_loop_filter_strength[qscale];
2618
2619     for(y=0; y<8; y++){
2620         int d1, d2, ad1;
2621         int p0= src[y*stride-2];
2622         int p1= src[y*stride-1];
2623         int p2= src[y*stride+0];
2624         int p3= src[y*stride+1];
2625         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2626
2627         if     (d<-2*strength) d1= 0;
2628         else if(d<-  strength) d1=-2*strength - d;
2629         else if(d<   strength) d1= d;
2630         else if(d< 2*strength) d1= 2*strength - d;
2631         else                   d1= 0;
2632
2633         p1 += d1;
2634         p2 -= d1;
2635         if(p1&256) p1= ~(p1>>31);
2636         if(p2&256) p2= ~(p2>>31);
2637
2638         src[y*stride-1] = p1;
2639         src[y*stride+0] = p2;
2640
2641         ad1= ABS(d1)>>1;
2642
2643         d2= clip((p0-p3)/4, -ad1, ad1);
2644
2645         src[y*stride-2] = p0 - d2;
2646         src[y*stride+1] = p3 + d2;
2647     }
2648 }
2649
2650 static void h261_loop_filter_c(uint8_t *src, int stride){
2651     int x,y,xy,yz;
2652     int temp[64];
2653
2654     for(x=0; x<8; x++){
2655         temp[x      ] = 4*src[x           ];
2656         temp[x + 7*8] = 4*src[x + 7*stride];
2657     }
2658     for(y=1; y<7; y++){
2659         for(x=0; x<8; x++){
2660             xy = y * stride + x;
2661             yz = y * 8 + x;
2662             temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2663         }
2664     }
2665
2666     for(y=0; y<8; y++){
2667         src[  y*stride] = (temp[  y*8] + 2)>>2;
2668         src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2669         for(x=1; x<7; x++){
2670             xy = y * stride + x;
2671             yz = y * 8 + x;
2672             src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2673         }
2674     }
2675 }
2676
2677 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2678 {
2679     int i, d;
2680     for( i = 0; i < 4; i++ ) {
2681         if( tc0[i] < 0 ) {
2682             pix += 4*ystride;
2683             continue;
2684         }
2685         for( d = 0; d < 4; d++ ) {
2686             const int p0 = pix[-1*xstride];
2687             const int p1 = pix[-2*xstride];
2688             const int p2 = pix[-3*xstride];
2689             const int q0 = pix[0];
2690             const int q1 = pix[1*xstride];
2691             const int q2 = pix[2*xstride];
2692
2693             if( ABS( p0 - q0 ) < alpha &&
2694                 ABS( p1 - p0 ) < beta &&
2695                 ABS( q1 - q0 ) < beta ) {
2696
2697                 int tc = tc0[i];
2698                 int i_delta;
2699
2700                 if( ABS( p2 - p0 ) < beta ) {
2701                     pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2702                     tc++;
2703                 }
2704                 if( ABS( q2 - q0 ) < beta ) {
2705                     pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2706                     tc++;
2707                 }
2708
2709                 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2710                 pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
2711                 pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
2712             }
2713             pix += ystride;
2714         }
2715     }
2716 }
2717 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2718 {
2719     h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2720 }
2721 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2722 {
2723     h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2724 }
2725
2726 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2727 {
2728     int i, d;
2729     for( i = 0; i < 4; i++ ) {
2730         const int tc = tc0[i];
2731         if( tc <= 0 ) {
2732             pix += 2*ystride;
2733             continue;
2734         }
2735         for( d = 0; d < 2; d++ ) {
2736             const int p0 = pix[-1*xstride];
2737             const int p1 = pix[-2*xstride];
2738             const int q0 = pix[0];
2739             const int q1 = pix[1*xstride];
2740
2741             if( ABS( p0 - q0 ) < alpha &&
2742                 ABS( p1 - p0 ) < beta &&
2743                 ABS( q1 - q0 ) < beta ) {
2744
2745                 int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2746
2747                 pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
2748                 pix[0]        = clip_uint8( q0 - delta );    /* q0' */
2749             }
2750             pix += ystride;
2751         }
2752     }
2753 }
2754 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2755 {
2756     h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2757 }
2758 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2759 {
2760     h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
2761 }
2762
2763 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2764 {
2765     int d;
2766     for( d = 0; d < 8; d++ ) {
2767         const int p0 = pix[-1*xstride];
2768         const int p1 = pix[-2*xstride];
2769         const int q0 = pix[0];
2770         const int q1 = pix[1*xstride];
2771
2772         if( ABS( p0 - q0 ) < alpha &&
2773             ABS( p1 - p0 ) < beta &&
2774             ABS( q1 - q0 ) < beta ) {
2775
2776             pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
2777             pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
2778         }
2779         pix += ystride;
2780     }
2781 }
2782 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2783 {
2784     h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2785 }
2786 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2787 {
2788     h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
2789 }
2790
2791 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2792 {
2793     int s, i;
2794
2795     s = 0;
2796     for(i=0;i<h;i++) {
2797         s += abs(pix1[0] - pix2[0]);
2798         s += abs(pix1[1] - pix2[1]);
2799         s += abs(pix1[2] - pix2[2]);
2800         s += abs(pix1[3] - pix2[3]);
2801         s += abs(pix1[4] - pix2[4]);
2802         s += abs(pix1[5] - pix2[5]);
2803         s += abs(pix1[6] - pix2[6]);
2804         s += abs(pix1[7] - pix2[7]);
2805         s += abs(pix1[8] - pix2[8]);
2806         s += abs(pix1[9] - pix2[9]);
2807         s += abs(pix1[10] - pix2[10]);
2808         s += abs(pix1[11] - pix2[11]);
2809         s += abs(pix1[12] - pix2[12]);
2810         s += abs(pix1[13] - pix2[13]);
2811         s += abs(pix1[14] - pix2[14]);
2812         s += abs(pix1[15] - pix2[15]);
2813         pix1 += line_size;
2814         pix2 += line_size;
2815     }
2816     return s;
2817 }
2818
2819 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2820 {
2821     int s, i;
2822
2823     s = 0;
2824     for(i=0;i<h;i++) {
2825         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2826         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2827         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2828         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2829         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2830         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2831         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2832         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2833         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2834         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2835         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2836         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2837         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2838         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2839         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2840         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2841         pix1 += line_size;
2842         pix2 += line_size;
2843     }
2844     return s;
2845 }
2846
2847 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2848 {
2849     int s, i;
2850     uint8_t *pix3 = pix2 + line_size;
2851
2852     s = 0;
2853     for(i=0;i<h;i++) {
2854         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2855         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2856         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2857         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2858         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2859         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2860         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2861         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2862         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2863         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2864         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2865         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2866         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2867         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2868         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2869         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2870         pix1 += line_size;
2871         pix2 += line_size;
2872         pix3 += line_size;
2873     }
2874     return s;
2875 }
2876
2877 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2878 {
2879     int s, i;
2880     uint8_t *pix3 = pix2 + line_size;
2881
2882     s = 0;
2883     for(i=0;i<h;i++) {
2884         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2885         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2886         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2887         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2888         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2889         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2890         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2891         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2892         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2893         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2894         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2895         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2896         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2897         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2898         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2899         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2900         pix1 += line_size;
2901         pix2 += line_size;
2902         pix3 += line_size;
2903     }
2904     return s;
2905 }
2906
2907 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2908 {
2909     int s, i;
2910
2911     s = 0;
2912     for(i=0;i<h;i++) {
2913         s += abs(pix1[0] - pix2[0]);
2914         s += abs(pix1[1] - pix2[1]);
2915         s += abs(pix1[2] - pix2[2]);
2916         s += abs(pix1[3] - pix2[3]);
2917         s += abs(pix1[4] - pix2[4]);
2918         s += abs(pix1[5] - pix2[5]);
2919         s += abs(pix1[6] - pix2[6]);
2920         s += abs(pix1[7] - pix2[7]);
2921         pix1 += line_size;
2922         pix2 += line_size;
2923     }
2924     return s;
2925 }
2926
2927 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2928 {
2929     int s, i;
2930
2931     s = 0;
2932     for(i=0;i<h;i++) {
2933         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2934         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2935         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2936         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2937         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2938         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2939         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2940         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2941         pix1 += line_size;
2942         pix2 += line_size;
2943     }
2944     return s;
2945 }
2946
2947 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2948 {
2949     int s, i;
2950     uint8_t *pix3 = pix2 + line_size;
2951
2952     s = 0;
2953     for(i=0;i<h;i++) {
2954         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2955         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2956         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2957         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2958         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2959         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2960         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2961         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2962         pix1 += line_size;
2963         pix2 += line_size;
2964         pix3 += line_size;
2965     }
2966     return s;
2967 }
2968
2969 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2970 {
2971     int s, i;
2972     uint8_t *pix3 = pix2 + line_size;
2973
2974     s = 0;
2975     for(i=0;i<h;i++) {
2976         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2977         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2978         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2979         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2980         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2981         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2982         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2983         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2984         pix1 += line_size;
2985         pix2 += line_size;
2986         pix3 += line_size;
2987     }
2988     return s;
2989 }
2990
2991 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2992     int score1=0;
2993     int score2=0;
2994     int x,y;
2995
2996     for(y=0; y<h; y++){
2997         for(x=0; x<16; x++){
2998             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2999         }
3000         if(y+1<h){
3001             for(x=0; x<15; x++){
3002                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3003                              - s1[x+1] + s1[x+1+stride])
3004                         -ABS(  s2[x  ] - s2[x  +stride]
3005                              - s2[x+1] + s2[x+1+stride]);
3006             }
3007         }
3008         s1+= stride;
3009         s2+= stride;
3010     }
3011
3012     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3013     else  return score1 + ABS(score2)*8;
3014 }
3015
3016 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3017     int score1=0;
3018     int score2=0;
3019     int x,y;
3020
3021     for(y=0; y<h; y++){
3022         for(x=0; x<8; x++){
3023             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3024         }
3025         if(y+1<h){
3026             for(x=0; x<7; x++){
3027                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3028                              - s1[x+1] + s1[x+1+stride])
3029                         -ABS(  s2[x  ] - s2[x  +stride]
3030                              - s2[x+1] + s2[x+1+stride]);
3031             }
3032         }
3033         s1+= stride;
3034         s2+= stride;
3035     }
3036
3037     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3038     else  return score1 + ABS(score2)*8;
3039 }
3040
3041 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3042     int i;
3043     unsigned int sum=0;
3044
3045     for(i=0; i<8*8; i++){
3046         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3047         int w= weight[i];
3048         b>>= RECON_SHIFT;
3049         assert(-512<b && b<512);
3050
3051         sum += (w*b)*(w*b)>>4;
3052     }
3053     return sum>>2;
3054 }
3055
3056 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3057     int i;
3058
3059     for(i=0; i<8*8; i++){
3060         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3061     }
3062 }
3063
3064 /**
3065  * permutes an 8x8 block.
3066  * @param block the block which will be permuted according to the given permutation vector
3067  * @param permutation the permutation vector
3068  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3069  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3070  *                  (inverse) permutated to scantable order!
3071  */
3072 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3073 {
3074     int i;
3075     DCTELEM temp[64];
3076
3077     if(last<=0) return;
3078     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3079
3080     for(i=0; i<=last; i++){
3081         const int j= scantable[i];
3082         temp[j]= block[j];
3083         block[j]=0;
3084     }
3085
3086     for(i=0; i<=last; i++){
3087         const int j= scantable[i];
3088         const int perm_j= permutation[j];
3089         block[perm_j]= temp[j];
3090     }
3091 }
3092
3093 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3094     return 0;
3095 }
3096
3097 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3098     int i;
3099
3100     memset(cmp, 0, sizeof(void*)*5);
3101
3102     for(i=0; i<5; i++){
3103         switch(type&0xFF){
3104         case FF_CMP_SAD:
3105             cmp[i]= c->sad[i];
3106             break;
3107         case FF_CMP_SATD:
3108             cmp[i]= c->hadamard8_diff[i];
3109             break;
3110         case FF_CMP_SSE:
3111             cmp[i]= c->sse[i];
3112             break;
3113         case FF_CMP_DCT:
3114             cmp[i]= c->dct_sad[i];
3115             break;
3116         case FF_CMP_DCTMAX:
3117             cmp[i]= c->dct_max[i];
3118             break;
3119         case FF_CMP_PSNR:
3120             cmp[i]= c->quant_psnr[i];
3121             break;
3122         case FF_CMP_BIT:
3123             cmp[i]= c->bit[i];
3124             break;
3125         case FF_CMP_RD:
3126             cmp[i]= c->rd[i];
3127             break;
3128         case FF_CMP_VSAD:
3129             cmp[i]= c->vsad[i];
3130             break;
3131         case FF_CMP_VSSE:
3132             cmp[i]= c->vsse[i];
3133             break;
3134         case FF_CMP_ZERO:
3135             cmp[i]= zero_cmp;
3136             break;
3137         case FF_CMP_NSSE:
3138             cmp[i]= c->nsse[i];
3139             break;
3140         case FF_CMP_W53:
3141             cmp[i]= c->w53[i];
3142             break;
3143         case FF_CMP_W97:
3144             cmp[i]= c->w97[i];
3145             break;
3146         default:
3147             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3148         }
3149     }
3150 }
3151
3152 /**
3153  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3154  */
3155 static void clear_blocks_c(DCTELEM *blocks)
3156 {
3157     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3158 }
3159
3160 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3161     int i;
3162     for(i=0; i+7<w; i+=8){
3163         dst[i+0] += src[i+0];
3164         dst[i+1] += src[i+1];
3165         dst[i+2] += src[i+2];
3166         dst[i+3] += src[i+3];
3167         dst[i+4] += src[i+4];
3168         dst[i+5] += src[i+5];
3169         dst[i+6] += src[i+6];
3170         dst[i+7] += src[i+7];
3171     }
3172     for(; i<w; i++)
3173         dst[i+0] += src[i+0];
3174 }
3175
3176 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3177     int i;
3178     for(i=0; i+7<w; i+=8){
3179         dst[i+0] = src1[i+0]-src2[i+0];
3180         dst[i+1] = src1[i+1]-src2[i+1];
3181         dst[i+2] = src1[i+2]-src2[i+2];
3182         dst[i+3] = src1[i+3]-src2[i+3];
3183         dst[i+4] = src1[i+4]-src2[i+4];
3184         dst[i+5] = src1[i+5]-src2[i+5];
3185         dst[i+6] = src1[i+6]-src2[i+6];
3186         dst[i+7] = src1[i+7]-src2[i+7];
3187     }
3188     for(; i<w; i++)
3189         dst[i+0] = src1[i+0]-src2[i+0];
3190 }
3191
3192 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3193     int i;
3194     uint8_t l, lt;
3195
3196     l= *left;
3197     lt= *left_top;
3198
3199     for(i=0; i<w; i++){
3200         const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3201         lt= src1[i];
3202         l= src2[i];
3203         dst[i]= l - pred;
3204     }
3205
3206     *left= l;
3207     *left_top= lt;
3208 }
3209
3210 #define BUTTERFLY2(o1,o2,i1,i2) \
3211 o1= (i1)+(i2);\
3212 o2= (i1)-(i2);
3213
3214 #define BUTTERFLY1(x,y) \
3215 {\
3216     int a,b;\
3217     a= x;\
3218     b= y;\
3219     x= a+b;\
3220     y= a-b;\
3221 }
3222
3223 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3224
3225 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3226     int i;
3227     int temp[64];
3228     int sum=0;
3229
3230     assert(h==8);
3231
3232     for(i=0; i<8; i++){
3233         //FIXME try pointer walks
3234         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3235         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3236         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3237         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3238
3239         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3240         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3241         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3242         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3243
3244         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3245         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3246         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3247         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3248     }
3249
3250     for(i=0; i<8; i++){
3251         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3252         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3253         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3254         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3255
3256         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3257         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3258         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3259         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3260
3261         sum +=
3262              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3263             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3264             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3265             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3266     }
3267 #if 0
3268 static int maxi=0;
3269 if(sum>maxi){
3270     maxi=sum;
3271     printf("MAX:%d\n", maxi);
3272 }
3273 #endif
3274     return sum;
3275 }
3276
3277 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3278     int i;
3279     int temp[64];
3280     int sum=0;
3281
3282     assert(h==8);
3283
3284     for(i=0; i<8; i++){
3285         //FIXME try pointer walks
3286         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3287         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3288         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3289         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3290
3291         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3292         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3293         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3294         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3295
3296         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3297         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3298         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3299         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3300     }
3301
3302     for(i=0; i<8; i++){
3303         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3304         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3305         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3306         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3307
3308         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3309         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3310         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3311         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3312
3313         sum +=
3314              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3315             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3316             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3317             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3318     }
3319
3320     sum -= ABS(temp[8*0] + temp[8*4]); // -mean
3321
3322     return sum;
3323 }
3324
3325 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3326     MpegEncContext * const s= (MpegEncContext *)c;
3327     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3328     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3329     int sum=0, i;
3330
3331     assert(h==8);
3332
3333     s->dsp.diff_pixels(temp, src1, src2, stride);
3334     s->dsp.fdct(temp);
3335
3336     for(i=0; i<64; i++)
3337         sum+= ABS(temp[i]);
3338
3339     return sum;
3340 }
3341
3342 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3343     MpegEncContext * const s= (MpegEncContext *)c;
3344     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3345     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3346     int sum=0, i;
3347
3348     assert(h==8);
3349
3350     s->dsp.diff_pixels(temp, src1, src2, stride);
3351     s->dsp.fdct(temp);
3352
3353     for(i=0; i<64; i++)
3354         sum= FFMAX(sum, ABS(temp[i]));
3355
3356     return sum;
3357 }
3358
3359 void simple_idct(DCTELEM *block); //FIXME
3360
3361 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3362     MpegEncContext * const s= (MpegEncContext *)c;
3363     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3364     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3365     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3366     int sum=0, i;
3367
3368     assert(h==8);
3369     s->mb_intra=0;
3370
3371     s->dsp.diff_pixels(temp, src1, src2, stride);
3372
3373     memcpy(bak, temp, 64*sizeof(DCTELEM));
3374
3375     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3376     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3377     simple_idct(temp); //FIXME
3378
3379     for(i=0; i<64; i++)
3380         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3381
3382     return sum;
3383 }
3384
3385 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3386     MpegEncContext * const s= (MpegEncContext *)c;
3387     const uint8_t *scantable= s->intra_scantable.permutated;
3388     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3389     uint64_t __align8 aligned_bak[stride];
3390     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3391     uint8_t * const bak= (uint8_t*)aligned_bak;
3392     int i, last, run, bits, level, distoration, start_i;
3393     const int esc_length= s->ac_esc_length;
3394     uint8_t * length;
3395     uint8_t * last_length;
3396
3397     assert(h==8);
3398
3399     for(i=0; i<8; i++){
3400         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3401         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3402     }
3403
3404     s->dsp.diff_pixels(temp, src1, src2, stride);
3405
3406     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3407
3408     bits=0;
3409
3410     if (s->mb_intra) {
3411         start_i = 1;
3412         length     = s->intra_ac_vlc_length;
3413         last_length= s->intra_ac_vlc_last_length;
3414         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3415     } else {
3416         start_i = 0;
3417         length     = s->inter_ac_vlc_length;
3418         last_length= s->inter_ac_vlc_last_length;
3419     }
3420
3421     if(last>=start_i){
3422         run=0;
3423         for(i=start_i; i<last; i++){
3424             int j= scantable[i];
3425             level= temp[j];
3426
3427             if(level){
3428                 level+=64;
3429                 if((level&(~127)) == 0){
3430                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3431                 }else
3432                     bits+= esc_length;
3433                 run=0;
3434             }else
3435                 run++;
3436         }
3437         i= scantable[last];
3438
3439         level= temp[i] + 64;
3440
3441         assert(level - 64);
3442
3443         if((level&(~127)) == 0){
3444             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3445         }else
3446             bits+= esc_length;
3447
3448     }
3449
3450     if(last>=0){
3451         if(s->mb_intra)
3452             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3453         else
3454             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3455     }
3456
3457     s->dsp.idct_add(bak, stride, temp);
3458
3459     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3460
3461     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3462 }
3463
3464 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3465     MpegEncContext * const s= (MpegEncContext *)c;
3466     const uint8_t *scantable= s->intra_scantable.permutated;
3467     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3468     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3469     int i, last, run, bits, level, start_i;
3470     const int esc_length= s->ac_esc_length;
3471     uint8_t * length;
3472     uint8_t * last_length;
3473
3474     assert(h==8);
3475
3476     s->dsp.diff_pixels(temp, src1, src2, stride);
3477
3478     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3479
3480     bits=0;
3481
3482     if (s->mb_intra) {
3483         start_i = 1;
3484         length     = s->intra_ac_vlc_length;
3485         last_length= s->intra_ac_vlc_last_length;
3486         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3487     } else {
3488         start_i = 0;
3489         length     = s->inter_ac_vlc_length;
3490         last_length= s->inter_ac_vlc_last_length;
3491     }
3492
3493     if(last>=start_i){
3494         run=0;
3495         for(i=start_i; i<last; i++){
3496             int j= scantable[i];
3497             level= temp[j];
3498
3499             if(level){
3500                 level+=64;
3501                 if((level&(~127)) == 0){
3502                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3503                 }else
3504                     bits+= esc_length;
3505                 run=0;
3506             }else
3507                 run++;
3508         }
3509         i= scantable[last];
3510
3511         level= temp[i] + 64;
3512
3513         assert(level - 64);
3514
3515         if((level&(~127)) == 0){
3516             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3517         }else
3518             bits+= esc_length;
3519     }
3520
3521     return bits;
3522 }
3523
3524 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3525     int score=0;
3526     int x,y;
3527
3528     for(y=1; y<h; y++){
3529         for(x=0; x<16; x+=4){
3530             score+= ABS(s[x  ] - s[x  +stride]) + ABS(s[x+1] - s[x+1+stride])
3531                    +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
3532         }
3533         s+= stride;
3534     }
3535
3536     return score;
3537 }
3538
3539 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3540     int score=0;
3541     int x,y;
3542
3543     for(y=1; y<h; y++){
3544         for(x=0; x<16; x++){
3545             score+= ABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3546         }
3547         s1+= stride;
3548         s2+= stride;
3549     }
3550
3551     return score;
3552 }
3553
3554 #define SQ(a) ((a)*(a))
3555 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3556     int score=0;
3557     int x,y;
3558
3559     for(y=1; y<h; y++){
3560         for(x=0; x<16; x+=4){
3561             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
3562                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3563         }
3564         s+= stride;
3565     }
3566
3567     return score;
3568 }
3569
3570 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3571     int score=0;
3572     int x,y;
3573
3574     for(y=1; y<h; y++){
3575         for(x=0; x<16; x++){
3576             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3577         }
3578         s1+= stride;
3579         s2+= stride;
3580     }
3581
3582     return score;
3583 }
3584
3585 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3586 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3587 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3588 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3589 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3590 WARPER8_16_SQ(rd8x8_c, rd16_c)
3591 WARPER8_16_SQ(bit8x8_c, bit16_c)
3592
3593 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3594  converted */
3595 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3596 {
3597     j_rev_dct (block);
3598     put_pixels_clamped_c(block, dest, line_size);
3599 }
3600 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3601 {
3602     j_rev_dct (block);
3603     add_pixels_clamped_c(block, dest, line_size);
3604 }
3605
3606 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3607 {
3608     j_rev_dct4 (block);
3609     put_pixels_clamped4_c(block, dest, line_size);
3610 }
3611 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3612 {
3613     j_rev_dct4 (block);
3614     add_pixels_clamped4_c(block, dest, line_size);
3615 }
3616
3617 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3618 {
3619     j_rev_dct2 (block);
3620     put_pixels_clamped2_c(block, dest, line_size);
3621 }
3622 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3623 {
3624     j_rev_dct2 (block);
3625     add_pixels_clamped2_c(block, dest, line_size);
3626 }
3627
3628 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3629 {
3630     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3631
3632     dest[0] = cm[(block[0] + 4)>>3];
3633 }
3634 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3635 {
3636     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3637
3638     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3639 }
3640
3641 /* init static data */
3642 void dsputil_static_init(void)
3643 {
3644     int i;
3645
3646     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3647     for(i=0;i<MAX_NEG_CROP;i++) {
3648         cropTbl[i] = 0;
3649         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3650     }
3651
3652     for(i=0;i<512;i++) {
3653         squareTbl[i] = (i - 256) * (i - 256);
3654     }
3655
3656     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3657 }
3658
3659
3660 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3661 {
3662     int i;
3663
3664 #ifdef CONFIG_ENCODERS
3665     if(avctx->dct_algo==FF_DCT_FASTINT) {
3666         c->fdct = fdct_ifast;
3667         c->fdct248 = fdct_ifast248;
3668     }
3669     else if(avctx->dct_algo==FF_DCT_FAAN) {
3670         c->fdct = ff_faandct;
3671         c->fdct248 = ff_faandct248;
3672     }
3673     else {
3674         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3675         c->fdct248 = ff_fdct248_islow;
3676     }
3677 #endif //CONFIG_ENCODERS
3678
3679     if(avctx->lowres==1){
3680         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3681             c->idct_put= ff_jref_idct4_put;
3682             c->idct_add= ff_jref_idct4_add;
3683         }else{
3684             c->idct_put= ff_h264_lowres_idct_put_c;
3685             c->idct_add= ff_h264_lowres_idct_add_c;
3686         }
3687         c->idct    = j_rev_dct4;
3688         c->idct_permutation_type= FF_NO_IDCT_PERM;
3689     }else if(avctx->lowres==2){
3690         c->idct_put= ff_jref_idct2_put;
3691         c->idct_add= ff_jref_idct2_add;
3692         c->idct    = j_rev_dct2;
3693         c->idct_permutation_type= FF_NO_IDCT_PERM;
3694     }else if(avctx->lowres==3){
3695         c->idct_put= ff_jref_idct1_put;
3696         c->idct_add= ff_jref_idct1_add;
3697         c->idct    = j_rev_dct1;
3698         c->idct_permutation_type= FF_NO_IDCT_PERM;
3699     }else{
3700         if(avctx->idct_algo==FF_IDCT_INT){
3701             c->idct_put= ff_jref_idct_put;
3702             c->idct_add= ff_jref_idct_add;
3703             c->idct    = j_rev_dct;
3704             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3705         }else if(avctx->idct_algo==FF_IDCT_VP3){
3706             c->idct_put= ff_vp3_idct_put_c;
3707             c->idct_add= ff_vp3_idct_add_c;
3708             c->idct    = ff_vp3_idct_c;
3709             c->idct_permutation_type= FF_NO_IDCT_PERM;
3710         }else{ //accurate/default
3711             c->idct_put= simple_idct_put;
3712             c->idct_add= simple_idct_add;
3713             c->idct    = simple_idct;
3714             c->idct_permutation_type= FF_NO_IDCT_PERM;
3715         }
3716     }
3717
3718     c->h264_idct_add= ff_h264_idct_add_c;
3719     c->h264_idct8_add= ff_h264_idct8_add_c;
3720
3721     c->get_pixels = get_pixels_c;
3722     c->diff_pixels = diff_pixels_c;
3723     c->put_pixels_clamped = put_pixels_clamped_c;
3724     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3725     c->add_pixels_clamped = add_pixels_clamped_c;
3726     c->add_pixels8 = add_pixels8_c;
3727     c->add_pixels4 = add_pixels4_c;
3728     c->gmc1 = gmc1_c;
3729     c->gmc = gmc_c;
3730     c->clear_blocks = clear_blocks_c;
3731     c->pix_sum = pix_sum_c;
3732     c->pix_norm1 = pix_norm1_c;
3733
3734     /* TODO [0] 16  [1] 8 */
3735     c->pix_abs[0][0] = pix_abs16_c;
3736     c->pix_abs[0][1] = pix_abs16_x2_c;
3737     c->pix_abs[0][2] = pix_abs16_y2_c;
3738     c->pix_abs[0][3] = pix_abs16_xy2_c;
3739     c->pix_abs[1][0] = pix_abs8_c;
3740     c->pix_abs[1][1] = pix_abs8_x2_c;
3741     c->pix_abs[1][2] = pix_abs8_y2_c;
3742     c->pix_abs[1][3] = pix_abs8_xy2_c;
3743
3744 #define dspfunc(PFX, IDX, NUM) \
3745     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3746     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3747     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3748     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3749
3750     dspfunc(put, 0, 16);
3751     dspfunc(put_no_rnd, 0, 16);
3752     dspfunc(put, 1, 8);
3753     dspfunc(put_no_rnd, 1, 8);
3754     dspfunc(put, 2, 4);
3755     dspfunc(put, 3, 2);
3756
3757     dspfunc(avg, 0, 16);
3758     dspfunc(avg_no_rnd, 0, 16);
3759     dspfunc(avg, 1, 8);
3760     dspfunc(avg_no_rnd, 1, 8);
3761     dspfunc(avg, 2, 4);
3762     dspfunc(avg, 3, 2);
3763 #undef dspfunc
3764
3765     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3766     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3767
3768     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3769     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3770     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3771     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3772     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3773     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3774     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3775     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3776     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3777
3778     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3779     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3780     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3781     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3782     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3783     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3784     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3785     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3786     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3787
3788 #define dspfunc(PFX, IDX, NUM) \
3789     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3790     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3791     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3792     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3793     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3794     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3795     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3796     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3797     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3798     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3799     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3800     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3801     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3802     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3803     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3804     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3805
3806     dspfunc(put_qpel, 0, 16);
3807     dspfunc(put_no_rnd_qpel, 0, 16);
3808
3809     dspfunc(avg_qpel, 0, 16);
3810     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3811
3812     dspfunc(put_qpel, 1, 8);
3813     dspfunc(put_no_rnd_qpel, 1, 8);
3814
3815     dspfunc(avg_qpel, 1, 8);
3816     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3817
3818     dspfunc(put_h264_qpel, 0, 16);
3819     dspfunc(put_h264_qpel, 1, 8);
3820     dspfunc(put_h264_qpel, 2, 4);
3821     dspfunc(avg_h264_qpel, 0, 16);
3822     dspfunc(avg_h264_qpel, 1, 8);
3823     dspfunc(avg_h264_qpel, 2, 4);
3824
3825 #undef dspfunc
3826     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3827     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3828     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3829     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3830     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3831     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3832
3833     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3834     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3835     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3836     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3837     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3838     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3839     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3840     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3841     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3842     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3843     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3844     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3845     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3846     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3847     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3848     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3849     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3850     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3851     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3852     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3853
3854     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3855     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3856     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3857     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3858     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3859     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3860     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3861     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3862
3863 #define SET_CMP_FUNC(name) \
3864     c->name[0]= name ## 16_c;\
3865     c->name[1]= name ## 8x8_c;
3866
3867     SET_CMP_FUNC(hadamard8_diff)
3868     c->hadamard8_diff[4]= hadamard8_intra16_c;
3869     SET_CMP_FUNC(dct_sad)
3870     SET_CMP_FUNC(dct_max)
3871     c->sad[0]= pix_abs16_c;
3872     c->sad[1]= pix_abs8_c;
3873     c->sse[0]= sse16_c;
3874     c->sse[1]= sse8_c;
3875     c->sse[2]= sse4_c;
3876     SET_CMP_FUNC(quant_psnr)
3877     SET_CMP_FUNC(rd)
3878     SET_CMP_FUNC(bit)
3879     c->vsad[0]= vsad16_c;
3880     c->vsad[4]= vsad_intra16_c;
3881     c->vsse[0]= vsse16_c;
3882     c->vsse[4]= vsse_intra16_c;
3883     c->nsse[0]= nsse16_c;
3884     c->nsse[1]= nsse8_c;
3885     c->w53[0]= w53_16_c;
3886     c->w53[1]= w53_8_c;
3887     c->w97[0]= w97_16_c;
3888     c->w97[1]= w97_8_c;
3889
3890     c->add_bytes= add_bytes_c;
3891     c->diff_bytes= diff_bytes_c;
3892     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3893     c->bswap_buf= bswap_buf;
3894
3895     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
3896     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
3897     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
3898     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
3899     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
3900     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
3901
3902     c->h263_h_loop_filter= h263_h_loop_filter_c;
3903     c->h263_v_loop_filter= h263_v_loop_filter_c;
3904
3905     c->h261_loop_filter= h261_loop_filter_c;
3906
3907     c->try_8x8basis= try_8x8basis_c;
3908     c->add_8x8basis= add_8x8basis_c;
3909
3910 #ifdef HAVE_MMX
3911     dsputil_init_mmx(c, avctx);
3912 #endif
3913 #ifdef ARCH_ARMV4L
3914     dsputil_init_armv4l(c, avctx);
3915 #endif
3916 #ifdef HAVE_MLIB
3917     dsputil_init_mlib(c, avctx);
3918 #endif
3919 #ifdef ARCH_SPARC
3920    dsputil_init_vis(c,avctx);
3921 #endif
3922 #ifdef ARCH_ALPHA
3923     dsputil_init_alpha(c, avctx);
3924 #endif
3925 #ifdef ARCH_POWERPC
3926     dsputil_init_ppc(c, avctx);
3927 #endif
3928 #ifdef HAVE_MMI
3929     dsputil_init_mmi(c, avctx);
3930 #endif
3931 #ifdef ARCH_SH4
3932     dsputil_init_sh4(c,avctx);
3933 #endif
3934
3935     switch(c->idct_permutation_type){
3936     case FF_NO_IDCT_PERM:
3937         for(i=0; i<64; i++)
3938             c->idct_permutation[i]= i;
3939         break;
3940     case FF_LIBMPEG2_IDCT_PERM:
3941         for(i=0; i<64; i++)
3942             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3943         break;
3944     case FF_SIMPLE_IDCT_PERM:
3945         for(i=0; i<64; i++)
3946             c->idct_permutation[i]= simple_mmx_permutation[i];
3947         break;
3948     case FF_TRANSPOSE_IDCT_PERM:
3949         for(i=0; i<64; i++)
3950             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3951         break;
3952     case FF_PARTTRANS_IDCT_PERM:
3953         for(i=0; i<64; i++)
3954             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3955         break;
3956     default:
3957         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3958     }
3959 }
3960