/*
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <assert.h>
#include <string.h>

#include "libavutil/imgutils.h"
#include "libavutil/internal.h"
#include "avcodec.h"
#include "copy_block.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "imgconvert.h"
#include "mathops.h"
#include "mpegvideo.h"
#include "config.h"
uint8_t  ff_cropTbl[256 + 2 * MAX_NEG_CROP] = { 0, };
uint32_t ff_squareTbl[512] = { 0, };
#define BIT_DEPTH 9
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 10
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 8
#include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the CPU's
// native arithmetic size
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
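
/* ~0UL / 255 replicates 0x01 into every byte of a long (0xFFFFFFFF / 255 =
 * 0x01010101 on a 32-bit machine), so pb_7f and pb_80 hold 0x7f and 0x80 in
 * every byte lane. add_bytes_c() and diff_bytes_c() below use them to
 * process sizeof(long) pixels per operation. */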
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
 * specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* Not permuted inverse zigzag_direct + 1 for the MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
void ff_init_scantable(uint8_t *permutation, ScanTable *st,
                       const uint8_t *src_scantable)
{
    int i, end;

    st->scantable = src_scantable;

    for (i = 0; i < 64; i++) {
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
    }

    end = -1;
    for (i = 0; i < 64; i++) {
        int j;
        j = st->permutated[i];
        if (j > end)
            end = j;
        st->raster_end[i] = end;
    }
}
void ff_init_scantable_permutation(uint8_t *idct_permutation,
                                   int idct_permutation_type)
{
    int i;

    switch (idct_permutation_type) {
    case FF_NO_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
        break;
    case FF_SSE2_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
        break;
    default:
        av_log(NULL, AV_LOG_ERROR,
               "Internal error, IDCT permutation not set\n");
    }
}
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s   += pix[0];
            s   += pix[1];
            s   += pix[2];
            s   += pix[3];
            s   += pix[4];
            s   += pix[5];
            s   += pix[6];
            s   += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static int pix_norm1_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if HAVE_FAST_64BIT
            register uint64_t x = *(uint64_t *) pix;
            s += sq[x         & 0xff];
            s += sq[(x >>  8) & 0xff];
            s += sq[(x >> 16) & 0xff];
            s += sq[(x >> 24) & 0xff];
            s += sq[(x >> 32) & 0xff];
            s += sq[(x >> 40) & 0xff];
            s += sq[(x >> 48) & 0xff];
            s += sq[(x >> 56) & 0xff];
#else
            register uint32_t x = *(uint32_t *) pix;
            s += sq[x         & 0xff];
            s += sq[(x >>  8) & 0xff];
            s += sq[(x >> 16) & 0xff];
            s += sq[(x >> 24) & 0xff];
            x  = *(uint32_t *) (pix + 4);
            s += sq[x         & 0xff];
            s += sq[(x >>  8) & 0xff];
            s += sq[(x >> 16) & 0xff];
            s += sq[(x >> 24) & 0xff];
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
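
/* Note: sq = ff_squareTbl + 256 points at the centre of the 512-entry
 * square table, so sq[d] is defined for any pixel difference d in
 * [-255, 255]; pix_norm1_c above and the sse*_c functions below rely on
 * this to square bytes and byte differences with a single lookup. */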
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int i;

    for (i = 0; i + 8 <= w; i += 8) {
        dst[i + 0] = av_bswap32(src[i + 0]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++)
        dst[i + 0] = av_bswap32(src[i + 0]);
}
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
static int sse4_c(void *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s    += sq[pix1[0] - pix2[0]];
        s    += sq[pix1[1] - pix2[1]];
        s    += sq[pix1[2] - pix2[2]];
        s    += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int sse8_c(void *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s    += sq[pix1[0] - pix2[0]];
        s    += sq[pix1[1] - pix2[1]];
        s    += sq[pix1[2] - pix2[2]];
        s    += sq[pix1[3] - pix2[3]];
        s    += sq[pix1[4] - pix2[4]];
        s    += sq[pix1[5] - pix2[5]];
        s    += sq[pix1[6] - pix2[6]];
        s    += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2,
                   int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s    += sq[pix1[ 0] - pix2[ 0]];
        s    += sq[pix1[ 1] - pix2[ 1]];
        s    += sq[pix1[ 2] - pix2[ 2]];
        s    += sq[pix1[ 3] - pix2[ 3]];
        s    += sq[pix1[ 4] - pix2[ 4]];
        s    += sq[pix1[ 5] - pix2[ 5]];
        s    += sq[pix1[ 6] - pix2[ 6]];
        s    += sq[pix1[ 7] - pix2[ 7]];
        s    += sq[pix1[ 8] - pix2[ 8]];
        s    += sq[pix1[ 9] - pix2[ 9]];
        s    += sq[pix1[10] - pix2[10]];
        s    += sq[pix1[11] - pix2[11]];
        s    += sq[pix1[12] - pix2[12]];
        s    += sq[pix1[13] - pix2[13]];
        s    += sq[pix1[14] - pix2[14]];
        s    += sq[pix1[15] - pix2[15]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int i;

    /* read the pixels */
    for (i = 0; i < 8; i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1      += stride;
        s2      += stride;
        block   += 8;
    }
}
static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for (i = 0; i < 8; i++) {
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);
        pixels[2] = av_clip_uint8(block[2]);
        pixels[3] = av_clip_uint8(block[3]);
        pixels[4] = av_clip_uint8(block[4]);
        pixels[5] = av_clip_uint8(block[5]);
        pixels[6] = av_clip_uint8(block[6]);
        pixels[7] = av_clip_uint8(block[7]);
        pixels   += line_size;
        block    += 8;
    }
}
static void put_signed_pixels_clamped_c(const int16_t *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
static void add_pixels8_c(uint8_t *restrict pixels, int16_t *block,
                          int line_size)
{
    int i;

    for (i = 0; i < 8; i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels    += line_size;
        block     += 8;
    }
}
static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for (i = 0; i < 8; i++) {
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
        pixels[7] = av_clip_uint8(pixels[7] + block[7]);
        pixels   += line_size;
        block    += 8;
    }
}
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum = 0, i;

    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
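
/* Both averages round upward: avg2(a, b) = (a + b + 1) >> 1, so ties go up,
 * and avg4 adds 2 before shifting for the same effect over four samples,
 * matching the MPEG half-pel rounding rules for the rnd case. */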
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (x16)      * (16 - y16);
    const int C = (16 - x16) * (y16);
    const int D = (x16)      * (y16);
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = (A * src[0] + B * src[1] + C * src[stride + 0] + D * src[stride + 1] + rounder) >> 8;
        dst[1] = (A * src[1] + B * src[2] + C * src[stride + 1] + D * src[stride + 2] + rounder) >> 8;
        dst[2] = (A * src[2] + B * src[3] + C * src[stride + 2] + D * src[stride + 3] + rounder) >> 8;
        dst[3] = (A * src[3] + B * src[4] + C * src[stride + 3] + D * src[stride + 4] + rounder) >> 8;
        dst[4] = (A * src[4] + B * src[5] + C * src[stride + 4] + D * src[stride + 5] + rounder) >> 8;
        dst[5] = (A * src[5] + B * src[6] + C * src[stride + 5] + D * src[stride + 6] + rounder) >> 8;
        dst[6] = (A * src[6] + B * src[7] + C * src[stride + 6] + D * src[stride + 7] + rounder) >> 8;
        dst[7] = (A * src[7] + B * src[8] + C * src[stride + 7] + D * src[stride + 8] + rounder) >> 8;
        dst   += stride;
        src   += stride;
    }
}
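
/* The four bilinear weights always satisfy A + B + C + D = 16 * 16 = 256,
 * so the final >> 8 renormalizes the sum; rounder supplies the rounding
 * constant. */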
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    int y, vx, vy;
    const int s = 1 << shift;

    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x;

        vx = ox;
        vy = oy;
        for (x = 0; x < 8; x++) { // XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x  = vx >> 16;
            src_y  = vy >> 16;
            frac_x = src_x & (s - 1);
            frac_y = src_y & (s - 1);
            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned) src_x < width) {
                if ((unsigned) src_y < height) {
                    index = src_x + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index]              * (s - frac_x) +
                          src[index + 1]          * frac_x) * (s - frac_y) +
                         (src[index + stride]     * (s - frac_x) +
                          src[index + stride + 1] * frac_x) * frac_y +
                         r) >> (shift * 2);
                } else {
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] =
                        ((src[index]     * (s - frac_x) +
                          src[index + 1] * frac_x) * s +
                         r) >> (shift * 2);
                }
            } else {
                if ((unsigned) src_y < height) {
                    index = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index]          * (s - frac_y) +
                          src[index + stride] * frac_y) * s +
                         r) >> (shift * 2);
                } else {
                    index = av_clip(src_x, 0, width) +
                            av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
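
/* vx/vy are 16.16 fixed-point accumulators stepped by the affine matrix
 * (dxx, dxy; dyx, dyy): vx >> 16 gives the source coordinate in
 * 1/(1 << shift) pel units, from which frac_x/frac_y take the sub-pel part.
 * The clipped branches replicate edge pixels for out-of-picture samples. */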
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    switch (width) {
    case 2:  put_pixels2_8_c(dst, src, stride, height);  break;
    case 4:  put_pixels4_8_c(dst, src, stride, height);  break;
    case 8:  put_pixels8_8_c(dst, src, stride, height);  break;
    case 16: put_pixels16_8_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (2 * src[j] + src[j + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (src[j] + 2 * src[j + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (2 * src[j] + src[j + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (4 * src[j] + 3 * src[j + 1] +
                              3 * src[j + stride] + 2 * src[j + stride + 1] +
                              6)) >> 15;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (3 * src[j] + 2 * src[j + 1] +
                              4 * src[j + stride] + 3 * src[j + stride + 1] +
                              6)) >> 15;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (src[j] + 2 * src[j + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (3 * src[j] + 4 * src[j + 1] +
                              2 * src[j + stride] + 3 * src[j + stride + 1] +
                              6)) >> 15;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (2 * src[j] + 3 * src[j + 1] +
                              3 * src[j + stride] + 4 * src[j + stride + 1] +
                              6)) >> 15;
        src += stride;
        dst += stride;
    }
}
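
/* 683 = round(2^11 / 3) and 2731 = round(2^15 / 12), so e.g.
 * (683 * (2 * a + b + 1)) >> 11 evaluates (2 * a + b) / 3 with rounding:
 * exact thirdpel interpolation in fixed point (used by the SVQ3 decoder).
 * The avg_ variants below additionally average with the existing dst. */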
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    switch (width) {
    case 2:  avg_pixels2_8_c(dst, src, stride, height);  break;
    case 4:  avg_pixels4_8_c(dst, src, stride, height);  break;
    case 8:  avg_pixels8_8_c(dst, src, stride, height);  break;
    case 16: avg_pixels16_8_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((683 * (2 * src[j] + src[j + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((683 * (src[j] + 2 * src[j + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((683 * (2 * src[j] + src[j + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((2731 * (4 * src[j] + 3 * src[j + 1] +
                                3 * src[j + stride] + 2 * src[j + stride + 1] +
                                6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((2731 * (3 * src[j] + 2 * src[j + 1] +
                                4 * src[j + stride] + 3 * src[j + stride + 1] +
                                6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((683 * (src[j] + 2 * src[j + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((2731 * (3 * src[j] + 4 * src[j + 1] +
                                2 * src[j + stride] + 3 * src[j + stride + 1] +
                                6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((2731 * (2 * src[j] + 3 * src[j + 1] +
                                3 * src[j + stride] + 4 * src[j + stride + 1] +
                                6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    const int w=16;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
}\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
}\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
#define op_avg(a, b)        a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
#define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5]) >> 1)
#define op_put(a, b)        a = cm[((b) + 16) >> 5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
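
/* Each OP receives the raw lowpass sum; the taps (20, -6, 3, -1) applied to
 * both neighbours sum to 32, so cm[((b) + 16) >> 5] divides by 32 with
 * rounding (15 for the no_rnd variants) and clamps via the crop table,
 * which tolerates out-of-range indices up to MAX_NEG_CROP on either side. */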
QPEL_MC(0, put_,        _,        op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_,        _,        op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
                                  int dstStride, int srcStride, int h)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
        dst[1] = cm[(9 * (src[1] + src[2]) - (src[ 0] + src[3]) + 8) >> 4];
        dst[2] = cm[(9 * (src[2] + src[3]) - (src[ 1] + src[4]) + 8) >> 4];
        dst[3] = cm[(9 * (src[3] + src[4]) - (src[ 2] + src[5]) + 8) >> 4];
        dst[4] = cm[(9 * (src[4] + src[5]) - (src[ 3] + src[6]) + 8) >> 4];
        dst[5] = cm[(9 * (src[5] + src[6]) - (src[ 4] + src[7]) + 8) >> 4];
        dst[6] = cm[(9 * (src[6] + src[7]) - (src[ 5] + src[8]) + 8) >> 4];
        dst[7] = cm[(9 * (src[7] + src[8]) - (src[ 6] + src[9]) + 8) >> 4];
        dst   += dstStride;
        src   += srcStride;
    }
}
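
/* WMV2 half-pel filter: weights (-1, 9, 9, -1) sum to 16, so the + 8 and
 * >> 4 divide with rounding; cm clamps the result to 0..255. */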
#if CONFIG_RV40_DECODER
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
                                  int dstStride, int srcStride, int w)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for (i = 0; i < w; i++) {
        const int src_1 = src[-srcStride];
        const int src0  = src[0];
        const int src1  = src[srcStride];
        const int src2  = src[2 * srcStride];
        const int src3  = src[3 * srcStride];
        const int src4  = src[4 * srcStride];
        const int src5  = src[5 * srcStride];
        const int src6  = src[6 * srcStride];
        const int src7  = src[7 * srcStride];
        const int src8  = src[8 * srcStride];
        const int src9  = src[9 * srcStride];
        dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
        dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0  + src3) + 8) >> 4];
        dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1  + src4) + 8) >> 4];
        dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2  + src5) + 8) >> 4];
        dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3  + src6) + 8) >> 4];
        dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4  + src7) + 8) >> 4];
        dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5  + src8) + 8) >> 4];
        dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6  + src9) + 8) >> 4];
        src++;
        dst++;
    }
}
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t halfH[88];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH + 8, stride, 8, 8);
}
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        int x;
        const int strength = ff_h263_loop_filter_strength[qscale];

        for (x = 0; x < 8; x++) {
            int d1, d2, ad1;
            int p0 = src[x - 2 * stride];
            int p1 = src[x - 1 * stride];
            int p2 = src[x + 0 * stride];
            int p3 = src[x + 1 * stride];
            int d  = (p0 - p3 + 4 * (p2 - p1)) / 8;

            if (d < -2 * strength)
                d1 = 0;
            else if (d < -strength)
                d1 = -2 * strength - d;
            else if (d < strength)
                d1 = d;
            else if (d < 2 * strength)
                d1 = 2 * strength - d;
            else
                d1 = 0;

            p1 += d1;
            p2 -= d1;
            if (p1 & 256)
                p1 = ~(p1 >> 31);
            if (p2 & 256)
                p2 = ~(p2 >> 31);

            src[x - 1 * stride] = p1;
            src[x + 0 * stride] = p2;

            ad1 = FFABS(d1) >> 1;

            d2 = av_clip((p0 - p3) / 4, -ad1, ad1);

            src[x - 2 * stride] = p0 - d2;
            src[x + stride]     = p3 + d2;
        }
    }
}
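
/* d1 implements a deadzone: small d passes through (filtering likely
 * blocking noise), larger d is ramped back down, and |d| >= 2*strength is
 * left untouched as a genuine edge. The (p & 256) test catches both p < 0
 * and p > 255 in one branch; ~(p >> 31) then resolves to 0 for negative
 * values and 255 for overflows. */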
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        int y;
        const int strength = ff_h263_loop_filter_strength[qscale];

        for (y = 0; y < 8; y++) {
            int d1, d2, ad1;
            int p0 = src[y * stride - 2];
            int p1 = src[y * stride - 1];
            int p2 = src[y * stride + 0];
            int p3 = src[y * stride + 1];
            int d  = (p0 - p3 + 4 * (p2 - p1)) / 8;

            if (d < -2 * strength)
                d1 = 0;
            else if (d < -strength)
                d1 = -2 * strength - d;
            else if (d < strength)
                d1 = d;
            else if (d < 2 * strength)
                d1 = 2 * strength - d;
            else
                d1 = 0;

            p1 += d1;
            p2 -= d1;
            if (p1 & 256)
                p1 = ~(p1 >> 31);
            if (p2 & 256)
                p2 = ~(p2 >> 31);

            src[y * stride - 1] = p1;
            src[y * stride + 0] = p2;

            ad1 = FFABS(d1) >> 1;

            d2 = av_clip((p0 - p3) / 4, -ad1, ad1);

            src[y * stride - 2] = p0 - d2;
            src[y * stride + 1] = p3 + d2;
        }
    }
}
static void h261_loop_filter_c(uint8_t *src, int stride)
{
    int x, y, xy, yz;
    int temp[64];

    for (x = 0; x < 8; x++) {
        temp[x]         = 4 * src[x];
        temp[x + 7 * 8] = 4 * src[x + 7 * stride];
    }
    for (y = 1; y < 7; y++) {
        for (x = 0; x < 8; x++) {
            xy       = y * stride + x;
            yz       = y * 8      + x;
            temp[yz] = src[xy - stride] + 2 * src[xy] + src[xy + stride];
        }
    }

    for (y = 0; y < 8; y++) {
        src[y * stride]     = (temp[y * 8]     + 2) >> 2;
        src[7 + y * stride] = (temp[7 + y * 8] + 2) >> 2;
        for (x = 1; x < 7; x++) {
            xy      = y * stride + x;
            yz      = y * 8      + x;
            src[xy] = (temp[yz - 1] + 2 * temp[yz] + temp[yz + 1] + 8) >> 4;
        }
    }
}
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2,
                              int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s    += abs(pix1[0]  - pix2[0]);
        s    += abs(pix1[1]  - pix2[1]);
        s    += abs(pix1[2]  - pix2[2]);
        s    += abs(pix1[3]  - pix2[3]);
        s    += abs(pix1[4]  - pix2[4]);
        s    += abs(pix1[5]  - pix2[5]);
        s    += abs(pix1[6]  - pix2[6]);
        s    += abs(pix1[7]  - pix2[7]);
        s    += abs(pix1[8]  - pix2[8]);
        s    += abs(pix1[9]  - pix2[9]);
        s    += abs(pix1[10] - pix2[10]);
        s    += abs(pix1[11] - pix2[11]);
        s    += abs(pix1[12] - pix2[12]);
        s    += abs(pix1[13] - pix2[13]);
        s    += abs(pix1[14] - pix2[14]);
        s    += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
        s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
        s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
        s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
        s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
        s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
        s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
        s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
        s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
        s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
        s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
        s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
        s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
        s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
        s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
        s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
        s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
        s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
        s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
        s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
        s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                           int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
        s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
        s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
        s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
        s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
        s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
        s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
        s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
        s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
        s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
        s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2,
                             int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s    += abs(pix1[0] - pix2[0]);
        s    += abs(pix1[1] - pix2[1]);
        s    += abs(pix1[2] - pix2[2]);
        s    += abs(pix1[3] - pix2[3]);
        s    += abs(pix1[4] - pix2[4]);
        s    += abs(pix1[5] - pix2[5]);
        s    += abs(pix1[6] - pix2[6]);
        s    += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
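
/* The pix_abs* functions above are the C reference SAD metrics for motion
 * estimation: plain (integer-pel), _x2 (horizontal half-pel), _y2 (vertical
 * half-pel) and _xy2 (both), with avg2()/avg4() synthesizing the half-pel
 * reference on the fly. */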
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    MpegEncContext *c = v;
    int score1 = 0;
    int score2 = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++) {
            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
        }
        if (y + 1 < h) {
            for (x = 0; x < 15; x++) {
                score2 += FFABS(s1[x]     - s1[x + stride]
                              - s1[x + 1] + s1[x + 1 + stride])
                        - FFABS(s2[x]     - s2[x + stride]
                              - s2[x + 1] + s2[x + 1 + stride]);
            }
        }
        s1 += stride;
        s2 += stride;
    }

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    MpegEncContext *c = v;
    int score1 = 0;
    int score2 = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) {
            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
        }
        if (y + 1 < h) {
            for (x = 0; x < 7; x++) {
                score2 += FFABS(s1[x]     - s1[x + stride]
                              - s1[x + 1] + s1[x + 1 + stride])
                        - FFABS(s2[x]     - s2[x + stride]
                              - s2[x + 1] + s2[x + 1 + stride]);
            }
        }
        s1 += stride;
        s2 += stride;
    }

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
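
/* NSSE = noise-preserving SSE: score1 is the plain squared error, score2
 * the difference between the 2x2 gradient structure of source and
 * reference. Candidates that keep the texture/noise pattern are penalized
 * less; nsse_weight (or 8 by default) balances the two terms. */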
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
                          int16_t basis[64], int scale)
{
    int i;
    unsigned int sum = 0;

    for (i = 0; i < 8 * 8; i++) {
        int b = rem[i] + ((basis[i] * scale +
                           (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
                          (BASIS_SHIFT - RECON_SHIFT));
        int w = weight[i];
        b >>= RECON_SHIFT;
        assert(-512 < b && b < 512);

        sum += (w * b) * (w * b) >> 4;
    }
    return sum >> 2;
}
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
{
    int i;

    for (i = 0; i < 8 * 8; i++)
        rem[i] += (basis[i] * scale +
                   (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
                  (BASIS_SHIFT - RECON_SHIFT);
}
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
{
    int i;

    memset(cmp, 0, sizeof(void *) * 6);

    for (i = 0; i < 6; i++) {
        switch (type & 0xFF) {
        case FF_CMP_SAD:
            cmp[i] = c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i] = c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i] = c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i] = c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i] = c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i] = c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i] = c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i] = c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i] = c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i] = c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i] = c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i] = zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i] = c->nsse[i];
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,
                   "internal error in cmp function selection\n");
        }
    }
}
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
{
    long i;

    for (i = 0; i <= w - (long) sizeof(long); i += sizeof(long)) {
        long a = *(long *) (src + i);
        long b = *(long *) (dst + i);
        *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
    }
    for (; i < w; i++)
        dst[i + 0] += src[i + 0];
}
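
/* SWAR byte addition: the low 7 bits of every byte lane are summed in one
 * native add ((a & pb_7f) + (b & pb_7f)); masking first keeps carries from
 * crossing lane boundaries. The partial sum's bit 7 already holds the carry
 * out of the low bits, so XORing in ((a ^ b) & pb_80) adds the two top bits
 * modulo 2, discarding inter-byte carries. */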
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
{
    long i;

#if !HAVE_FAST_UNALIGNED
    if ((long) src2 & (sizeof(long) - 1)) {
        for (i = 0; i + 7 < w; i += 8) {
            dst[i + 0] = src1[i + 0] - src2[i + 0];
            dst[i + 1] = src1[i + 1] - src2[i + 1];
            dst[i + 2] = src1[i + 2] - src2[i + 2];
            dst[i + 3] = src1[i + 3] - src2[i + 3];
            dst[i + 4] = src1[i + 4] - src2[i + 4];
            dst[i + 5] = src1[i + 5] - src2[i + 5];
            dst[i + 6] = src1[i + 6] - src2[i + 6];
            dst[i + 7] = src1[i + 7] - src2[i + 7];
        }
    } else
#endif
    for (i = 0; i <= w - (long) sizeof(long); i += sizeof(long)) {
        long a = *(long *) (src1 + i);
        long b = *(long *) (src2 + i);
        *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
                              ((a ^ b ^ pb_80) & pb_80);
    }
    for (; i < w; i++)
        dst[i + 0] = src1[i + 0] - src2[i + 0];
}
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
{
    int i;
    uint8_t l, lt;

    l  = *left;
    lt = *left_top;

    for (i = 0; i < w; i++) {
        l      = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF) + diff[i];
        lt     = src1[i];
        dst[i] = l;
    }

    *left     = l;
    *left_top = lt;
}
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
{
    int i;
    uint8_t l, lt;

    l  = *left;
    lt = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        lt     = src1[i];
        l      = src2[i];
        dst[i] = l - pred;
    }

    *left     = l;
    *left_top = lt;
}
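
/* mid_pred(l, t, l + t - lt) is the median (MED / LOCO-I) predictor used by
 * HuffYUV: the median of the left and top neighbours and their gradient
 * estimate l + t - lt. add_ reconstructs pixels from residuals, sub_
 * produces the residuals. */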
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
                                      int w, int acc)
{
    int i;

    for (i = 0; i < w - 1; i++) {
        acc   += src[i];
        dst[i] = acc;
        i++;
        acc   += src[i];
        dst[i] = acc;
    }

    for (; i < w; i++) {
        acc   += src[i];
        dst[i] = acc;
    }

    return acc;
}
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif

static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
                                             int w, int *red, int *green,
                                             int *blue, int *alpha)
{
    int i, r, g, b, a;

    r = *red;
    g = *green;
    b = *blue;
    a = *alpha;

    for (i = 0; i < w; i++) {
        b += src[4 * i + B];
        g += src[4 * i + G];
        r += src[4 * i + R];
        a += src[4 * i + A];

        dst[4 * i + B] = b;
        dst[4 * i + G] = g;
        dst[4 * i + R] = r;
        dst[4 * i + A] = a;
    }

    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}

#undef B
#undef G
#undef R
#undef A
#define BUTTERFLY2(o1, o2, i1, i2) \
    o1 = (i1) + (i2);              \
    o2 = (i1) - (i2);

#define BUTTERFLY1(x, y) \
    {                    \
        int a, b;        \
        a = x;           \
        b = y;           \
        x = a + b;       \
        y = a - b;       \
    }

#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
static int hadamard8_diff8x8_c(/* MpegEncContext */ void *s, uint8_t *dst,
                               uint8_t *src, int stride, int h)
{
    int i, temp[64], sum = 0;

    assert(h == 8);

    for (i = 0; i < 8; i++) {
        // FIXME: try pointer walks
        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
                   src[stride * i + 0] - dst[stride * i + 0],
                   src[stride * i + 1] - dst[stride * i + 1]);
        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
                   src[stride * i + 2] - dst[stride * i + 2],
                   src[stride * i + 3] - dst[stride * i + 3]);
        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
                   src[stride * i + 4] - dst[stride * i + 4],
                   src[stride * i + 5] - dst[stride * i + 5]);
        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
                   src[stride * i + 6] - dst[stride * i + 6],
                   src[stride * i + 7] - dst[stride * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
    }

    for (i = 0; i < 8; i++) {
        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);

        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);

        sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
             + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
             + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
             + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
    }
    return sum;
}
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
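/* DCT-based SAD: forward-transform the difference block and sum the absolute
 * values of the DCT coefficients, which weights errors roughly the way the
 * transform coder will see them. */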
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
#if CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
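/* dct264_sad applies DCT8_1D first to the rows (SRC/DST indexing dct[i][x])
 * and then to the columns (dct[x][i]); in the second pass DST() directly
 * accumulates the absolute transformed values instead of storing them. */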
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
    int16_t * const bak = temp+64;
    int sum=0, i;

    assert(h==8);

    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(int16_t));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct_8(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
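/* Rate-distortion metric: quantize the difference block, count the bits the
 * entropy coder would spend on it (run/level VLC lengths), reconstruct, and
 * return SSE distortion plus a qscale-weighted bit cost. */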
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
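/* Vertical-gradient metrics: vsad/vsse sum absolute (or squared) differences
 * between vertically adjacent lines, so they react strongly to vertical
 * high-frequency content such as interlacing. */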
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x] - s2[x] - s1[x+stride] + s2[x+stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
#define SQ(a) ((a)*(a))
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x] - s2[x] - s1[x+stride] + s2[x+stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);

    return score;
}
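/* WRAPPER8_16_SQ builds a 16-pixel-wide comparison function out of an 8x8
 * one by evaluating the 8x8 quadrants: the top two always, the bottom two
 * only when h==16. */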
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst           , src           , stride, 8);\
    score +=name8(s, dst+8         , src+8         , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst           , src           , stride, 8);\
        score +=name8(s, dst+8         , src+8         , stride, 8);\
    }\
    return score;\
}
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
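/* Float clipping via integer compares: IEEE-754 floats of equal sign order
 * like sign-magnitude integers, so when min < 0 < max the clip can in effect
 * be done on the raw bit patterns (mini is the negative bound, maxisign the
 * positive bound with its sign bit flipped for an unsigned compare). */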
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int res = 0;

    while (order--)
        res += *v1++ * *v2++;

    return res;
}
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}
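/* Apply a symmetric window in Q15 fixed point: each iteration windows one
 * sample from the front and its mirror from the back with the same
 * coefficient, adding (1 << 14) for round-to-nearest before the >> 15. */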
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w       = window[i];
        output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
    }
}
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
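/* ff_cropTbl clamps indices in [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] to the
 * byte range: the middle 256 entries are the identity mapping, the outer
 * regions saturate to 0 and 255. ff_squareTbl gives (i - 256)^2 so squared
 * errors can be looked up instead of multiplied. */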
/* init static data */
av_cold void ff_dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
int ff_check_alignment(void){
    static int did_fail=0;
    LOCAL_ALIGNED_16(int, aligned, [4]);

    if((intptr_t)aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to Libav developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    ff_check_alignment();

#if CONFIG_ENCODERS
    if (avctx->bits_per_raw_sample == 10) {
        c->fdct    = ff_jpeg_fdct_islow_10;
        c->fdct248 = ff_fdct248_islow_10;
    } else {
        if(avctx->dct_algo==FF_DCT_FASTINT) {
            c->fdct    = ff_fdct_ifast;
            c->fdct248 = ff_fdct_ifast248;
        }
        else if(avctx->dct_algo==FF_DCT_FAAN) {
            c->fdct    = ff_faandct;
            c->fdct248 = ff_faandct248;
        }
        else {
            c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
            c->fdct248 = ff_fdct248_islow_8;
        }
    }
#endif //CONFIG_ENCODERS
    if (avctx->bits_per_raw_sample == 10) {
        c->idct_put              = ff_simple_idct_put_10;
        c->idct_add              = ff_simple_idct_add_10;
        c->idct                  = ff_simple_idct_10;
        c->idct_permutation_type = FF_NO_IDCT_PERM;
    } else {
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = ff_j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put = ff_simple_idct_put_8;
            c->idct_add = ff_simple_idct_add_8;
            c->idct     = ff_simple_idct_8;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;

    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

#undef dspfunc
    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->vector_clip_int32 = vector_clip_int32_c;

    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->add_pixels8 = add_pixels8_c;
#define hpel_funcs(prefix, idx, num) \
    c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \
    c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \
    c->prefix ## _pixels_tab idx [2] = prefix ## _pixels ## num ## _y2_8_c; \
    c->prefix ## _pixels_tab idx [3] = prefix ## _pixels ## num ## _xy2_8_c

    hpel_funcs(put, [0], 16);
    hpel_funcs(put, [1],  8);
    hpel_funcs(put, [2],  4);
    hpel_funcs(put, [3],  2);
    hpel_funcs(put_no_rnd, [0], 16);
    hpel_funcs(put_no_rnd, [1],  8);
    hpel_funcs(avg, [0], 16);
    hpel_funcs(avg, [1],  8);
    hpel_funcs(avg, [2],  4);
    hpel_funcs(avg, [3],  2);
    hpel_funcs(avg_no_rnd,, 16);
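/* Pixel-access helpers come in 16- and 32-bit DCT-buffer flavours per bit
 * depth; BIT_DEPTH_FUNCS picks the get_pixels/draw_edges/clear_block(s)
 * implementations matching the codec's bits_per_raw_sample. */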
#define FUNC(f, depth) f ## _ ## depth
#define FUNCC(f, depth) f ## _ ## depth ## _c

#define BIT_DEPTH_FUNCS(depth, dct)\
    c->get_pixels    = FUNCC(get_pixels   ## dct , depth);\
    c->draw_edges    = FUNCC(draw_edges          , depth);\
    c->clear_block   = FUNCC(clear_block  ## dct , depth);\
    c->clear_blocks  = FUNCC(clear_blocks ## dct , depth);

    switch (avctx->bits_per_raw_sample) {
    case 9:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(9, _32);
        } else {
            BIT_DEPTH_FUNCS(9, _16);
        }
        break;
    case 10:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(10, _32);
        } else {
            BIT_DEPTH_FUNCS(10, _16);
        }
        break;
    default:
        BIT_DEPTH_FUNCS(8, _16);
        break;
    }
    if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
    if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
    if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
    ff_init_scantable_permutation(c->idct_permutation,
                                  c->idct_permutation_type);
}