quicktime/ffmpeg/libavcodec/libpostproc/postprocess.c

   1 /*
   2     Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
   3
   4     AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
   5
   6     This program is free software; you can redistribute it and/or modify
   7     it under the terms of the GNU General Public License as published by
   8     the Free Software Foundation; either version 2 of the License, or
   9     (at your option) any later version.
  10
  11     This program is distributed in the hope that it will be useful,
  12     but WITHOUT ANY WARRANTY; without even the implied warranty of
  13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14     GNU General Public License for more details.
  15
  16     You should have received a copy of the GNU General Public License
  17     along with this program; if not, write to the Free Software
  18     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19 */
  20
  21 /**
  22  * @file postprocess.c
  23  * postprocessing.
  24  */
  25
  26 /*
  27                         C       MMX     MMX2    3DNow   AltiVec
  28 isVertDC                Ec      Ec                      Ec
  29 isVertMinMaxOk          Ec      Ec                      Ec
  30 doVertLowPass           E               e       e       Ec
  31 doVertDefFilter         Ec      Ec      e       e       Ec
  32 isHorizDC               Ec      Ec                      Ec
  33 isHorizMinMaxOk         a       E                       Ec
  34 doHorizLowPass          E               e       e       Ec
  35 doHorizDefFilter        Ec      Ec      e       e       Ec
  36 do_a_deblock            Ec      E       Ec      E
  37 deRing                  E               e       e*      Ecp
  38 Vertical RKAlgo1        E               a       a
  39 Horizontal RKAlgo1                      a       a
  40 Vertical X1#            a               E       E
  41 Horizontal X1#          a               E       E
  42 LinIpolDeinterlace      e               E       E*
  43 CubicIpolDeinterlace    a               e       e*
  44 LinBlendDeinterlace     e               E       E*
  45 MedianDeinterlace#      E       Ec      Ec
  46 TempDeNoiser#           E               e       e       Ec
  47
  48 * I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work
  49 # more or less self-invented filters, so the exactness isn't too meaningful
  50 E = Exact implementation
  51 e = almost exact implementation (slightly different rounding,...)
  52 a = alternative / approximate impl
  53 c = checked against the other implementations (-vo md5)
  54 p = partially optimized, still some work to do
  55 */
  56
  57 /*
  58 TODO:
  59 reduce the time wasted on the mem transfer
  60 unroll stuff if instructions depend too much on the prior one
  61 move YScale thing to the end instead of fixing QP
  62 write a faster and higher quality deblocking filter :)
  63 make the mainloop more flexible (variable number of blocks at once
  64         (the if/else stuff per block is slowing things down)
  65 compare the quality & speed of all filters
  66 split this huge file
  67 optimize c versions
  68 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
  69 ...
  70 */
  71
  72 //Changelog: use the CVS log
  73
  74 #include "config.h"
  75 #include <inttypes.h>
  76 #include <stdio.h>
  77 #include <stdlib.h>
  78 #include <string.h>
  79 #ifdef HAVE_MALLOC_H
  80 #include <malloc.h>
  81 #endif
  82 //#undef HAVE_MMX2
  83 //#define HAVE_3DNOW
  84 //#undef HAVE_MMX
  85 //#undef ARCH_X86
  86 //#define DEBUG_BRIGHTNESS
  87 #ifdef USE_FASTMEMCPY
  88 #include "fastmemcpy.h"
  89 #endif
  90 #include "postprocess.h"
  91 #include "postprocess_internal.h"
  92
  93 #include "mangle.h" //FIXME should be supressed
  94
  95 #ifdef HAVE_ALTIVEC_H
  96 #include <altivec.h>
  97 #endif
  98
  99 #ifndef HAVE_MEMALIGN
 100 #define memalign(a,b) malloc(b)
 101 #endif
 102
     /*
      * Small arithmetic helper macros.
      * They are plain textual macros: every argument can be expanded, and
      * therefore evaluated, more than once, so never pass an expression with
      * side effects (e.g. i++).
      */
 103 #define MIN(a,b) ((a) > (b) ? (b) : (a))
 104 #define MAX(a,b) ((a) < (b) ? (b) : (a))
 105 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
     /* SIGN never yields 0: SIGN(0) is -1, just like any negative argument. */
 106 #define SIGN(a) ((a) > 0 ? 1 : -1)
 107
 108 #define GET_MODE_BUFFER_SIZE 500
 109 #define OPTIONS_ARRAY_SIZE 10
 110 #define BLOCK_SIZE 8
 111 #define TEMP_STRIDE 8
 112 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
 113
 114 #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
 115 #    define attribute_used __attribute__((used))
 116 #    define always_inline __attribute__((always_inline)) inline
 117 #else
 118 #    define attribute_used
 119 #    define always_inline inline
 120 #endif
 121
 122 #if defined(ARCH_X86) || defined(ARCH_X86_64)
     /*
      * 8-byte aligned 64-bit packed constants.
      *   wXX: the 16-bit value 0x00XX repeated in all four 16-bit lanes
      *   bXX: the byte value 0xXX repeated in all eight byte lanes
      * None of them is referenced by name in the C code of this part of the
      * file; attribute_used keeps the compiler from discarding them.
      * NOTE(review): presumably they are referenced by symbol name from the
      * inline assembly in the included template files - confirm there.
      */
 123 static uint64_t __attribute__((aligned(8))) attribute_used w05=         0x0005000500050005LL;
 124 static uint64_t __attribute__((aligned(8))) attribute_used w04=         0x0004000400040004LL;
 125 static uint64_t __attribute__((aligned(8))) attribute_used w20=         0x0020002000200020LL;
 126 static uint64_t __attribute__((aligned(8))) attribute_used b00=                 0x0000000000000000LL;
 127 static uint64_t __attribute__((aligned(8))) attribute_used b01=                 0x0101010101010101LL;
 128 static uint64_t __attribute__((aligned(8))) attribute_used b02=                 0x0202020202020202LL;
 129 static uint64_t __attribute__((aligned(8))) attribute_used b08=                 0x0808080808080808LL;
 130 static uint64_t __attribute__((aligned(8))) attribute_used b80=                 0x8080808080808080LL;
 131 #endif
 132
     /*
      * clip_table holds 3*256 bytes and clip_tab points 256 entries into it,
      * so clip_tab[i] stays inside the array for -256 <= i <= 511.
      * The table contents are not initialized in this part of the file.
      */
 133 static uint8_t clip_table[3*256];
 134 static uint8_t * const clip_tab= clip_table + 256;
 135
     /* Compile-time constant, fixed to 0 here. */
 136 static const int verbose= 0;
 137
     /*
      * Not read by any code visible in this part of the file.
      * NOTE(review): presumably consumed by the deringing code in the
      * template files (hence attribute_used) - confirm there.
      */
 138 static const int attribute_used deringThreshold= 20;
 139
 140
     /*
      * Table of the known postprocessing filters, terminated by an all-NULL/0
      * entry. Each row is a struct PPFilter (declared outside this file part):
      *   1st field: short filter name
      *   2nd field: long filter name
      *   last field: the filter's mode constant (H_DEBLOCK, DERING, ...)
      * NOTE(review): the three integers in between are presumably a
      * "filter chroma by default" flag followed by the minimum luma and chroma
      * quality levels - confirm against the struct PPFilter declaration.
      */
 141 static struct PPFilter filters[]=
 142 {
 143         {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
 144         {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
 145 /*      {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
 146         {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
 147         {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
 148         {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
 149         {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
 150         {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
 151         {"dr", "dering",                1, 5, 6, DERING},
 152         {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
 153         {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
 154         {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
 155         {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
 156         {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
 157         {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
 158         {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
 159         {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
 160         {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
 161         {NULL, NULL,0,0,0,0} //End Marker
 162 };
 163
     /*
      * Alias table stored as consecutive (alias, expansion) string pairs and
      * terminated by a single NULL: even indices hold the alias, the following
      * odd index holds the filter string it stands for.
      * NOTE(review): the code that performs the substitution is outside this
      * part of the file - confirm how the pairs are consumed.
      */
 164 static char *replaceTable[]=
 165 {
 166         "default",      "hdeblock:a,vdeblock:a,dering:a",
 167         "de",           "hdeblock:a,vdeblock:a,dering:a",
 168         "fast",         "x1hdeblock:a,x1vdeblock:a,dering:a",
 169         "fa",           "x1hdeblock:a,x1vdeblock:a,dering:a",
 170         "ac",           "ha:a:128:7,va:a,dering:a",
 171         NULL //End Marker
 172 };
 173
 174
 175 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 176 static inline void prefetchnta(void *p)
 177 {
 178         asm volatile(   "prefetchnta (%0)\n\t"
 179                 : : "r" (p)
 180         );
 181 }
 182
 183 static inline void prefetcht0(void *p)
 184 {
 185         asm volatile(   "prefetcht0 (%0)\n\t"
 186                 : : "r" (p)
 187         );
 188 }
 189
 190 static inline void prefetcht1(void *p)
 191 {
 192         asm volatile(   "prefetcht1 (%0)\n\t"
 193                 : : "r" (p)
 194         );
 195 }
 196
 197 static inline void prefetcht2(void *p)
 198 {
 199         asm volatile(   "prefetcht2 (%0)\n\t"
 200                 : : "r" (p)
 201         );
 202 }
 203 #endif
 204
 205 // The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
 206
 207 /**
 208  * Check if the given 8x8 Block is mostly "flat"
 209  */
 210 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
 211 {
 212         int numEq= 0;
 213         int y;
 214         const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 215         const int dcThreshold= dcOffset*2 + 1;
 216
 217         for(y=0; y<BLOCK_SIZE; y++)
 218         {
 219                 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
 220                 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
 221                 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
 222                 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
 223                 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
 224                 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
 225                 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
 226                 src+= stride;
 227         }
 228         return numEq > c->ppMode.flatnessThreshold;
 229 }
 230
 231 /**
 232  * Check if the middle 8x8 Block in the given 8x16 block is flat
 233  */
 234 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
 235         int numEq= 0;
 236         int y;
 237         const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 238         const int dcThreshold= dcOffset*2 + 1;
 239
 240         src+= stride*4; // src points to begin of the 8x8 Block
 241         for(y=0; y<BLOCK_SIZE-1; y++)
 242         {
 243                 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
 244                 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
 245                 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
 246                 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
 247                 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
 248                 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
 249                 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
 250                 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
 251                 src+= stride;
 252         }
 253         return numEq > c->ppMode.flatnessThreshold;
 254 }
 255
 256 static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
 257 {
 258         int i;
 259 #if 1
 260         for(i=0; i<2; i++){
 261                 if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
 262                 src += stride;
 263                 if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
 264                 src += stride;
 265                 if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
 266                 src += stride;
 267                 if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
 268                 src += stride;
 269         }
 270 #else
 271         for(i=0; i<8; i++){
 272                 if((unsigned)(src[0] - src[7] + 2*QP) > 4*QP) return 0;
 273                 src += stride;
 274         }
 275 #endif
 276         return 1;
 277 }
 278
 279 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
 280 {
 281 #if 1
 282 #if 1
 283         int x;
 284         src+= stride*4;
 285         for(x=0; x<BLOCK_SIZE; x+=4)
 286         {
 287                 if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
 288                 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
 289                 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
 290                 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
 291         }
 292 #else
 293         int x;
 294         src+= stride*3;
 295         for(x=0; x<BLOCK_SIZE; x++)
 296         {
 297                 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
 298         }
 299 #endif
 300         return 1;
 301 #else
 302         int x;
 303         src+= stride*4;
 304         for(x=0; x<BLOCK_SIZE; x++)
 305         {
 306                 int min=255;
 307                 int max=0;
 308                 int y;
 309                 for(y=0; y<8; y++){
 310                         int v= src[x + y*stride];
 311                         if(v>max) max=v;
 312                         if(v<min) min=v;
 313                 }
 314                 if(max-min > 2*QP) return 0;
 315         }
 316         return 1;
 317 #endif
 318 }
 319
 320 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
 321         if( isHorizDC_C(src, stride, c) ){
 322                 if( isHorizMinMaxOk_C(src, stride, c->QP) )
 323                         return 1;
 324                 else
 325                         return 0;
 326         }else{
 327                 return 2;
 328         }
 329 }
 330
 331 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
 332         if( isVertDC_C(src, stride, c) ){
 333                 if( isVertMinMaxOk_C(src, stride, c->QP) )
 334                         return 1;
 335                 else
 336                         return 0;
 337         }else{
 338                 return 2;
 339         }
 340 }
 341
 342 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
 343 {
 344         int y;
 345         for(y=0; y<BLOCK_SIZE; y++)
 346         {
 347                 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
 348
 349                 if(ABS(middleEnergy) < 8*c->QP)
 350                 {
 351                         const int q=(dst[3] - dst[4])/2;
 352                         const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
 353                         const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
 354
 355                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
 356                         d= MAX(d, 0);
 357
 358                         d= (5*d + 32) >> 6;
 359                         d*= SIGN(-middleEnergy);
 360
 361                         if(q>0)
 362                         {
 363                                 d= d<0 ? 0 : d;
 364                                 d= d>q ? q : d;
 365                         }
 366                         else
 367                         {
 368                                 d= d>0 ? 0 : d;
 369                                 d= d<q ? q : d;
 370                         }
 371
 372                         dst[3]-= d;
 373                         dst[4]+= d;
 374                 }
 375                 dst+= stride;
 376         }
 377 }
 378
 379 /**
 380  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
 381  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
 382  */
 383 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
 384 {
 385         int y;
 386         for(y=0; y<BLOCK_SIZE; y++)
 387         {
 388                 const int first= ABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
 389                 const int last= ABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
 390
 391                 int sums[10];
 392                 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
 393                 sums[1] = sums[0] - first  + dst[3];
 394                 sums[2] = sums[1] - first  + dst[4];
 395                 sums[3] = sums[2] - first  + dst[5];
 396                 sums[4] = sums[3] - first  + dst[6];
 397                 sums[5] = sums[4] - dst[0] + dst[7];
 398                 sums[6] = sums[5] - dst[1] + last;
 399                 sums[7] = sums[6] - dst[2] + last;
 400                 sums[8] = sums[7] - dst[3] + last;
 401                 sums[9] = sums[8] - dst[4] + last;
 402
 403                 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
 404                 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
 405                 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
 406                 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
 407                 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
 408                 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
 409                 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
 410                 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
 411
 412                 dst+= stride;
 413         }
 414 }
 415
 416 /**
 417  * Experimental Filter 1 (Horizontal)
 418  * will not damage linear gradients
 419  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 420  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
 421  * MMX2 version does correct clipping C version doesnt
 422  * not identical with the vertical one
 423  */
 424 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 425 {
 426         int y;
 427         static uint64_t *lut= NULL;
 428         if(lut==NULL)
 429         {
 430                 int i;
 431                 lut= (uint64_t*)memalign(8, 256*8);
 432                 for(i=0; i<256; i++)
 433                 {
 434                         int v= i < 128 ? 2*i : 2*(i-256);
 435 /*
 436 //Simulate 112242211 9-Tap filter
 437                         uint64_t a= (v/16) & 0xFF;
 438                         uint64_t b= (v/8) & 0xFF;
 439                         uint64_t c= (v/4) & 0xFF;
 440                         uint64_t d= (3*v/8) & 0xFF;
 441 */
 442 //Simulate piecewise linear interpolation
 443                         uint64_t a= (v/16) & 0xFF;
 444                         uint64_t b= (v*3/16) & 0xFF;
 445                         uint64_t c= (v*5/16) & 0xFF;
 446                         uint64_t d= (7*v/16) & 0xFF;
 447                         uint64_t A= (0x100 - a)&0xFF;
 448                         uint64_t B= (0x100 - b)&0xFF;
 449                         uint64_t C= (0x100 - c)&0xFF;
 450                         uint64_t D= (0x100 - c)&0xFF;
 451
 452                         lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
 453                                 (D<<24) | (C<<16) | (B<<8) | (A);
 454                         //lut[i] = (v<<32) | (v<<24);
 455                 }
 456         }
 457
 458         for(y=0; y<BLOCK_SIZE; y++)
 459         {
 460                 int a= src[1] - src[2];
 461                 int b= src[3] - src[4];
 462                 int c= src[5] - src[6];
 463
 464                 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
 465
 466                 if(d < QP)
 467                 {
 468                         int v = d * SIGN(-b);
 469
 470                         src[1] +=v/8;
 471                         src[2] +=v/4;
 472                         src[3] +=3*v/8;
 473                         src[4] -=3*v/8;
 474                         src[5] -=v/4;
 475                         src[6] -=v/8;
 476
 477                 }
 478                 src+=stride;
 479         }
 480 }
 481
 482 /**
 483  * accurate deblock filter
 484  */
 485 static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
 486         int y;
 487         const int QP= c->QP;
 488         const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 489         const int dcThreshold= dcOffset*2 + 1;
 490 //START_TIMER
 491         src+= step*4; // src points to begin of the 8x8 Block
 492         for(y=0; y<8; y++){
 493                 int numEq= 0;
 494
 495                 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
 496                 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
 497                 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
 498                 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
 499                 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
 500                 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
 501                 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
 502                 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
 503                 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
 504                 if(numEq > c->ppMode.flatnessThreshold){
 505                         int min, max, x;
 506
 507                         if(src[0] > src[step]){
 508                             max= src[0];
 509                             min= src[step];
 510                         }else{
 511                             max= src[step];
 512                             min= src[0];
 513                         }
 514                         for(x=2; x<8; x+=2){
 515                                 if(src[x*step] > src[(x+1)*step]){
 516                                         if(src[x    *step] > max) max= src[ x   *step];
 517                                         if(src[(x+1)*step] < min) min= src[(x+1)*step];
 518                                 }else{
 519                                         if(src[(x+1)*step] > max) max= src[(x+1)*step];
 520                                         if(src[ x   *step] < min) min= src[ x   *step];
 521                                 }
 522                         }
 523                         if(max-min < 2*QP){
 524                                 const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
 525                                 const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
 526
 527                                 int sums[10];
 528                                 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
 529                                 sums[1] = sums[0] - first       + src[3*step];
 530                                 sums[2] = sums[1] - first       + src[4*step];
 531                                 sums[3] = sums[2] - first       + src[5*step];
 532                                 sums[4] = sums[3] - first       + src[6*step];
 533                                 sums[5] = sums[4] - src[0*step] + src[7*step];
 534                                 sums[6] = sums[5] - src[1*step] + last;
 535                                 sums[7] = sums[6] - src[2*step] + last;
 536                                 sums[8] = sums[7] - src[3*step] + last;
 537                                 sums[9] = sums[8] - src[4*step] + last;
 538
 539                                 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
 540                                 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
 541                                 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
 542                                 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
 543                                 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
 544                                 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
 545                                 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
 546                                 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
 547                         }
 548                 }else{
 549                         const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
 550
 551                         if(ABS(middleEnergy) < 8*QP)
 552                         {
 553                                 const int q=(src[3*step] - src[4*step])/2;
 554                                 const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
 555                                 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
 556
 557                                 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
 558                                 d= MAX(d, 0);
 559
 560                                 d= (5*d + 32) >> 6;
 561                                 d*= SIGN(-middleEnergy);
 562
 563                                 if(q>0)
 564                                 {
 565                                         d= d<0 ? 0 : d;
 566                                         d= d>q ? q : d;
 567                                 }
 568                                 else
 569                                 {
 570                                         d= d>0 ? 0 : d;
 571                                         d= d<q ? q : d;
 572                                 }
 573
 574                                 src[3*step]-= d;
 575                                 src[4*step]+= d;
 576                         }
 577                 }
 578
 579                 src += stride;
 580         }
 581 /*if(step==16){
 582     STOP_TIMER("step16")
 583 }else{
 584     STOP_TIMER("stepX")
 585 }*/
 586 }
 587
     /*
      * Variant selection.
      * First the COMPILE_xxx macros are derived from the build configuration
      * (HAVE_xxx / ARCH_xxx / RUNTIME_CPUDETECT). Then the template file is
      * included once per selected variant; before each inclusion the HAVE_xxx
      * macros are redefined for that variant and RENAME() is set so that every
      * templated name gets the variant suffix (_C, _altivec, _MMX, _MMX2,
      * _3DNow). With RUNTIME_CPUDETECT all variants of the architecture are
      * compiled.
      */
 588 //Note: there are C, MMX, MMX2 and 3DNOW versions; there is no combined 3DNOW+MMX2 one
 589 //Plain C versions
 590 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
 591 #define COMPILE_C
 592 #endif
 593
 594 #ifdef ARCH_POWERPC
 595 #ifdef HAVE_ALTIVEC
 596 #define COMPILE_ALTIVEC
 597 #endif //HAVE_ALTIVEC
 598 #endif //ARCH_POWERPC
 599
 600 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 601
     /* Without runtime detection exactly one of MMX / MMX2 / 3DNow is chosen; MMX2 wins over 3DNow. */
 602 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 603 #define COMPILE_MMX
 604 #endif
 605
 606 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
 607 #define COMPILE_MMX2
 608 #endif
 609
 610 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 611 #define COMPILE_3DNOW
 612 #endif
 613 #endif //ARCH_X86
 614
     /* From here on the HAVE_xxx macros no longer describe the build configuration:
        they are cleared and then set per variant for each template inclusion. */
 615 #undef HAVE_MMX
 616 #undef HAVE_MMX2
 617 #undef HAVE_3DNOW
 618 #undef HAVE_ALTIVEC
 619
 620 #ifdef COMPILE_C
 621 #undef HAVE_MMX
 622 #undef HAVE_MMX2
 623 #undef HAVE_3DNOW
 624 #define RENAME(a) a ## _C
 625 #include "postprocess_template.c"
 626 #endif
 627
 628 #ifdef ARCH_POWERPC
 629 #ifdef COMPILE_ALTIVEC
 630 #undef RENAME
 631 #define HAVE_ALTIVEC
 632 #define RENAME(a) a ## _altivec
 633 #include "postprocess_altivec_template.c"
 634 #include "postprocess_template.c"
 635 #endif
 636 #endif //ARCH_POWERPC
 637
 638 //MMX versions
 639 #ifdef COMPILE_MMX
 640 #undef RENAME
 641 #define HAVE_MMX
 642 #undef HAVE_MMX2
 643 #undef HAVE_3DNOW
 644 #define RENAME(a) a ## _MMX
 645 #include "postprocess_template.c"
 646 #endif
 647
 648 //MMX2 versions
 649 #ifdef COMPILE_MMX2
 650 #undef RENAME
 651 #define HAVE_MMX
 652 #define HAVE_MMX2
 653 #undef HAVE_3DNOW
 654 #define RENAME(a) a ## _MMX2
 655 #include "postprocess_template.c"
 656 #endif
 657
 658 //3DNOW versions
 659 #ifdef COMPILE_3DNOW
 660 #undef RENAME
 661 #define HAVE_MMX
 662 #undef HAVE_MMX2
 663 #define HAVE_3DNOW
 664 #define RENAME(a) a ## _3DNow
 665 #include "postprocess_template.c"
 666 #endif
 667
 668 // minor note: the HAVE_xyz macros are messed up after this point, so don't rely on them
 669
     /**
      * Dispatch one plane to the postProcess_<variant>() implementation.
      * Copies *vm into c->ppMode, then forwards all remaining arguments
      * unchanged to exactly one variant:
      *  - with RUNTIME_CPUDETECT the variant is picked at run time from
      *    c->cpuCaps (x86: MMX2, then 3DNow, then MMX, else C; PowerPC with
      *    AltiVec: AltiVec, else C; any other architecture: C);
      *  - without it the variant is fixed at compile time by the HAVE_xxx
      *    macros as they stand after the template inclusions.
      * NOTE(review): the postProcess_<variant>() functions are presumably
      * generated by postprocess_template.c through RENAME() - confirm there.
      */
 670 static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 671         QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
 672 {
 673         PPContext *c= (PPContext *)vc;
 674         PPMode *ppMode= (PPMode *)vm;
 675         c->ppMode= *ppMode; //FIXME
 676
 677         // using ifs here as they are faster than function pointers, although the
 678         // difference wouldn't be measurable here, but it's much better because
 679         // someone might exchange the cpu without restarting mplayer ;)
 680 #ifdef RUNTIME_CPUDETECT
 681 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 682         // ordered by speed, fastest first
 683         if(c->cpuCaps & PP_CPU_CAPS_MMX2)
 684                 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 685         else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
 686                 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 687         else if(c->cpuCaps & PP_CPU_CAPS_MMX)
 688                 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 689         else
 690                 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 691 #else
 692 #ifdef ARCH_POWERPC
 693 #ifdef HAVE_ALTIVEC
 694         if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
 695                 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 696         else
 697 #endif
 698 #endif
 699                 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 700 #endif
 701 #else //RUNTIME_CPUDETECT
 702 #ifdef HAVE_MMX2
 703                 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 704 #elif defined (HAVE_3DNOW)
 705                 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 706 #elif defined (HAVE_MMX)
 707                 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 708 #elif defined (HAVE_ALTIVEC)
 709                 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 710 #else
 711                 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 712 #endif
 713 #endif //!RUNTIME_CPUDETECT
 714 }
 715
 716 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 717 //      QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
 718
/**
 * Help text for the postprocessing filter string ("-pp" command line option).
 *
 * Lists every filter with its short and long name, the options each filter
 * accepts, the predefined filter combinations and the overall syntax of the
 * string, followed by a few examples. The columns are aligned with spaces so
 * the text can be printed verbatim.
 */
char *pp_help=
"Available postprocessing filters:\n"
"Filters                        Options\n"
"short  long name       short   long option     Description\n"
"*      *               a       autoq           CPU power dependent enabler\n"
"                       c       chrom           chrominance filtering enabled\n"
"                       y       nochrom         chrominance filtering disabled\n"
"                       n       noluma          luma filtering disabled\n"
"hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
"       1. difference factor: default=32, higher -> more deblocking\n"
"       2. flatness threshold: default=39, lower -> more deblocking\n"
"                       the h & v deblocking filters share these\n"
"                       so you can't set different thresholds for h / v\n"
"vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
"ha     hadeblock       (2 threshold)           horizontal deblocking filter\n"
"va     vadeblock       (2 threshold)           vertical deblocking filter\n"
"h1     x1hdeblock                              experimental h deblock filter 1\n"
"v1     x1vdeblock                              experimental v deblock filter 1\n"
"dr     dering                                  deringing filter\n"
"al     autolevels                              automatic brightness / contrast\n"
"                       f       fullyrange      stretch luminance to (0..255)\n"
"lb     linblenddeint                           linear blend deinterlacer\n"
"li     linipoldeint                            linear interpolating deinterlace\n"
"ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
"md     mediandeint                             median deinterlacer\n"
"fd     ffmpegdeint                             ffmpeg deinterlacer\n"
"l5     lowpass5                                FIR lowpass deinterlacer\n"
"de     default                                 hb:a,vb:a,dr:a\n"
"fa     fast                                    h1:a,v1:a,dr:a\n"
"ac                                             ha:a:128:7,va:a,dr:a\n"
"tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
"                       1. <= 2. <= 3.          larger -> stronger filtering\n"
"fq     forceQuant      <quantizer>             force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb                                   de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
;
 763
 764 pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
 765 {
 766         char temp[GET_MODE_BUFFER_SIZE];
 767         char *p= temp;
 768         char *filterDelimiters= ",/";
 769         char *optionDelimiters= ":";
 770         struct PPMode *ppMode;
 771         char *filterToken;
 772
 773         ppMode= memalign(8, sizeof(PPMode));
 774
 775         ppMode->lumMode= 0;
 776         ppMode->chromMode= 0;
 777         ppMode->maxTmpNoise[0]= 700;
 778         ppMode->maxTmpNoise[1]= 1500;
 779         ppMode->maxTmpNoise[2]= 3000;
 780         ppMode->maxAllowedY= 234;
 781         ppMode->minAllowedY= 16;
 782         ppMode->baseDcDiff= 256/8;
 783         ppMode->flatnessThreshold= 56-16-1;
 784         ppMode->maxClippedThreshold= 0.01;
 785         ppMode->error=0;
 786
 787         strncpy(temp, name, GET_MODE_BUFFER_SIZE);
 788
 789         if(verbose>1) printf("pp: %s\n", name);
 790
 791         for(;;){
 792                 char *filterName;
 793                 int q= 1000000; //PP_QUALITY_MAX;
 794                 int chrom=-1;
 795                 int luma=-1;
 796                 char *option;
 797                 char *options[OPTIONS_ARRAY_SIZE];
 798                 int i;
 799                 int filterNameOk=0;
 800                 int numOfUnknownOptions=0;
 801                 int enable=1; //does the user want us to enabled or disabled the filter
 802
 803                 filterToken= strtok(p, filterDelimiters);
 804                 if(filterToken == NULL) break;
 805                 p+= strlen(filterToken) + 1; // p points to next filterToken
 806                 filterName= strtok(filterToken, optionDelimiters);
 807                 if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
 808
 809                 if(*filterName == '-')
 810                 {
 811                         enable=0;
 812                         filterName++;
 813                 }
 814
 815                 for(;;){ //for all options
 816                         option= strtok(NULL, optionDelimiters);
 817                         if(option == NULL) break;
 818
 819                         if(verbose>1) printf("pp: option: %s\n", option);
 820                         if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
 821                         else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
 822                         else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
 823                         else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
 824                         else
 825                         {
 826                                 options[numOfUnknownOptions] = option;
 827                                 numOfUnknownOptions++;
 828                         }
 829                         if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
 830                 }
 831                 options[numOfUnknownOptions] = NULL;
 832
 833                 /* replace stuff from the replace Table */
 834                 for(i=0; replaceTable[2*i]!=NULL; i++)
 835                 {
 836                         if(!strcmp(replaceTable[2*i], filterName))
 837                         {
 838                                 int newlen= strlen(replaceTable[2*i + 1]);
 839                                 int plen;
 840                                 int spaceLeft;
 841
 842                                 if(p==NULL) p= temp, *p=0;      //last filter
 843                                 else p--, *p=',';               //not last filter
 844
 845                                 plen= strlen(p);
 846                                 spaceLeft= p - temp + plen;
 847                                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
 848                                 {
 849                                         ppMode->error++;
 850                                         break;
 851                                 }
 852                                 memmove(p + newlen, p, plen+1);
 853                                 memcpy(p, replaceTable[2*i + 1], newlen);
 854                                 filterNameOk=1;
 855                         }
 856                 }
 857
 858                 for(i=0; filters[i].shortName!=NULL; i++)
 859                 {
 860 //                      printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
 861                         if(   !strcmp(filters[i].longName, filterName)
 862                            || !strcmp(filters[i].shortName, filterName))
 863                         {
 864                                 ppMode->lumMode &= ~filters[i].mask;
 865                                 ppMode->chromMode &= ~filters[i].mask;
 866
 867                                 filterNameOk=1;
 868                                 if(!enable) break; // user wants to disable it
 869
 870                                 if(q >= filters[i].minLumQuality && luma)
 871                                         ppMode->lumMode|= filters[i].mask;
 872                                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
 873                                         if(q >= filters[i].minChromQuality)
 874                                                 ppMode->chromMode|= filters[i].mask;
 875
 876                                 if(filters[i].mask == LEVEL_FIX)
 877                                 {
 878                                         int o;
 879                                         ppMode->minAllowedY= 16;
 880                                         ppMode->maxAllowedY= 234;
 881                                         for(o=0; options[o]!=NULL; o++)
 882                                         {
 883                                                 if(  !strcmp(options[o],"fullyrange")
 884                                                    ||!strcmp(options[o],"f"))
 885                                                 {
 886                                                         ppMode->minAllowedY= 0;
 887                                                         ppMode->maxAllowedY= 255;
 888                                                         numOfUnknownOptions--;
 889                                                 }
 890                                         }
 891                                 }
 892                                 else if(filters[i].mask == TEMP_NOISE_FILTER)
 893                                 {
 894                                         int o;
 895                                         int numOfNoises=0;
 896
 897                                         for(o=0; options[o]!=NULL; o++)
 898                                         {
 899                                                 char *tail;
 900                                                 ppMode->maxTmpNoise[numOfNoises]=
 901                                                         strtol(options[o], &tail, 0);
 902                                                 if(tail!=options[o])
 903                                                 {
 904                                                         numOfNoises++;
 905                                                         numOfUnknownOptions--;
 906                                                         if(numOfNoises >= 3) break;
 907                                                 }
 908                                         }
 909                                 }
 910                                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
 911                                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
 912                                 {
 913                                         int o;
 914
 915                                         for(o=0; options[o]!=NULL && o<2; o++)
 916                                         {
 917                                                 char *tail;
 918                                                 int val= strtol(options[o], &tail, 0);
 919                                                 if(tail==options[o]) break;
 920
 921                                                 numOfUnknownOptions--;
 922                                                 if(o==0) ppMode->baseDcDiff= val;
 923                                                 else ppMode->flatnessThreshold= val;
 924                                         }
 925                                 }
 926                                 else if(filters[i].mask == FORCE_QUANT)
 927                                 {
 928                                         int o;
 929                                         ppMode->forcedQuant= 15;
 930
 931                                         for(o=0; options[o]!=NULL && o<1; o++)
 932                                         {
 933                                                 char *tail;
 934                                                 int val= strtol(options[o], &tail, 0);
 935                                                 if(tail==options[o]) break;
 936
 937                                                 numOfUnknownOptions--;
 938                                                 ppMode->forcedQuant= val;
 939                                         }
 940                                 }
 941                         }
 942                 }
 943                 if(!filterNameOk) ppMode->error++;
 944                 ppMode->error += numOfUnknownOptions;
 945         }
 946
 947         if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
 948         if(ppMode->error)
 949         {
 950                 fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
 951                 free(ppMode);
 952                 return NULL;
 953         }
 954         return ppMode;
 955 }
 956
 957 void pp_free_mode(pp_mode_t *mode){
 958     if(mode) free(mode);
 959 }
 960
 961 static void reallocAlign(void **p, int alignment, int size){
 962         if(*p) free(*p);
 963         *p= memalign(alignment, size);
 964         memset(*p, 0, size);
 965 }
 966
 967 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
 968         int mbWidth = (width+15)>>4;
 969         int mbHeight= (height+15)>>4;
 970         int i;
 971
 972         c->stride= stride;
 973         c->qpStride= qpStride;
 974
 975         reallocAlign((void **)&c->tempDst, 8, stride*24);
 976         reallocAlign((void **)&c->tempSrc, 8, stride*24);
 977         reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
 978         reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
 979         for(i=0; i<256; i++)
 980                 c->yHistogram[i]= width*height/64*15/256;
 981
 982         for(i=0; i<3; i++)
 983         {
 984                 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
 985                 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
 986                 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
 987         }
 988
 989         reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
 990         reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
 991         reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
 992         reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
 993 }
 994
 995 static void global_init(void){
 996         int i;
 997         memset(clip_table, 0, 256);
 998         for(i=256; i<512; i++)
 999                 clip_table[i]= i;
1000         memset(clip_table+512, 0, 256);
1001 }
1002
1003 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
1004         PPContext *c= memalign(32, sizeof(PPContext));
1005         int stride= (width+15)&(~15); //assumed / will realloc if needed
1006         int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
1007
1008         global_init();
1009
1010         memset(c, 0, sizeof(PPContext));
1011         c->cpuCaps= cpuCaps;
1012         if(cpuCaps&PP_FORMAT){
1013                 c->hChromaSubSample= cpuCaps&0x3;
1014                 c->vChromaSubSample= (cpuCaps>>4)&0x3;
1015         }else{
1016                 c->hChromaSubSample= 1;
1017                 c->vChromaSubSample= 1;
1018         }
1019
1020         reallocBuffers(c, width, height, stride, qpStride);
1021
1022         c->frameNum=-1;
1023
1024         return c;
1025 }
1026
1027 void pp_free_context(void *vc){
1028         PPContext *c = (PPContext*)vc;
1029         int i;
1030
1031         for(i=0; i<3; i++) free(c->tempBlured[i]);
1032         for(i=0; i<3; i++) free(c->tempBluredPast[i]);
1033
1034         free(c->tempBlocks);
1035         free(c->yHistogram);
1036         free(c->tempDst);
1037         free(c->tempSrc);
1038         free(c->deintTemp);
1039         free(c->stdQPTable);
1040         free(c->nonBQPTable);
1041         free(c->forcedQPTable);
1042
1043         memset(c, 0, sizeof(PPContext));
1044
1045         free(c);
1046 }
1047
1048 void  pp_postprocess(uint8_t * src[3], int srcStride[3],
1049                  uint8_t * dst[3], int dstStride[3],
1050                  int width, int height,
1051                  QP_STORE_T *QP_store,  int QPStride,
1052                  pp_mode_t *vm,  void *vc, int pict_type)
1053 {
1054         int mbWidth = (width+15)>>4;
1055         int mbHeight= (height+15)>>4;
1056         PPMode *mode = (PPMode*)vm;
1057         PPContext *c = (PPContext*)vc;
1058         int minStride= MAX(ABS(srcStride[0]), ABS(dstStride[0]));
1059         int absQPStride = ABS(QPStride);
1060
1061         // c->stride and c->QPStride are always positive
1062         if(c->stride < minStride || c->qpStride < absQPStride)
1063                 reallocBuffers(c, width, height,
1064                                 MAX(minStride, c->stride),
1065                                 MAX(c->qpStride, absQPStride));
1066
1067         if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1068         {
1069                 int i;
1070                 QP_store= c->forcedQPTable;
1071                 absQPStride = QPStride = 0;
1072                 if(mode->lumMode & FORCE_QUANT)
1073                         for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
1074                 else
1075                         for(i=0; i<mbWidth; i++) QP_store[i]= 1;
1076         }
1077 //printf("pict_type:%d\n", pict_type);
1078
1079         if(pict_type & PP_PICT_TYPE_QP2){
1080                 int i;
1081                 const int count= mbHeight * absQPStride;
1082                 for(i=0; i<(count>>2); i++){
1083                         ((uint32_t*)c->stdQPTable)[i] = (((uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1084                 }
1085                 for(i<<=2; i<count; i++){
1086                         c->stdQPTable[i] = QP_store[i]>>1;
1087                 }
1088                 QP_store= c->stdQPTable;
1089                 QPStride= absQPStride;
1090         }
1091
1092 if(0){
1093 int x,y;
1094 for(y=0; y<mbHeight; y++){
1095         for(x=0; x<mbWidth; x++){
1096                 printf("%2d ", QP_store[x + y*QPStride]);
1097         }
1098         printf("\n");
1099 }
1100         printf("\n");
1101 }
1102
1103         if((pict_type&7)!=3)
1104         {
1105                 if (QPStride >= 0) {
1106                         int i;
1107                         const int count= mbHeight * QPStride;
1108                         for(i=0; i<(count>>2); i++){
1109                                 ((uint32_t*)c->nonBQPTable)[i] = ((uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1110                         }
1111                         for(i<<=2; i<count; i++){
1112                                 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1113                         }
1114                 } else {
1115                         int i,j;
1116                         for(i=0; i<mbHeight; i++) {
1117                                 for(j=0; j<absQPStride; j++) {
1118                                         c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1119                                 }
1120                         }
1121                 }
1122         }
1123
1124         if(verbose>2)
1125         {
1126                 printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
1127         }
1128
1129         postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1130                 width, height, QP_store, QPStride, 0, mode, c);
1131
1132         width  = (width )>>c->hChromaSubSample;
1133         height = (height)>>c->vChromaSubSample;
1134
1135         if(mode->chromMode)
1136         {
1137                 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1138                         width, height, QP_store, QPStride, 1, mode, c);
1139                 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1140                         width, height, QP_store, QPStride, 2, mode, c);
1141         }
1142         else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1143         {
1144                 linecpy(dst[1], src[1], height, srcStride[1]);
1145                 linecpy(dst[2], src[2], height, srcStride[2]);
1146         }
1147         else
1148         {
1149                 int y;
1150                 for(y=0; y<height; y++)
1151                 {
1152                         memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1153                         memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1154                 }
1155         }
1156 }
1157