2 Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
4 AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 C MMX MMX2 3DNow AltiVec
29 isVertMinMaxOk Ec Ec Ec
30 doVertLowPass E e e Ec
31 doVertDefFilter Ec Ec e e Ec
33 isHorizMinMaxOk a E Ec
34 doHorizLowPass E e e Ec
35 doHorizDefFilter Ec Ec e e Ec
36 do_a_deblock Ec E Ec E
38 Vertical RKAlgo1 E a a
39 Horizontal RKAlgo1 a a
42 LinIpolDeinterlace e E E*
43 CubicIpolDeinterlace a e e*
44 LinBlendDeinterlace e E E*
45 MedianDeinterlace# E Ec Ec
46 TempDeNoiser# E e e Ec
48 * I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work
49 # more or less self-invented filters, so the exactness isn't too meaningful
50 E = Exact implementation
51 e = almost exact implementation (slightly different rounding,...)
52 a = alternative / approximate impl
53 c = checked against the other implementations (-vo md5)
54 p = partially optimized, still some work to do
59 reduce the time wasted on the mem transfer
60 unroll stuff if instructions depend too much on the prior one
61 move YScale thing to the end instead of fixing QP
62 write a faster and higher quality deblocking filter :)
63 make the mainloop more flexible (variable number of blocks at once
64 (the if/else stuff per block is slowing things down)
65 compare the quality & speed of all filters
68 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
72 //Changelog: use the CVS log
86 //#define DEBUG_BRIGHTNESS
88 #include "fastmemcpy.h"
90 #include "postprocess.h"
91 #include "postprocess_internal.h"
93 #include "mangle.h" //FIXME should be supressed
/* Fall back to plain malloc() when the platform provides no memalign();
 * the requested alignment is then ignored. */
#ifndef HAVE_MEMALIGN
#define memalign(a,b) malloc(b)
#endif

/* Small arithmetic helpers. Every argument may be evaluated more than once,
 * so never pass an expression with side effects. */
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define SIGN(a) ((a) > 0 ? 1 : -1)	/* note: SIGN(0) yields -1 */

#define GET_MODE_BUFFER_SIZE 500	/* scratch buffer size used when parsing a mode string */
#define OPTIONS_ARRAY_SIZE 10		/* capacity of the per-filter option pointer array */
#define BLOCK_SIZE 8
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet

/* GCC >= 3.1 understands the "used" and "always_inline" attributes;
 * other compilers get harmless fallbacks. */
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
#    define attribute_used __attribute__((used))
#    define always_inline __attribute__((always_inline)) inline
#else
#    define attribute_used
#    define always_inline inline
#endif
#if defined(ARCH_X86) || defined(ARCH_X86_64)
/* 8-byte aligned packed constants: wNN repeats a 16-bit word four times,
 * bNN repeats a byte eight times. They are marked attribute_used so the
 * compiler keeps them even when no C code references them (presumably they
 * are referenced by name from inline asm in the template -- not visible here). */
static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL;
static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
static uint64_t __attribute__((aligned(8))) attribute_used b00= 0x0000000000000000LL;
static uint64_t __attribute__((aligned(8))) attribute_used b01= 0x0101010101010101LL;
static uint64_t __attribute__((aligned(8))) attribute_used b02= 0x0202020202020202LL;
static uint64_t __attribute__((aligned(8))) attribute_used b08= 0x0808080808080808LL;
static uint64_t __attribute__((aligned(8))) attribute_used b80= 0x8080808080808080LL;
#endif
133 static uint8_t clip_table
[3*256];
134 static uint8_t * const clip_tab
= clip_table
+ 256;
136 static const int verbose
= 0;
138 static const int attribute_used deringThreshold
= 20;
141 static struct PPFilter filters
[]=
143 {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK
},
144 {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK
},
145 /* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
146 {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
147 {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER
},
148 {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER
},
149 {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK
},
150 {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK
},
151 {"dr", "dering", 1, 5, 6, DERING
},
152 {"al", "autolevels", 0, 1, 2, LEVEL_FIX
},
153 {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER
},
154 {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER
},
155 {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER
},
156 {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER
},
157 {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER
},
158 {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER
},
159 {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER
},
160 {"fq", "forcequant", 1, 0, 0, FORCE_QUANT
},
161 {NULL
, NULL
,0,0,0,0} //End Marker
/* Alias table used while parsing a mode string: entries come in pairs,
 * { alias name, filter list it expands to }. The lookup loop stops at the
 * first NULL alias name, so the NULL end marker is mandatory.
 * The strings are literals and must never be written to, hence const. */
static const char * const replaceTable[]=
{
	"default", "hdeblock:a,vdeblock:a,dering:a",
	"de",      "hdeblock:a,vdeblock:a,dering:a",
	"fast",    "x1hdeblock:a,x1vdeblock:a,dering:a",
	"fa",      "x1hdeblock:a,x1vdeblock:a,dering:a",
	"ac",      "ha:a:128:7,va:a,dering:a",
	NULL //End Marker
};
#if defined(ARCH_X86) || defined(ARCH_X86_64)
/* Thin wrappers that issue the x86 prefetch instruction of the same name
 * for the address p. */
static inline void prefetchnta(void *p)
{
	asm volatile(	"prefetchnta (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht0(void *p)
{
	asm volatile(	"prefetcht0 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht1(void *p)
{
	asm volatile(	"prefetcht1 (%0)\n\t"
		: : "r" (p)
	);
}

static inline void prefetcht2(void *p)
{
	asm volatile(	"prefetcht2 (%0)\n\t"
		: : "r" (p)
	);
}
#endif
205 // The horizontal Functions exist only in C cuz the MMX code is faster with vertical filters and transposing
208 * Check if the given 8x8 Block is mostly "flat"
210 static inline int isHorizDC_C(uint8_t src
[], int stride
, PPContext
*c
)
214 const int dcOffset
= ((c
->nonBQP
*c
->ppMode
.baseDcDiff
)>>8) + 1;
215 const int dcThreshold
= dcOffset
*2 + 1;
217 for(y
=0; y
<BLOCK_SIZE
; y
++)
219 if(((unsigned)(src
[0] - src
[1] + dcOffset
)) < dcThreshold
) numEq
++;
220 if(((unsigned)(src
[1] - src
[2] + dcOffset
)) < dcThreshold
) numEq
++;
221 if(((unsigned)(src
[2] - src
[3] + dcOffset
)) < dcThreshold
) numEq
++;
222 if(((unsigned)(src
[3] - src
[4] + dcOffset
)) < dcThreshold
) numEq
++;
223 if(((unsigned)(src
[4] - src
[5] + dcOffset
)) < dcThreshold
) numEq
++;
224 if(((unsigned)(src
[5] - src
[6] + dcOffset
)) < dcThreshold
) numEq
++;
225 if(((unsigned)(src
[6] - src
[7] + dcOffset
)) < dcThreshold
) numEq
++;
228 return numEq
> c
->ppMode
.flatnessThreshold
;
232 * Check if the middle 8x8 Block in the given 8x16 block is flat
234 static inline int isVertDC_C(uint8_t src
[], int stride
, PPContext
*c
){
237 const int dcOffset
= ((c
->nonBQP
*c
->ppMode
.baseDcDiff
)>>8) + 1;
238 const int dcThreshold
= dcOffset
*2 + 1;
240 src
+= stride
*4; // src points to begin of the 8x8 Block
241 for(y
=0; y
<BLOCK_SIZE
-1; y
++)
243 if(((unsigned)(src
[0] - src
[0+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
244 if(((unsigned)(src
[1] - src
[1+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
245 if(((unsigned)(src
[2] - src
[2+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
246 if(((unsigned)(src
[3] - src
[3+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
247 if(((unsigned)(src
[4] - src
[4+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
248 if(((unsigned)(src
[5] - src
[5+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
249 if(((unsigned)(src
[6] - src
[6+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
250 if(((unsigned)(src
[7] - src
[7+stride
] + dcOffset
)) < dcThreshold
) numEq
++;
253 return numEq
> c
->ppMode
.flatnessThreshold
;
/**
 * Cheap range test for an 8x8 block (horizontal direction).
 *
 * Instead of computing the true min/max, one pixel pair five columns apart
 * is sampled per row (columns 0/5, 2/7, 4/1, 6/3, pattern repeated twice);
 * the block is rejected as soon as one sampled pair differs by more than 2*QP.
 *
 * @param src    first pixel of the block
 * @param stride distance in bytes between two rows
 * @param QP     quantizer used to scale the tolerance
 * @return 1 if every sampled pair satisfies |a - b| <= 2*QP, otherwise 0
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
	int i;
#if 1
	for(i=0; i<2; i++){
		/* (unsigned)(a - b + 2*QP) > 4*QP  <=>  |a - b| > 2*QP */
		if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
		src += stride;
		if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
		src += stride;
		if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
		src += stride;
		if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
		src += stride;
	}
#else
	/* disabled alternative: compare first and last column of every row */
	for(i=0; i<8; i++){
		if((unsigned)(src[0] - src[7] + 2*QP) > 4*QP) return 0;
		src += stride;
	}
#endif
	return 1;
}
279 static inline int isVertMinMaxOk_C(uint8_t src
[], int stride
, int QP
)
285 for(x
=0; x
<BLOCK_SIZE
; x
+=4)
287 if((unsigned)(src
[ x
+ 0*stride
] - src
[ x
+ 5*stride
] + 2*QP
) > 4*QP
) return 0;
288 if((unsigned)(src
[1+x
+ 2*stride
] - src
[1+x
+ 7*stride
] + 2*QP
) > 4*QP
) return 0;
289 if((unsigned)(src
[2+x
+ 4*stride
] - src
[2+x
+ 1*stride
] + 2*QP
) > 4*QP
) return 0;
290 if((unsigned)(src
[3+x
+ 6*stride
] - src
[3+x
+ 3*stride
] + 2*QP
) > 4*QP
) return 0;
295 for(x
=0; x
<BLOCK_SIZE
; x
++)
297 if((unsigned)(src
[x
+ stride
] - src
[x
+ (stride
<<3)] + 2*QP
) > 4*QP
) return 0;
304 for(x
=0; x
<BLOCK_SIZE
; x
++)
310 int v
= src
[x
+ y
*stride
];
314 if(max
-min
> 2*QP
) return 0;
320 static inline int horizClassify_C(uint8_t src
[], int stride
, PPContext
*c
){
321 if( isHorizDC_C(src
, stride
, c
) ){
322 if( isHorizMinMaxOk_C(src
, stride
, c
->QP
) )
331 static inline int vertClassify_C(uint8_t src
[], int stride
, PPContext
*c
){
332 if( isVertDC_C(src
, stride
, c
) ){
333 if( isVertMinMaxOk_C(src
, stride
, c
->QP
) )
342 static inline void doHorizDefFilter_C(uint8_t dst
[], int stride
, PPContext
*c
)
345 for(y
=0; y
<BLOCK_SIZE
; y
++)
347 const int middleEnergy
= 5*(dst
[4] - dst
[3]) + 2*(dst
[2] - dst
[5]);
349 if(ABS(middleEnergy
) < 8*c
->QP
)
351 const int q
=(dst
[3] - dst
[4])/2;
352 const int leftEnergy
= 5*(dst
[2] - dst
[1]) + 2*(dst
[0] - dst
[3]);
353 const int rightEnergy
= 5*(dst
[6] - dst
[5]) + 2*(dst
[4] - dst
[7]);
355 int d
= ABS(middleEnergy
) - MIN( ABS(leftEnergy
), ABS(rightEnergy
) );
359 d
*= SIGN(-middleEnergy
);
380 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
381 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
383 static inline void doHorizLowPass_C(uint8_t dst
[], int stride
, PPContext
*c
)
386 for(y
=0; y
<BLOCK_SIZE
; y
++)
388 const int first
= ABS(dst
[-1] - dst
[0]) < c
->QP
? dst
[-1] : dst
[0];
389 const int last
= ABS(dst
[8] - dst
[7]) < c
->QP
? dst
[8] : dst
[7];
392 sums
[0] = 4*first
+ dst
[0] + dst
[1] + dst
[2] + 4;
393 sums
[1] = sums
[0] - first
+ dst
[3];
394 sums
[2] = sums
[1] - first
+ dst
[4];
395 sums
[3] = sums
[2] - first
+ dst
[5];
396 sums
[4] = sums
[3] - first
+ dst
[6];
397 sums
[5] = sums
[4] - dst
[0] + dst
[7];
398 sums
[6] = sums
[5] - dst
[1] + last
;
399 sums
[7] = sums
[6] - dst
[2] + last
;
400 sums
[8] = sums
[7] - dst
[3] + last
;
401 sums
[9] = sums
[8] - dst
[4] + last
;
403 dst
[0]= (sums
[0] + sums
[2] + 2*dst
[0])>>4;
404 dst
[1]= (sums
[1] + sums
[3] + 2*dst
[1])>>4;
405 dst
[2]= (sums
[2] + sums
[4] + 2*dst
[2])>>4;
406 dst
[3]= (sums
[3] + sums
[5] + 2*dst
[3])>>4;
407 dst
[4]= (sums
[4] + sums
[6] + 2*dst
[4])>>4;
408 dst
[5]= (sums
[5] + sums
[7] + 2*dst
[5])>>4;
409 dst
[6]= (sums
[6] + sums
[8] + 2*dst
[6])>>4;
410 dst
[7]= (sums
[7] + sums
[9] + 2*dst
[7])>>4;
417 * Experimental Filter 1 (Horizontal)
418 * will not damage linear gradients
419 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
420 * can only smooth blocks at the expected locations (it cant smooth them if they did move)
421 * MMX2 version does correct clipping C version doesnt
422 * not identical with the vertical one
424 static inline void horizX1Filter(uint8_t *src
, int stride
, int QP
)
427 static uint64_t *lut
= NULL
;
431 lut
= (uint64_t*)memalign(8, 256*8);
434 int v
= i
< 128 ? 2*i
: 2*(i
-256);
436 //Simulate 112242211 9-Tap filter
437 uint64_t a= (v/16) & 0xFF;
438 uint64_t b= (v/8) & 0xFF;
439 uint64_t c= (v/4) & 0xFF;
440 uint64_t d= (3*v/8) & 0xFF;
442 //Simulate piecewise linear interpolation
443 uint64_t a
= (v
/16) & 0xFF;
444 uint64_t b
= (v
*3/16) & 0xFF;
445 uint64_t c
= (v
*5/16) & 0xFF;
446 uint64_t d
= (7*v
/16) & 0xFF;
447 uint64_t A
= (0x100 - a
)&0xFF;
448 uint64_t B
= (0x100 - b
)&0xFF;
449 uint64_t C
= (0x100 - c
)&0xFF;
450 uint64_t D
= (0x100 - c
)&0xFF;
452 lut
[i
] = (a
<<56) | (b
<<48) | (c
<<40) | (d
<<32) |
453 (D
<<24) | (C
<<16) | (B
<<8) | (A
);
454 //lut[i] = (v<<32) | (v<<24);
458 for(y
=0; y
<BLOCK_SIZE
; y
++)
460 int a
= src
[1] - src
[2];
461 int b
= src
[3] - src
[4];
462 int c
= src
[5] - src
[6];
464 int d
= MAX(ABS(b
) - (ABS(a
) + ABS(c
))/2, 0);
468 int v
= d
* SIGN(-b
);
483 * accurate deblock filter
485 static always_inline
void do_a_deblock_C(uint8_t *src
, int step
, int stride
, PPContext
*c
){
488 const int dcOffset
= ((c
->nonBQP
*c
->ppMode
.baseDcDiff
)>>8) + 1;
489 const int dcThreshold
= dcOffset
*2 + 1;
491 src
+= step
*4; // src points to begin of the 8x8 Block
495 if(((unsigned)(src
[-1*step
] - src
[0*step
] + dcOffset
)) < dcThreshold
) numEq
++;
496 if(((unsigned)(src
[ 0*step
] - src
[1*step
] + dcOffset
)) < dcThreshold
) numEq
++;
497 if(((unsigned)(src
[ 1*step
] - src
[2*step
] + dcOffset
)) < dcThreshold
) numEq
++;
498 if(((unsigned)(src
[ 2*step
] - src
[3*step
] + dcOffset
)) < dcThreshold
) numEq
++;
499 if(((unsigned)(src
[ 3*step
] - src
[4*step
] + dcOffset
)) < dcThreshold
) numEq
++;
500 if(((unsigned)(src
[ 4*step
] - src
[5*step
] + dcOffset
)) < dcThreshold
) numEq
++;
501 if(((unsigned)(src
[ 5*step
] - src
[6*step
] + dcOffset
)) < dcThreshold
) numEq
++;
502 if(((unsigned)(src
[ 6*step
] - src
[7*step
] + dcOffset
)) < dcThreshold
) numEq
++;
503 if(((unsigned)(src
[ 7*step
] - src
[8*step
] + dcOffset
)) < dcThreshold
) numEq
++;
504 if(numEq
> c
->ppMode
.flatnessThreshold
){
507 if(src
[0] > src
[step
]){
515 if(src
[x
*step
] > src
[(x
+1)*step
]){
516 if(src
[x
*step
] > max
) max
= src
[ x
*step
];
517 if(src
[(x
+1)*step
] < min
) min
= src
[(x
+1)*step
];
519 if(src
[(x
+1)*step
] > max
) max
= src
[(x
+1)*step
];
520 if(src
[ x
*step
] < min
) min
= src
[ x
*step
];
524 const int first
= ABS(src
[-1*step
] - src
[0]) < QP
? src
[-1*step
] : src
[0];
525 const int last
= ABS(src
[8*step
] - src
[7*step
]) < QP
? src
[8*step
] : src
[7*step
];
528 sums
[0] = 4*first
+ src
[0*step
] + src
[1*step
] + src
[2*step
] + 4;
529 sums
[1] = sums
[0] - first
+ src
[3*step
];
530 sums
[2] = sums
[1] - first
+ src
[4*step
];
531 sums
[3] = sums
[2] - first
+ src
[5*step
];
532 sums
[4] = sums
[3] - first
+ src
[6*step
];
533 sums
[5] = sums
[4] - src
[0*step
] + src
[7*step
];
534 sums
[6] = sums
[5] - src
[1*step
] + last
;
535 sums
[7] = sums
[6] - src
[2*step
] + last
;
536 sums
[8] = sums
[7] - src
[3*step
] + last
;
537 sums
[9] = sums
[8] - src
[4*step
] + last
;
539 src
[0*step
]= (sums
[0] + sums
[2] + 2*src
[0*step
])>>4;
540 src
[1*step
]= (sums
[1] + sums
[3] + 2*src
[1*step
])>>4;
541 src
[2*step
]= (sums
[2] + sums
[4] + 2*src
[2*step
])>>4;
542 src
[3*step
]= (sums
[3] + sums
[5] + 2*src
[3*step
])>>4;
543 src
[4*step
]= (sums
[4] + sums
[6] + 2*src
[4*step
])>>4;
544 src
[5*step
]= (sums
[5] + sums
[7] + 2*src
[5*step
])>>4;
545 src
[6*step
]= (sums
[6] + sums
[8] + 2*src
[6*step
])>>4;
546 src
[7*step
]= (sums
[7] + sums
[9] + 2*src
[7*step
])>>4;
549 const int middleEnergy
= 5*(src
[4*step
] - src
[3*step
]) + 2*(src
[2*step
] - src
[5*step
]);
551 if(ABS(middleEnergy
) < 8*QP
)
553 const int q
=(src
[3*step
] - src
[4*step
])/2;
554 const int leftEnergy
= 5*(src
[2*step
] - src
[1*step
]) + 2*(src
[0*step
] - src
[3*step
]);
555 const int rightEnergy
= 5*(src
[6*step
] - src
[5*step
]) + 2*(src
[4*step
] - src
[7*step
]);
557 int d
= ABS(middleEnergy
) - MIN( ABS(leftEnergy
), ABS(rightEnergy
) );
561 d
*= SIGN(-middleEnergy
);
588 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
590 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
596 #define COMPILE_ALTIVEC
597 #endif //HAVE_ALTIVEC
598 #endif //ARCH_POWERPC
600 #if defined(ARCH_X86) || defined(ARCH_X86_64)
602 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
606 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
610 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
611 #define COMPILE_3DNOW
624 #define RENAME(a) a ## _C
625 #include "postprocess_template.c"
629 #ifdef COMPILE_ALTIVEC
632 #define RENAME(a) a ## _altivec
633 #include "postprocess_altivec_template.c"
634 #include "postprocess_template.c"
636 #endif //ARCH_POWERPC
644 #define RENAME(a) a ## _MMX
645 #include "postprocess_template.c"
654 #define RENAME(a) a ## _MMX2
655 #include "postprocess_template.c"
664 #define RENAME(a) a ## _3DNow
665 #include "postprocess_template.c"
668 // minor note: the HAVE_xyz is messed up after that line so dont use it
670 static inline void postProcess(uint8_t src
[], int srcStride
, uint8_t dst
[], int dstStride
, int width
, int height
,
671 QP_STORE_T QPs
[], int QPStride
, int isColor
, pp_mode_t
*vm
, pp_context_t
*vc
)
673 PPContext
*c
= (PPContext
*)vc
;
674 PPMode
*ppMode
= (PPMode
*)vm
;
675 c
->ppMode
= *ppMode
; //FIXME
677 // useing ifs here as they are faster than function pointers allthough the
678 // difference wouldnt be messureable here but its much better because
679 // someone might exchange the cpu whithout restarting mplayer ;)
680 #ifdef RUNTIME_CPUDETECT
681 #if defined(ARCH_X86) || defined(ARCH_X86_64)
682 // ordered per speed fasterst first
683 if(c
->cpuCaps
& PP_CPU_CAPS_MMX2
)
684 postProcess_MMX2(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
685 else if(c
->cpuCaps
& PP_CPU_CAPS_3DNOW
)
686 postProcess_3DNow(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
687 else if(c
->cpuCaps
& PP_CPU_CAPS_MMX
)
688 postProcess_MMX(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
690 postProcess_C(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
694 if(c
->cpuCaps
& PP_CPU_CAPS_ALTIVEC
)
695 postProcess_altivec(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
699 postProcess_C(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
701 #else //RUNTIME_CPUDETECT
703 postProcess_MMX2(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
704 #elif defined (HAVE_3DNOW)
705 postProcess_3DNow(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
706 #elif defined (HAVE_MMX)
707 postProcess_MMX(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
708 #elif defined (HAVE_ALTIVEC)
709 postProcess_altivec(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
711 postProcess_C(src
, srcStride
, dst
, dstStride
, width
, height
, QPs
, QPStride
, isColor
, c
);
713 #endif //!RUNTIME_CPUDETECT
716 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
717 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
719 /* -pp Command line Help
722 "Available postprocessing filters:\n"
724 "short long name short long option Description\n"
725 "* * a autoq CPU power dependent enabler\n"
726 " c chrom chrominance filtering enabled\n"
727 " y nochrom chrominance filtering disabled\n"
728 " n noluma luma filtering disabled\n"
729 "hb hdeblock (2 threshold) horizontal deblocking filter\n"
730 " 1. difference factor: default=32, higher -> more deblocking\n"
731 " 2. flatness threshold: default=39, lower -> more deblocking\n"
732 " the h & v deblocking filters share these\n"
733 " so you can't set different thresholds for h / v\n"
734 "vb vdeblock (2 threshold) vertical deblocking filter\n"
735 "ha hadeblock (2 threshold) horizontal deblocking filter\n"
736 "va vadeblock (2 threshold) vertical deblocking filter\n"
737 "h1 x1hdeblock experimental h deblock filter 1\n"
738 "v1 x1vdeblock experimental v deblock filter 1\n"
739 "dr dering deringing filter\n"
740 "al autolevels automatic brightness / contrast\n"
741 " f fullyrange stretch luminance to (0..255)\n"
742 "lb linblenddeint linear blend deinterlacer\n"
743 "li linipoldeint linear interpolating deinterlace\n"
744 "ci cubicipoldeint cubic interpolating deinterlacer\n"
745 "md mediandeint median deinterlacer\n"
746 "fd ffmpegdeint ffmpeg deinterlacer\n"
747 "l5 lowpass5 FIR lowpass deinterlacer\n"
748 "de default hb:a,vb:a,dr:a\n"
749 "fa fast h1:a,v1:a,dr:a\n"
750 "ac ha:a:128:7,va:a,dr:a\n"
751 "tn tmpnoise (3 threshold) temporal noise reducer\n"
752 " 1. <= 2. <= 3. larger -> stronger filtering\n"
753 "fq forceQuant <quantizer> force quantizer\n"
755 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
756 "long form example:\n"
757 "vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
758 "short form example:\n"
759 "vb:a/hb:a/lb de,-vb\n"
764 pp_mode_t
*pp_get_mode_by_name_and_quality(char *name
, int quality
)
766 char temp
[GET_MODE_BUFFER_SIZE
];
768 char *filterDelimiters
= ",/";
769 char *optionDelimiters
= ":";
770 struct PPMode
*ppMode
;
773 ppMode
= memalign(8, sizeof(PPMode
));
776 ppMode
->chromMode
= 0;
777 ppMode
->maxTmpNoise
[0]= 700;
778 ppMode
->maxTmpNoise
[1]= 1500;
779 ppMode
->maxTmpNoise
[2]= 3000;
780 ppMode
->maxAllowedY
= 234;
781 ppMode
->minAllowedY
= 16;
782 ppMode
->baseDcDiff
= 256/8;
783 ppMode
->flatnessThreshold
= 56-16-1;
784 ppMode
->maxClippedThreshold
= 0.01;
787 strncpy(temp
, name
, GET_MODE_BUFFER_SIZE
);
789 if(verbose
>1) printf("pp: %s\n", name
);
793 int q
= 1000000; //PP_QUALITY_MAX;
797 char *options
[OPTIONS_ARRAY_SIZE
];
800 int numOfUnknownOptions
=0;
801 int enable
=1; //does the user want us to enabled or disabled the filter
803 filterToken
= strtok(p
, filterDelimiters
);
804 if(filterToken
== NULL
) break;
805 p
+= strlen(filterToken
) + 1; // p points to next filterToken
806 filterName
= strtok(filterToken
, optionDelimiters
);
807 if(verbose
>1) printf("pp: %s::%s\n", filterToken
, filterName
);
809 if(*filterName
== '-')
815 for(;;){ //for all options
816 option
= strtok(NULL
, optionDelimiters
);
817 if(option
== NULL
) break;
819 if(verbose
>1) printf("pp: option: %s\n", option
);
820 if(!strcmp("autoq", option
) || !strcmp("a", option
)) q
= quality
;
821 else if(!strcmp("nochrom", option
) || !strcmp("y", option
)) chrom
=0;
822 else if(!strcmp("chrom", option
) || !strcmp("c", option
)) chrom
=1;
823 else if(!strcmp("noluma", option
) || !strcmp("n", option
)) luma
=0;
826 options
[numOfUnknownOptions
] = option
;
827 numOfUnknownOptions
++;
829 if(numOfUnknownOptions
>= OPTIONS_ARRAY_SIZE
-1) break;
831 options
[numOfUnknownOptions
] = NULL
;
833 /* replace stuff from the replace Table */
834 for(i
=0; replaceTable
[2*i
]!=NULL
; i
++)
836 if(!strcmp(replaceTable
[2*i
], filterName
))
838 int newlen
= strlen(replaceTable
[2*i
+ 1]);
842 if(p
==NULL
) p
= temp
, *p
=0; //last filter
843 else p
--, *p
=','; //not last filter
846 spaceLeft
= p
- temp
+ plen
;
847 if(spaceLeft
+ newlen
>= GET_MODE_BUFFER_SIZE
)
852 memmove(p
+ newlen
, p
, plen
+1);
853 memcpy(p
, replaceTable
[2*i
+ 1], newlen
);
858 for(i
=0; filters
[i
].shortName
!=NULL
; i
++)
860 // printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
861 if( !strcmp(filters
[i
].longName
, filterName
)
862 || !strcmp(filters
[i
].shortName
, filterName
))
864 ppMode
->lumMode
&= ~filters
[i
].mask
;
865 ppMode
->chromMode
&= ~filters
[i
].mask
;
868 if(!enable
) break; // user wants to disable it
870 if(q
>= filters
[i
].minLumQuality
&& luma
)
871 ppMode
->lumMode
|= filters
[i
].mask
;
872 if(chrom
==1 || (chrom
==-1 && filters
[i
].chromDefault
))
873 if(q
>= filters
[i
].minChromQuality
)
874 ppMode
->chromMode
|= filters
[i
].mask
;
876 if(filters
[i
].mask
== LEVEL_FIX
)
879 ppMode
->minAllowedY
= 16;
880 ppMode
->maxAllowedY
= 234;
881 for(o
=0; options
[o
]!=NULL
; o
++)
883 if( !strcmp(options
[o
],"fullyrange")
884 ||!strcmp(options
[o
],"f"))
886 ppMode
->minAllowedY
= 0;
887 ppMode
->maxAllowedY
= 255;
888 numOfUnknownOptions
--;
892 else if(filters
[i
].mask
== TEMP_NOISE_FILTER
)
897 for(o
=0; options
[o
]!=NULL
; o
++)
900 ppMode
->maxTmpNoise
[numOfNoises
]=
901 strtol(options
[o
], &tail
, 0);
905 numOfUnknownOptions
--;
906 if(numOfNoises
>= 3) break;
910 else if(filters
[i
].mask
== V_DEBLOCK
|| filters
[i
].mask
== H_DEBLOCK
911 || filters
[i
].mask
== V_A_DEBLOCK
|| filters
[i
].mask
== H_A_DEBLOCK
)
915 for(o
=0; options
[o
]!=NULL
&& o
<2; o
++)
918 int val
= strtol(options
[o
], &tail
, 0);
919 if(tail
==options
[o
]) break;
921 numOfUnknownOptions
--;
922 if(o
==0) ppMode
->baseDcDiff
= val
;
923 else ppMode
->flatnessThreshold
= val
;
926 else if(filters
[i
].mask
== FORCE_QUANT
)
929 ppMode
->forcedQuant
= 15;
931 for(o
=0; options
[o
]!=NULL
&& o
<1; o
++)
934 int val
= strtol(options
[o
], &tail
, 0);
935 if(tail
==options
[o
]) break;
937 numOfUnknownOptions
--;
938 ppMode
->forcedQuant
= val
;
943 if(!filterNameOk
) ppMode
->error
++;
944 ppMode
->error
+= numOfUnknownOptions
;
947 if(verbose
>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode
->lumMode
, ppMode
->chromMode
);
950 fprintf(stderr
, "%d errors in postprocess string \"%s\"\n", ppMode
->error
, name
);
957 void pp_free_mode(pp_mode_t
*mode
){
/**
 * Replace the buffer *p by a freshly allocated, zero-filled one.
 *
 * The old contents are NOT preserved: the previous buffer (if any) is freed
 * first. On allocation failure *p is left NULL.
 *
 * @param p         address of the buffer pointer to replace
 * @param alignment requested alignment in bytes (ignored when memalign()
 *                  is mapped to malloc())
 * @param size      size of the new buffer in bytes
 */
static void reallocAlign(void **p, int alignment, int size){
	free(*p);
	*p= memalign(alignment, size);
	if(*p) memset(*p, 0, size);
}
967 static void reallocBuffers(PPContext
*c
, int width
, int height
, int stride
, int qpStride
){
968 int mbWidth
= (width
+15)>>4;
969 int mbHeight
= (height
+15)>>4;
973 c
->qpStride
= qpStride
;
975 reallocAlign((void **)&c
->tempDst
, 8, stride
*24);
976 reallocAlign((void **)&c
->tempSrc
, 8, stride
*24);
977 reallocAlign((void **)&c
->tempBlocks
, 8, 2*16*8);
978 reallocAlign((void **)&c
->yHistogram
, 8, 256*sizeof(uint64_t));
980 c
->yHistogram
[i
]= width
*height
/64*15/256;
984 //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
985 reallocAlign((void **)&c
->tempBlured
[i
], 8, stride
*mbHeight
*16 + 17*1024);
986 reallocAlign((void **)&c
->tempBluredPast
[i
], 8, 256*((height
+7)&(~7))/2 + 17*1024);//FIXME size
989 reallocAlign((void **)&c
->deintTemp
, 8, 2*width
+32);
990 reallocAlign((void **)&c
->nonBQPTable
, 8, qpStride
*mbHeight
*sizeof(QP_STORE_T
));
991 reallocAlign((void **)&c
->stdQPTable
, 8, qpStride
*mbHeight
*sizeof(QP_STORE_T
));
992 reallocAlign((void **)&c
->forcedQPTable
, 8, mbWidth
*sizeof(QP_STORE_T
));
995 static void global_init(void){
997 memset(clip_table
, 0, 256);
998 for(i
=256; i
<512; i
++)
1000 memset(clip_table
+512, 0, 256);
1003 pp_context_t
*pp_get_context(int width
, int height
, int cpuCaps
){
1004 PPContext
*c
= memalign(32, sizeof(PPContext
));
1005 int stride
= (width
+15)&(~15); //assumed / will realloc if needed
1006 int qpStride
= (width
+15)/16 + 2; //assumed / will realloc if needed
1010 memset(c
, 0, sizeof(PPContext
));
1011 c
->cpuCaps
= cpuCaps
;
1012 if(cpuCaps
&PP_FORMAT
){
1013 c
->hChromaSubSample
= cpuCaps
&0x3;
1014 c
->vChromaSubSample
= (cpuCaps
>>4)&0x3;
1016 c
->hChromaSubSample
= 1;
1017 c
->vChromaSubSample
= 1;
1020 reallocBuffers(c
, width
, height
, stride
, qpStride
);
1027 void pp_free_context(void *vc
){
1028 PPContext
*c
= (PPContext
*)vc
;
1031 for(i
=0; i
<3; i
++) free(c
->tempBlured
[i
]);
1032 for(i
=0; i
<3; i
++) free(c
->tempBluredPast
[i
]);
1034 free(c
->tempBlocks
);
1035 free(c
->yHistogram
);
1039 free(c
->stdQPTable
);
1040 free(c
->nonBQPTable
);
1041 free(c
->forcedQPTable
);
1043 memset(c
, 0, sizeof(PPContext
));
1048 void pp_postprocess(uint8_t * src
[3], int srcStride
[3],
1049 uint8_t * dst
[3], int dstStride
[3],
1050 int width
, int height
,
1051 QP_STORE_T
*QP_store
, int QPStride
,
1052 pp_mode_t
*vm
, void *vc
, int pict_type
)
1054 int mbWidth
= (width
+15)>>4;
1055 int mbHeight
= (height
+15)>>4;
1056 PPMode
*mode
= (PPMode
*)vm
;
1057 PPContext
*c
= (PPContext
*)vc
;
1058 int minStride
= MAX(ABS(srcStride
[0]), ABS(dstStride
[0]));
1059 int absQPStride
= ABS(QPStride
);
1061 // c->stride and c->QPStride are always positive
1062 if(c
->stride
< minStride
|| c
->qpStride
< absQPStride
)
1063 reallocBuffers(c
, width
, height
,
1064 MAX(minStride
, c
->stride
),
1065 MAX(c
->qpStride
, absQPStride
));
1067 if(QP_store
==NULL
|| (mode
->lumMode
& FORCE_QUANT
))
1070 QP_store
= c
->forcedQPTable
;
1071 absQPStride
= QPStride
= 0;
1072 if(mode
->lumMode
& FORCE_QUANT
)
1073 for(i
=0; i
<mbWidth
; i
++) QP_store
[i
]= mode
->forcedQuant
;
1075 for(i
=0; i
<mbWidth
; i
++) QP_store
[i
]= 1;
1077 //printf("pict_type:%d\n", pict_type);
1079 if(pict_type
& PP_PICT_TYPE_QP2
){
1081 const int count
= mbHeight
* absQPStride
;
1082 for(i
=0; i
<(count
>>2); i
++){
1083 ((uint32_t*)c
->stdQPTable
)[i
] = (((uint32_t*)QP_store
)[i
]>>1) & 0x7F7F7F7F;
1085 for(i
<<=2; i
<count
; i
++){
1086 c
->stdQPTable
[i
] = QP_store
[i
]>>1;
1088 QP_store
= c
->stdQPTable
;
1089 QPStride
= absQPStride
;
1094 for(y
=0; y
<mbHeight
; y
++){
1095 for(x
=0; x
<mbWidth
; x
++){
1096 printf("%2d ", QP_store
[x
+ y
*QPStride
]);
1103 if((pict_type
&7)!=3)
1105 if (QPStride
>= 0) {
1107 const int count
= mbHeight
* QPStride
;
1108 for(i
=0; i
<(count
>>2); i
++){
1109 ((uint32_t*)c
->nonBQPTable
)[i
] = ((uint32_t*)QP_store
)[i
] & 0x3F3F3F3F;
1111 for(i
<<=2; i
<count
; i
++){
1112 c
->nonBQPTable
[i
] = QP_store
[i
] & 0x3F;
1116 for(i
=0; i
<mbHeight
; i
++) {
1117 for(j
=0; j
<absQPStride
; j
++) {
1118 c
->nonBQPTable
[i
*absQPStride
+j
] = QP_store
[i
*QPStride
+j
] & 0x3F;
1126 printf("using npp filters 0x%X/0x%X\n", mode
->lumMode
, mode
->chromMode
);
1129 postProcess(src
[0], srcStride
[0], dst
[0], dstStride
[0],
1130 width
, height
, QP_store
, QPStride
, 0, mode
, c
);
1132 width
= (width
)>>c
->hChromaSubSample
;
1133 height
= (height
)>>c
->vChromaSubSample
;
1137 postProcess(src
[1], srcStride
[1], dst
[1], dstStride
[1],
1138 width
, height
, QP_store
, QPStride
, 1, mode
, c
);
1139 postProcess(src
[2], srcStride
[2], dst
[2], dstStride
[2],
1140 width
, height
, QP_store
, QPStride
, 2, mode
, c
);
1142 else if(srcStride
[1] == dstStride
[1] && srcStride
[2] == dstStride
[2])
1144 linecpy(dst
[1], src
[1], height
, srcStride
[1]);
1145 linecpy(dst
[2], src
[2], height
, srcStride
[2]);
1150 for(y
=0; y
<height
; y
++)
1152 memcpy(&(dst
[1][y
*dstStride
[1]]), &(src
[1][y
*srcStride
[1]]), width
);
1153 memcpy(&(dst
[2][y
*dstStride
[2]]), &(src
[2][y
*srcStride
[2]]), width
);