2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test. (c) 2001 Fabrice Bellard.
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/common.h"
37 #include "simple_idct.h"
40 #include "i386/idct_xvid.h"
/**
 * Plain memcpy() wrapper exported under the name fast_memcpy.
 *
 * @param a destination buffer
 * @param b source buffer
 * @param c number of bytes to copy
 * @return a, exactly as memcpy() returns it
 */
void *fast_memcpy(void *a, const void *b, size_t c)
{
    return memcpy(a, b, c);
}
47 /* reference fdct/idct */
48 extern void fdct(DCTELEM
*block
);
49 extern void idct(DCTELEM
*block
);
50 extern void init_fdct();
52 extern void ff_mmx_idct(DCTELEM
*data
);
53 extern void ff_mmxext_idct(DCTELEM
*data
);
55 extern void odivx_idct_c (short *block
);
58 extern void ff_bfin_idct (DCTELEM
*block
) ;
59 extern void ff_bfin_fdct (DCTELEM
*block
) ;
62 extern void fdct_altivec (DCTELEM
*block
);
63 //extern void idct_altivec (DCTELEM *block);?? no routine
/* Descriptor fields for one table entry: transform direction, routine under
 * test, reference routine, and the coefficient format tag that dct_error()
 * receives as its form argument. */
68 enum { FDCT
, IDCT
} is_idct
;
/* Routine under test. */
69 void (* func
) (DCTELEM
*block
);
/* Reference routine whose output the tested routine is compared against. */
70 void (* ref
) (DCTELEM
*block
);
/* Input permutation / output scaling expected by func. */
71 enum formattag
{ NO_PERM
,MMX_PERM
, MMX_SIMPLE_PERM
, SCALE_PERM
, SSE2_PERM
} format
;
/* FAAN_SCALE is the format tag given to the "FAAN" table entry.
 * NOTE(review): the two definitions are presumably alternatives keyed on
 * FAAN_POSTSCALE - confirm against the matching #else/#endif. */
75 #ifndef FAAN_POSTSCALE
76 #define FAAN_SCALE SCALE_PERM
78 #define FAAN_SCALE NO_PERM
/* Table of transforms to exercise. Each initializer lists, in order: display
 * name, is_idct (0 = forward DCT, 1 = inverse), routine under test, reference
 * routine, format tag, and - where given - the CPU flag(s) that main()
 * requires in cpu_flags before running the entry. main() walks the table
 * until it reaches an entry whose name is null. */
83 struct algo algos
[] = {
84 {"REF-DBL", 0, fdct
, fdct
, NO_PERM
},
85 {"FAAN", 0, ff_faandct
, fdct
, FAAN_SCALE
},
86 {"FAANI", 1, ff_faanidct
, idct
, NO_PERM
},
87 {"IJG-AAN-INT", 0, fdct_ifast
, fdct
, SCALE_PERM
},
88 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow
, fdct
, NO_PERM
},
89 {"REF-DBL", 1, idct
, idct
, NO_PERM
},
90 {"INT", 1, j_rev_dct
, idct
, MMX_PERM
},
91 {"SIMPLE-C", 1, ff_simple_idct
, idct
, NO_PERM
},
94 {"MMX", 0, ff_fdct_mmx
, fdct
, NO_PERM
, MM_MMX
},
96 {"MMX2", 0, ff_fdct_mmx2
, fdct
, NO_PERM
, MM_MMXEXT
},
100 {"LIBMPEG2-MMX", 1, ff_mmx_idct
, idct
, MMX_PERM
, MM_MMX
},
101 {"LIBMPEG2-MMXEXT", 1, ff_mmxext_idct
, idct
, MMX_PERM
, MM_MMXEXT
},
103 {"SIMPLE-MMX", 1, ff_simple_idct_mmx
, idct
, MMX_SIMPLE_PERM
, MM_MMX
},
104 {"XVID-MMX", 1, ff_idct_xvid_mmx
, idct
, NO_PERM
, MM_MMX
},
105 {"XVID-MMX2", 1, ff_idct_xvid_mmx2
, idct
, NO_PERM
, MM_MMXEXT
},
106 {"XVID-SSE2", 1, ff_idct_xvid_sse2
, idct
, SSE2_PERM
, MM_SSE2
},
110 {"altivecfdct", 0, fdct_altivec
, fdct
, NO_PERM
, MM_ALTIVEC
},
114 {"BFINfdct", 0, ff_bfin_fdct
, fdct
, NO_PERM
},
115 {"BFINidct", 1, ff_bfin_idct
, idct
, NO_PERM
},
/* Right shift applied in dct_error() after a SCALE_PERM output has been
 * multiplied by its per-coefficient scale. */
121 #define AANSCALE_BITS 12
/* Per-coefficient factors (16384 == 1 << 14). dct_error() derives the
 * SCALE_PERM scale as 8*(1 << (AANSCALE_BITS + 11)) / aanscales[i]. */
122 static const unsigned short aanscales
[64] = {
123 /* precomputed values scaled up by 14 bits */
124 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
125 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
126 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
127 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
128 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
129 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
130 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
131 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
/* Lookup table filled by main(): entries MAX_NEG_CROP .. MAX_NEG_CROP+255
 * hold 0..255 and the MAX_NEG_CROP entries above them hold 255. */
134 uint8_t cropTbl
[256 + 2 * MAX_NEG_CROP
];
/* Returns the current gettimeofday() time expressed in microseconds. */
136 int64_t gettime(void)
139 gettimeofday(&tv
,NULL
);
/* tv_sec is widened to int64_t before the multiply so the product is
 * computed in 64 bits. */
140 return (int64_t)tv
.tv_sec
* 1000000 + tv
.tv_usec
;
/* Number of transform calls per batch in the timing loops of dct_error()
 * and idct248_error(). */
144 #define NB_ITS_SPEED 50000
/* Input permutation for MMX_PERM routines; filled by idct_mmx_init(). */
146 static short idct_mmx_perm
[64];
/* Input permutation for MMX_SIMPLE_PERM routines: dct_error() stores source
 * coefficient i at index idct_simple_mmx_perm[i]. */
148 static short idct_simple_mmx_perm
[64]={
149 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
150 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
151 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
152 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
153 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
154 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
155 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
156 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Permutation of the low three index bits used for SSE2_PERM routines; the
 * upper bits (i & 0x38) are kept unchanged by dct_error(). */
159 static const uint8_t idct_sse2_row_perm
[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Fills idct_mmx_perm[] for all 64 coefficient indices. */
161 void idct_mmx_init(void)
165 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
166 for (i
= 0; i
< 64; i
++) {
/* Bits 3-5 of the index are kept; within the low three bits, bits 1-2 move
 * down by one position and bit 0 moves up to bit 2. */
167 idct_mmx_perm
[i
] = (i
& 0x38) | ((i
& 6) >> 1) | ((i
& 1) << 2);
168 // idct_simple_mmx_perm[i] = simple_block_permute_op(i);
/* Shared 64-coefficient buffers used by dct_error() and idct248_error().
 * block (16-byte aligned) receives the permuted input and is later compared
 * with block1; block1 holds the generated input; block_org keeps a copy of
 * that input. */
172 static DCTELEM block
[64] __attribute__ ((aligned (16)));
173 static DCTELEM block1
[64] __attribute__ ((aligned (8)));
174 static DCTELEM block_org
[64] __attribute__ ((aligned (8)));
/* Executes the emms instruction, but only when cpu_flags has MM_MMX set. */
176 static inline void mmx_emms(void)
179 if (cpu_flags
& MM_MMX
)
180 asm volatile ("emms\n\t");
/*
 * Measures the accuracy and then the throughput of one transform and prints
 * one report line for each.
 *
 * name      - label printed in both report lines
 * is_idct   - non-zero prints "IDCT", zero prints "DCT"
 * fdct_func - routine under test
 * fdct_ref  - reference routine
 * form      - format tag (NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
 *             SSE2_PERM) selecting the input permutation and whether the
 *             output is rescaled before comparison
 * test      - presumably selects among the input generators below - confirm
 *             against the dispatch code
 */
184 void dct_error(const char *name
, int is_idct
,
185 void (*fdct_func
)(DCTELEM
*block
),
186 void (*fdct_ref
)(DCTELEM
*block
), int form
, int test
)
190 int64_t err2
, ti
, ti1
, it1
;
191 int64_t sysErr
[64], sysErrMax
=0;
193 int blockSumErrMax
=0, blockSumErr
;
/* Clear the per-coefficient signed error accumulators. */
199 for(i
=0; i
<64; i
++) sysErr
[i
]=0;
200 for(it
=0;it
<NB_ITS
;it
++) {
/* Input generator: value in [-256, 255]. */
206 block1
[i
] = (random() % 512) -256;
/* Input generator: num is in 1..10; a randomly chosen coefficient gets a
 * value in [-256, 255]. */
215 int num
= (random()%10)+1;
217 block1
[random()%64] = (random() % 512) -256;
/* Input generator: first coefficient in [-2048, 2047]; the last coefficient
 * is the inverse of the first one's low bit. */
220 block1
[0]= (random()%4096)-2048;
221 block1
[63]= (block1
[0]&1)^1;
225 #if 0 // simulate mismatch control
230 if((sum
&1)==0) block1
[63]^=1;
/* Keep a copy of the generated input. */
235 block_org
[i
]= block1
[i
];
/* Copy the input into block using the layout the tested routine expects. */
237 if (form
== MMX_PERM
) {
239 block
[idct_mmx_perm
[i
]] = block1
[i
];
240 } else if (form
== MMX_SIMPLE_PERM
) {
242 block
[idct_simple_mmx_perm
[i
]] = block1
[i
];
244 } else if (form
== SSE2_PERM
) {
246 block
[(i
&0x38) | idct_sse2_row_perm
[i
&7]] = block1
[i
];
251 #if 0 // simulate mismatch control for tested IDCT but not the ref
256 if((sum
&1)==0) block
[63]^=1;
/* SCALE_PERM outputs are rescaled per coefficient before comparison. */
263 if (form
== SCALE_PERM
) {
264 for(i
=0; i
<64; i
++) {
265 scale
= 8*(1 << (AANSCALE_BITS
+ 11)) / aanscales
[i
];
266 block
[i
] = (block
[i
] * scale
/*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS
;
/* Per-coefficient comparison of block against block1. */
274 v
= abs(block
[i
] - block1
[i
]);
/* Signed error is accumulated per coefficient across iterations. */
278 sysErr
[i
] += block
[i
] - block1
[i
];
/* Track the largest output magnitude seen. */
280 if( abs(block
[i
])>maxout
) maxout
=abs(block
[i
]);
282 if(blockSumErrMax
< blockSumErr
) blockSumErrMax
= blockSumErr
;
283 #if 0 // print different matrix pairs
287 if((i
&7)==0) printf("\n");
288 printf("%4d ", block_org
[i
]);
291 if((i
&7)==0) printf("\n");
292 printf("%4d ", block
[i
] - block1
[i
]);
/* Largest absolute accumulated signed error over the 64 coefficients. */
297 for(i
=0; i
<64; i
++) sysErrMax
= FFMAX(sysErrMax
, FFABS(sysErr
[i
]));
299 #if 1 // dump systematic errors
301 if(i
%8==0) printf("\n");
302 printf("%5d ", (int)sysErr
[i
]);
/* Accuracy report: err2 is averaged over NB_ITS * 64 samples and sysErrMax
 * over NB_ITS iterations. */
307 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
308 is_idct
? "IDCT" : "DCT",
309 name
, err_inf
, (double)err2
/ NB_ITS
/ 64.0, (double)sysErrMax
/ NB_ITS
, maxout
, blockSumErrMax
);
/* Input setup for the timing pass. */
317 block1
[i
] = (random() % 512) -256;
327 block1
[0] = (random() % 512) -256;
328 block1
[1] = (random() % 512) -256;
329 block1
[2] = (random() % 512) -256;
330 block1
[3] = (random() % 512) -256;
/* NOTE(review): unlike the accuracy pass, no SSE2_PERM branch is visible
 * here - confirm whether that is intended. */
334 if (form
== MMX_PERM
) {
336 block
[idct_mmx_perm
[i
]] = block1
[i
];
337 } else if(form
== MMX_SIMPLE_PERM
) {
339 block
[idct_simple_mmx_perm
[i
]] = block1
[i
];
/* Timing: batches of NB_ITS_SPEED iterations are repeated until at least
 * 1000000 microseconds have elapsed. */
348 for(it
=0;it
<NB_ITS_SPEED
;it
++) {
351 // memcpy(block, block1, sizeof(DCTELEM) * 64);
352 // do not memcpy especially not fastmemcpy because it does movntq !!!
356 ti1
= gettime() - ti
;
357 } while (ti1
< 1000000);
/* ti1 is in microseconds, so it1 * 1000 / ti1 is thousands of calls per
 * second. */
360 printf("%s %s: %0.1f kdct/s\n",
361 is_idct
? "IDCT" : "DCT",
362 name
, (double)it1
* 1000.0 / (double)ti1
);
/* 8x8 output images used by idct248_error(): img_dest receives the output of
 * the routine under test, img_dest1 the output of idct248_ref(). */
366 static uint8_t img_dest
[64] __attribute__ ((aligned (8)));
367 static uint8_t img_dest1
[64] __attribute__ ((aligned (8)));
/*
 * Double-precision reference for the IDCT248 test.
 *
 * dest     - output pixels, written at dest[i * linesize + j]
 * linesize - distance between output rows, in bytes
 * block    - 64 input coefficients
 *
 * The local arrays block1/block2/block3 are doubles and shadow the
 * file-scope block1 inside this function.
 */
369 void idct248_ref(uint8_t *dest
, int linesize
, int16_t *block
)
372 static double c8
[8][8];
373 static double c4
[4][4];
374 double block1
[64], block2
[64], block3
[64];
/* 8-point cosine table: row 0 is scaled by sqrt(1/8), other rows by
 * sqrt(1/4); sum accumulates the squared entries. */
384 s
= (i
==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
385 c8
[i
][j
] = s
* cos(M_PI
* i
* (j
+ 0.5) / 8.0);
386 sum
+= c8
[i
][j
] * c8
[i
][j
];
/* 4-point cosine table: row 0 is scaled by sqrt(1/4), other rows by
 * sqrt(1/2). */
393 s
= (i
==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
394 c4
[i
][j
] = s
* cos(M_PI
* i
* (j
+ 0.5) / 4.0);
395 sum
+= c4
[i
][j
] * c4
[i
][j
];
/* Butterfly: rows 2i and 2i+1 are replaced by their scaled sum and
 * difference. */
404 block1
[8*(2*i
)+j
] = (block
[8*(2*i
)+j
] + block
[8*(2*i
+1)+j
]) * s
;
405 block1
[8*(2*i
+1)+j
] = (block
[8*(2*i
)+j
] - block
[8*(2*i
+1)+j
]) * s
;
/* 8-point pass along each row of block1. */
414 sum
+= c8
[k
][j
] * block1
[8*i
+k
];
/* 4-point pass down column i over the even rows of block2 ... */
425 sum
+= c4
[k
][j
] * block2
[8*(2*k
)+i
];
426 block3
[8*(2*j
)+i
] = sum
;
/* ... and over the odd rows. */
431 sum
+= c4
[k
][j
] * block2
[8*(2*k
+1)+i
];
432 block3
[8*(2*j
+1)+i
] = sum
;
436 /* clamp and store the result */
444 dest
[i
* linesize
+ j
] = (int)rint(v
);
/*
 * Checks an IDCT248 "put" routine against idct248_ref() and then times it.
 *
 * name        - label printed in both report lines
 * idct248_put - routine under test; called as idct248_put(img_dest, 8, block)
 *
 * NOTE(review): ti and ti1 are plain int here although gettime() returns
 * int64_t and dct_error() keeps the same quantities in int64_t - confirm
 * whether the narrowing is intended.
 */
449 void idct248_error(const char *name
,
450 void (*idct248_put
)(uint8_t *dest
, int line_size
, int16_t *block
))
452 int it
, i
, it1
, ti
, ti1
, err_max
, v
;
456 /* just one test to see if code is correct (precision is less
459 for(it
=0;it
<NB_ITS
;it
++) {
461 /* XXX: use forward transform to generate values */
/* Input coefficients in [-128, 127]. */
463 block1
[i
] = (random() % 256) - 128;
/* Reference output goes to img_dest1, tested output to img_dest. */
468 idct248_ref(img_dest1
, 8, block
);
472 idct248_put(img_dest
, 8, block
);
/* Absolute per-pixel difference between the two outputs. */
475 v
= abs((int)img_dest
[i
] - (int)img_dest1
[i
]);
477 printf("%d %d\n", img_dest
[i
], img_dest1
[i
]);
486 printf(" %3d", img_dest1
[i
*8+j
]);
495 printf(" %3d", img_dest
[i
*8+j
]);
501 printf("%s %s: err_inf=%d\n",
502 1 ? "IDCT248" : "DCT248",
/* Timing: batches of NB_ITS_SPEED calls are repeated until ti1 reaches
 * 1000000. */
508 for(it
=0;it
<NB_ITS_SPEED
;it
++) {
511 // memcpy(block, block1, sizeof(DCTELEM) * 64);
512 // do not memcpy especially not fastmemcpy because it does movntq !!!
513 idct248_put(img_dest
, 8, block
);
516 ti1
= gettime() - ti
;
517 } while (ti1
< 1000000);
520 printf("%s %s: %0.1f kdct/s\n",
521 1 ? "IDCT248" : "DCT248",
522 name
, (double)it1
* 1000.0 / (double)ti1
);
/* Command-line usage summary.
 * NOTE(review): main() passes "ih4" to getopt(), but -h is not listed in
 * this text - confirm whether it should be. */
527 printf("dct-test [-i] [<test-number>]\n"
528 "test-number 0 -> test with random matrixes\n"
529 " 1 -> test with random sparse matrixes\n"
530 " 2 -> do 3. test from mpeg4 std\n"
531 "-i test IDCT implementations\n"
532 "-4 test IDCT248 implementations\n");
/*
 * Test driver: initialises cpu_flags and cropTbl, parses the command line
 * with getopt() (option string "ih4"), then runs idct248_error() and/or
 * dct_error() over the algos table.
 */
535 int main(int argc
, char **argv
)
537 int test_idct
= 0, test_248_dct
= 0;
540 cpu_flags
= mm_support();
/* Entries MAX_NEG_CROP .. MAX_NEG_CROP+255 of cropTbl map to 0..255. */
545 for(i
=0;i
<256;i
++) cropTbl
[i
+ MAX_NEG_CROP
] = i
;
546 for(i
=0;i
<MAX_NEG_CROP
;i
++) {
/* Entries above the identity range are set to 255. */
548 cropTbl
[i
+ MAX_NEG_CROP
+ 256] = 255;
552 c
= getopt(argc
, argv
, "ih4");
/* First non-option argument is the test number.
 * NOTE(review): atoi() reports no error on non-numeric input. */
569 if(optind
<argc
) test
= atoi(argv
[optind
]);
571 printf("ffmpeg DCT/IDCT test\n");
574 idct248_error("SIMPLE-C", ff_simple_idct248_put
);
/* Walk the table until an entry with a null name. An entry runs only when
 * its direction matches test_idct and every bit in its mm_support is also
 * set in cpu_flags. */
576 for (i
=0;algos
[i
].name
;i
++)
577 if (algos
[i
].is_idct
== test_idct
&& !(~cpu_flags
& algos
[i
].mm_support
)) {
578 dct_error (algos
[i
].name
, algos
[i
].is_idct
, algos
[i
].func
, algos
[i
].ref
, algos
[i
].format
, test
);