tools/checkasm.c

   1 /*****************************************************************************
   2  * checkasm.c: assembly check tool
   3  *****************************************************************************
   4  * Copyright (C) 2003-2019 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Laurent Aimar <fenrir@via.ecp.fr>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *
  24  * This program is also available under a commercial proprietary license.
  25  * For more information, contact us at licensing@x264.com.
  26  *****************************************************************************/
  27
  28 #include <ctype.h>
  29 #include "common/common.h"
  30 #include "encoder/macroblock.h"
  31
  32 #ifdef _WIN32
  33 #include <windows.h>
  34 #endif
  35
  36 // GCC doesn't align stack variables on ARM, so on ARM redefine ALIGNED_16 to
     // declare the variable `static` (placing it in .bss) instead of on the stack.
     // Consequence: every ALIGNED_16 variable declared after this point has static
     // storage duration on ARM, i.e. it keeps its contents between calls.
  37 #if ARCH_ARM
  38 #undef ALIGNED_16
  39 #define ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
  40 #endif
  41
  42 /* buf1, buf2: initialised to random data; tests read from them and must not write into them */
  43 static uint8_t *buf1, *buf2;
  44 /* buf3, buf4: scratch buffers that tests use to store output */
  45 static uint8_t *buf3, *buf4;
  46 /* pbuf1, pbuf2: initialised to random pixel data; tests must not write into them */
  47 static pixel *pbuf1, *pbuf2;
  48 /* pbuf3, pbuf4: point to buf3, buf4, just for type convenience */
  49 static pixel *pbuf3, *pbuf4;
  50
  51 static int quiet = 0; // when nonzero, report() does not print its per-test status line
  52
  53 #define report( name ) { \
         /* Summarise one group of checks. Uses the caller's locals: reads `ok` and */ \
         /* `used_asm`, writes `ret`. The " - name [OK|FAILED]" line goes to stderr  */ \
         /* only if an asm version was exercised (used_asm) and quiet is off.        */ \
  54     if( used_asm && !quiet ) \
  55         fprintf( stderr, " - %-21s [%s]\n", name, ok ? "OK" : "FAILED" ); \
         /* A failure is recorded in ret even when nothing is printed. */ \
  56     if( !ok ) ret = -1; \
  57 }
  58
  59 #define BENCH_RUNS 2000 // timed iterations per function in call_bench (a quarter of this when cpu == 0); tradeoff between accuracy and speed
  60 #define MAX_FUNCS 1000  // capacity of benchs[]; just has to be big enough to hold all the existing functions
  61 #define MAX_CPUS 30     // capacity of bench_func_t.vers[]; number of different combinations of cpu flags
  62
  63 typedef struct
  64 {
     // One timing record: a single implementation of a function, keyed by cpu flags.
  65     void *pointer; // address of the benchmarked function; just for detecting duplicates
  66     uint32_t cpu; // cpu flags this record belongs to (get_bench() uses slot 0 for cpu == 0)
  67     uint64_t cycles; // sum of the read_time() deltas of all samples accepted by call_bench
  68     uint32_t den; // number of accepted samples; print_bench() divides cycles by it
  69 } bench_t;
  70
  71 typedef struct
  72 {
     // All timing records collected for one function name.
  73     char *name; // heap copy of the name (strdup in get_bench()); NULL marks an unused entry
  74     bench_t vers[MAX_CPUS]; // [0] is the cpu == 0 record, the rest are matched by cpu flags
  75 } bench_func_t;
  76
  77 static int do_bench = 0; // nonzero enables the timing code in call_bench
  78 static int bench_pattern_len = 0; // number of leading characters of bench_pattern that must match
  79 static const char *bench_pattern = ""; // only functions whose name starts with this prefix are benchmarked
  80 static char func_name[100]; // name of the function currently under test, written by set_func_name()
  81 static bench_func_t benchs[MAX_FUNCS]; // benchmark table, filled by get_bench() and printed by print_bench()
  82
  83 static const char *pixel_names[12] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x16", "4x2", "2x8", "2x4", "2x2" }; // block-size suffixes, indexed like the per-size function tables (e.g. pixel_names[PIXEL_16x16])
     // Name suffixes for the intra prediction function tables (their users are outside this part of the file).
  84 static const char *intra_predict_16x16_names[7] = { "v", "h", "dc", "p", "dcl", "dct", "dc8" };
  85 static const char *intra_predict_8x8c_names[7] = { "dc", "h", "v", "p", "dcl", "dct", "dc8" };
  86 static const char *intra_predict_4x4_names[12] = { "v", "h", "dc", "ddl", "ddr", "vr", "hd", "vl", "hu", "dcl", "dct", "dc8" };
     // 8x8 reuses the 4x4 name table and 8x16c reuses the 8x8c one.
  87 static const char **intra_predict_8x8_names = intra_predict_4x4_names;
  88 static const char **intra_predict_8x16c_names = intra_predict_8x8c_names;
  89
  90 #define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ ) // printf-style; writes the current test's name into func_name, truncated to fit the buffer
  91
  92 static inline uint32_t read_time(void)
  93 {
  94     uint32_t a = 0;
  95 #if HAVE_X86_INLINE_ASM
  96     asm volatile( "lfence \n"
  97                   "rdtsc  \n"
  98                   : "=a"(a) :: "edx", "memory" );
  99 #elif ARCH_PPC
 100     asm volatile( "mftb %0" : "=r"(a) :: "memory" );
 101 #elif HAVE_ARM_INLINE_ASM    // ARMv7 only
 102     asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" );
 103 #elif ARCH_AARCH64
 104     uint64_t b = 0;
 105     asm volatile( "mrs %0, pmccntr_el0" : "=r"(b) :: "memory" );
 106     a = b;
 107 #elif ARCH_MIPS
 108     asm volatile( "rdhwr %0, $2" : "=r"(a) :: "memory" );
 109 #endif
 110     return a;
 111 }
 112
 113 static bench_t* get_bench( const char *name, int cpu )
 114 {
 115     int i, j;
 116     for( i = 0; benchs[i].name && strcmp(name, benchs[i].name); i++ )
 117         assert( i < MAX_FUNCS );
 118     if( !benchs[i].name )
 119         benchs[i].name = strdup( name );
 120     if( !cpu )
 121         return &benchs[i].vers[0];
 122     for( j = 1; benchs[i].vers[j].cpu && benchs[i].vers[j].cpu != cpu; j++ )
 123         assert( j < MAX_CPUS );
 124     benchs[i].vers[j].cpu = cpu;
 125     return &benchs[i].vers[j];
 126 }
 127
 128 static int cmp_nop( const void *a, const void *b )
 129 {
 130     return *(uint16_t*)a - *(uint16_t*)b;
 131 }
 132
 133 static int cmp_bench( const void *a, const void *b )
 134 {
 135     // asciibetical sort except preserving numbers
 136     const char *sa = ((bench_func_t*)a)->name;
 137     const char *sb = ((bench_func_t*)b)->name;
 138     for( ;; sa++, sb++ )
 139     {
 140         if( !*sa && !*sb )
 141             return 0;
 142         if( isdigit( *sa ) && isdigit( *sb ) && isdigit( sa[1] ) != isdigit( sb[1] ) )
 143             return isdigit( sa[1] ) - isdigit( sb[1] );
 144         if( *sa != *sb )
 145             return *sa - *sb;
 146     }
 147 }
 148
 149 static void print_bench(void)
 150 {
 151     uint16_t nops[10000];
 152     int nfuncs, nop_time=0;
 153
 154     for( int i = 0; i < 10000; i++ )
 155     {
 156         uint32_t t = read_time();
 157         nops[i] = read_time() - t;
 158     }
 159     qsort( nops, 10000, sizeof(uint16_t), cmp_nop );
 160     for( int i = 500; i < 9500; i++ )
 161         nop_time += nops[i];
 162     nop_time /= 900;
 163     printf( "nop: %d\n", nop_time );
 164
 165     for( nfuncs = 0; nfuncs < MAX_FUNCS && benchs[nfuncs].name; nfuncs++ );
 166     qsort( benchs, nfuncs, sizeof(bench_func_t), cmp_bench );
 167     for( int i = 0; i < nfuncs; i++ )
 168         for( int j = 0; j < MAX_CPUS && (!j || benchs[i].vers[j].cpu); j++ )
 169         {
 170             int k;
 171             bench_t *b = &benchs[i].vers[j];
 172             if( !b->den )
 173                 continue;
 174             for( k = 0; k < j && benchs[i].vers[k].pointer != b->pointer; k++ );
 175             if( k < j )
 176                 continue;
 177             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
 178 #if HAVE_MMX
 179                     b->cpu&X264_CPU_AVX512 ? "avx512" :
 180                     b->cpu&X264_CPU_AVX2 ? "avx2" :
 181                     b->cpu&X264_CPU_BMI2 ? "bmi2" :
 182                     b->cpu&X264_CPU_BMI1 ? "bmi1" :
 183                     b->cpu&X264_CPU_FMA3 ? "fma3" :
 184                     b->cpu&X264_CPU_FMA4 ? "fma4" :
 185                     b->cpu&X264_CPU_XOP ? "xop" :
 186                     b->cpu&X264_CPU_AVX ? "avx" :
 187                     b->cpu&X264_CPU_SSE42 ? "sse42" :
 188                     b->cpu&X264_CPU_SSE4 ? "sse4" :
 189                     b->cpu&X264_CPU_SSSE3 ? "ssse3" :
 190                     b->cpu&X264_CPU_SSE3 ? "sse3" :
 191                     b->cpu&X264_CPU_LZCNT ? "lzcnt" :
 192                     /* print sse2slow only if there's also a sse2fast version of the same func */
 193                     b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
 194                     b->cpu&X264_CPU_SSE2 ? "sse2" :
 195                     b->cpu&X264_CPU_SSE ? "sse" :
 196                     b->cpu&X264_CPU_MMX ? "mmx" :
 197 #elif ARCH_PPC
 198                     b->cpu&X264_CPU_ALTIVEC ? "altivec" :
 199 #elif ARCH_ARM
 200                     b->cpu&X264_CPU_NEON ? "neon" :
 201                     b->cpu&X264_CPU_ARMV6 ? "armv6" :
 202 #elif ARCH_AARCH64
 203                     b->cpu&X264_CPU_NEON ? "neon" :
 204                     b->cpu&X264_CPU_ARMV8 ? "armv8" :
 205 #elif ARCH_MIPS
 206                     b->cpu&X264_CPU_MSA ? "msa" :
 207 #endif
 208                     "c",
 209 #if HAVE_MMX
 210                     b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
 211                     b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" :
 212                     b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
 213                     b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" :
 214                     b->cpu&X264_CPU_LZCNT && b->cpu&X264_CPU_SSE3 && !(b->cpu&X264_CPU_BMI1) ? "_lzcnt" :
 215                     b->cpu&X264_CPU_SLOW_ATOM ? "_atom" :
 216 #elif ARCH_ARM
 217                     b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
 218 #endif
 219                     "",
 220                     (int64_t)(10*b->cycles/b->den - nop_time)/4 );
 221         }
 222 }
 223
 224 /* YMM and ZMM registers on x86 are turned off to save power when they haven't been
 225  * used for some period of time. When they are used again there is a "warmup" period
 226  * during which performance is reduced and inconsistent, which is problematic when
 227  * trying to benchmark individual functions. We can work around this by periodically
 228  * issuing "dummy" instructions that use those registers to keep them powered on. */
 229 static void (*simd_warmup_func)( void ) = NULL;
     /* Calls the installed warmup routine, if any; does nothing while simd_warmup_func is NULL. */
 230 #define simd_warmup() do { if( simd_warmup_func ) simd_warmup_func(); } while( 0 )
 231
 232 #if ARCH_X86 || ARCH_X86_64
     /* External helpers, only declared here (presumably implemented in the checkasm
      * assembly sources - confirm against the build). */
 233 int x264_stack_pagealign( int (*func)(), int align );
 234 void x264_checkasm_warmup_avx( void );
 235 void x264_checkasm_warmup_avx512( void );
 236
 237 /* Detect when callee-saved registers aren't saved by the called function.
 238  * This needs an explicit asm check because such a bug only sometimes crashes in normal use.
      * func is called with the trailing variadic arguments; the verdict is reported through *ok. */
 239 intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
 240 #else
     /* No page-alignment helper on other architectures: call func() directly, align is ignored. */
 241 #define x264_stack_pagealign( func, align ) func()
 242 #endif
 243
 244 #if ARCH_AARCH64
 245 intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
 246 #endif
 247
 248 #if ARCH_ARM
     /* On ARM x264_checkasm_call is a function pointer rather than a function; it starts
      * out as the non-NEON wrapper. */
 249 intptr_t x264_checkasm_call_neon( intptr_t (*func)(), int *ok, ... );
 250 intptr_t x264_checkasm_call_noneon( intptr_t (*func)(), int *ok, ... );
 251 intptr_t (*x264_checkasm_call)( intptr_t (*func)(), int *ok, ... ) = x264_checkasm_call_noneon;
 252 #endif
 253
 254 #define call_c1(func,...) func(__VA_ARGS__) // plain direct call, no ABI checking
 255
 256 #if ARCH_X86_64
 257 /* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
 258  * This is done by clobbering the stack with junk around the stack pointer and calling the
 259  * assembly function through x264_checkasm_call with added dummy arguments, which forces all
 260  * real arguments to be passed on the stack and not in registers. For 32-bit arguments the
 261  * upper half of the 64-bit register location on the stack will now contain junk. Note that
 262  * this is dependent on compiler behaviour and that interrupts etc. at the wrong time may
 263  * overwrite the junk written to the stack, so there's no guarantee that it will always
 264  * detect all functions that assume zero-extension.
 265  */
 266 void x264_checkasm_stack_clobber( uint64_t clobber, ... );
     /* call_a1: checked call of an asm function. Requires an `int ok` in the caller's scope
      * (passed by address). r is a random 16-bit value replicated into all four 16-bit lanes
      * of a uint64_t; the four 0s are the dummy arguments described above. The value of the
      * statement expression is the intptr_t returned by x264_checkasm_call. */
 267 #define call_a1(func,...) ({ \
 268     uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
 269     x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
 270     simd_warmup(); \
 271     x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
 272 #elif ARCH_AARCH64 && !defined(__APPLE__)
 273 void x264_checkasm_stack_clobber( uint64_t clobber, ... );
     /* Same scheme as on x86-64, but with six dummy arguments and no simd_warmup(). */
 274 #define call_a1(func,...) ({ \
 275     uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
 276     x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+8 */ \
 277     x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, 0, 0, __VA_ARGS__ ); })
 278 #elif ARCH_X86 || ARCH_ARM
     /* Checked call without stack clobbering or dummy arguments. */
 279 #define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ )
 280 #else
     /* No checking wrapper available: fall back to a plain call. */
 281 #define call_a1 call_c1
 282 #endif
 283
     /* call_a1_64: like call_a1 for functions returning a 64-bit value. On ARM the wrapper is
      * called through a pointer type that returns uint64_t; elsewhere call_a1 is used as is. */
 284 #if ARCH_ARM
 285 #define call_a1_64(func,...) ((uint64_t (*)(intptr_t(*)(), int*, ...))x264_checkasm_call)( (intptr_t(*)())func, &ok, __VA_ARGS__ )
 286 #else
 287 #define call_a1_64 call_a1
 288 #endif
 289
 290 #define call_bench(func,cpu,...)\
         /* Benchmark func(args) and add the result to the record for (func_name, cpu). */\
         /* Does nothing unless do_bench is set and func_name starts with bench_pattern. */\
         /* NOTE: expands to a bare if-statement, so it needs the caller's `ok` (used by */\
         /* call_a1) in scope and must not be followed by an `else`. */\
 291     if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\
 292     {\
 293         uint64_t tsum = 0;\
 294         int tcount = 0;\
             /* one untimed call through the checking wrapper before measuring */\
 295         call_a1(func, __VA_ARGS__);\
 296         for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\
 297         {\
 298             simd_warmup();\
 299             uint32_t t = read_time();\
                 /* each sample covers four direct back-to-back calls */\
 300             func(__VA_ARGS__);\
 301             func(__VA_ARGS__);\
 302             func(__VA_ARGS__);\
 303             func(__VA_ARGS__);\
 304             t = read_time() - t;\
                 /* outlier rejection: always discard the first iteration, and keep a sample */\
                 /* only if it is at most 4x the mean of the samples kept so far */\
 305             if( (uint64_t)t*tcount <= tsum*4 && ti > 0 )\
 306             {\
 307                 tsum += t;\
 308                 tcount++;\
 309             }\
 310         }\
             /* accumulate, so repeated runs of the same function are averaged together */\
 311         bench_t *b = get_bench( func_name, cpu );\
 312         b->cycles += tsum;\
 313         b->den += tcount;\
 314         b->pointer = func;\
 315     }
 316
 317 /* For most functions, run benchmark and correctness test at the same time.
 318  * For those that modify their inputs, run the above macros separately.
      *   call_a / call_c   : benchmark (call_a2 / call_c2), then one call via call_a1 / call_c1;
      *                       the statement expression evaluates to that last call's result.
      *   call_a2 / call_c2 : benchmark only; call_a2 files the result under the caller's local
      *                       `cpu_new`, call_c2 under cpu 0.
      *   call_a64          : like call_a, but the final call goes through call_a1_64. */
 319 #define call_a(func,...) ({ call_a2(func,__VA_ARGS__); call_a1(func,__VA_ARGS__); })
 320 #define call_c(func,...) ({ call_c2(func,__VA_ARGS__); call_c1(func,__VA_ARGS__); })
 321 #define call_a2(func,...) ({ call_bench(func,cpu_new,__VA_ARGS__); })
 322 #define call_c2(func,...) ({ call_bench(func,0,__VA_ARGS__); })
 323 #define call_a64(func,...) ({ call_a2(func,__VA_ARGS__); call_a1_64(func,__VA_ARGS__); })
 324
 325
 326 static int check_pixel( int cpu_ref, int cpu_new )
 327 {
 328     x264_pixel_function_t pixel_c;
 329     x264_pixel_function_t pixel_ref;
 330     x264_pixel_function_t pixel_asm;
 331     x264_predict_t predict_4x4[12];
 332     x264_predict8x8_t predict_8x8[12];
 333     x264_predict_8x8_filter_t predict_8x8_filter;
 334     ALIGNED_16( pixel edge[36] );
 335     uint16_t cost_mv[32];
 336     int ret = 0, ok, used_asm;
 337
 338     x264_pixel_init( 0, &pixel_c );
 339     x264_pixel_init( cpu_ref, &pixel_ref );
 340     x264_pixel_init( cpu_new, &pixel_asm );
 341     x264_predict_4x4_init( 0, predict_4x4 );
 342     x264_predict_8x8_init( 0, predict_8x8, &predict_8x8_filter );
 343     predict_8x8_filter( pbuf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
 344
 345     // maximize sum
 346     for( int i = 0; i < 256; i++ )
 347     {
 348         int z = i|(i>>4);
 349         z ^= z>>2;
 350         z ^= z>>1;
 351         pbuf4[i] = -(z&1) & PIXEL_MAX;
 352         pbuf3[i] = ~pbuf4[i] & PIXEL_MAX;
 353     }
 354     // random pattern made of maxed pixel differences, in case an intermediate value overflows
 355     for( int i = 256; i < 0x1000; i++ )
 356     {
 357         pbuf4[i] = -(pbuf1[i&~0x88]&1) & PIXEL_MAX;
 358         pbuf3[i] = ~(pbuf4[i]) & PIXEL_MAX;
 359     }
 360
 361 #define TEST_PIXEL( name, align ) \
 362     ok = 1, used_asm = 0; \
 363     for( int i = 0; i < ARRAY_ELEMS(pixel_c.name); i++ ) \
 364     { \
 365         int res_c, res_asm; \
 366         if( pixel_asm.name[i] != pixel_ref.name[i] ) \
 367         { \
 368             set_func_name( "%s_%s", #name, pixel_names[i] ); \
 369             used_asm = 1; \
 370             for( int j = 0; j < 64; j++ ) \
 371             { \
 372                 intptr_t stride1 = (j&31) == 31 ? 32 : FENC_STRIDE; \
 373                 res_c   = call_c( pixel_c.name[i],   pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
 374                 res_asm = call_a( pixel_asm.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
 375                 if( res_c != res_asm ) \
 376                 { \
 377                     ok = 0; \
 378                     fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
 379                     break; \
 380                 } \
 381             } \
 382             for( int j = 0; j < 0x1000 && ok; j += 256 ) \
 383             { \
 384                 res_c   = pixel_c  .name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \
 385                 res_asm = pixel_asm.name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \
 386                 if( res_c != res_asm ) \
 387                 { \
 388                     ok = 0; \
 389                     fprintf( stderr, #name "[%d]: overflow %d != %d\n", i, res_c, res_asm ); \
 390                 } \
 391             } \
 392         } \
 393     } \
 394     report( "pixel " #name " :" );
 395
 396     TEST_PIXEL( sad, 0 );
 397     TEST_PIXEL( sad_aligned, 1 );
 398     TEST_PIXEL( ssd, 1 );
 399     TEST_PIXEL( satd, 0 );
 400     TEST_PIXEL( sa8d, 1 );
 401
 402     ok = 1, used_asm = 0;
 403     if( pixel_asm.sa8d_satd[PIXEL_16x16] != pixel_ref.sa8d_satd[PIXEL_16x16] )
 404     {
 405         set_func_name( "sa8d_satd_%s", pixel_names[PIXEL_16x16] );
 406         used_asm = 1;
 407         for( int j = 0; j < 64; j++ )
 408         {
 409             uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
 410             uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
 411             uint64_t res_a = call_a64( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 );
 412             uint32_t cost8_a = res_a;
 413             uint32_t cost4_a = res_a >> 32;
 414             if( cost8_a != cost8_c || cost4_a != cost4_c )
 415             {
 416                 ok = 0;
 417                 fprintf( stderr, "sa8d_satd [%d]: (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16,
 418                          cost8_c, cost4_c, cost8_a, cost4_a );
 419                 break;
 420             }
 421         }
 422         for( int j = 0; j < 0x1000 && ok; j += 256 ) \
 423         {
 424             uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
 425             uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
 426             uint64_t res_a = pixel_asm.sa8d_satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
 427             uint32_t cost8_a = res_a;
 428             uint32_t cost4_a = res_a >> 32;
 429             if( cost8_a != cost8_c || cost4_a != cost4_c )
 430             {
 431                 ok = 0;
 432                 fprintf( stderr, "sa8d_satd [%d]: overflow (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16,
 433                          cost8_c, cost4_c, cost8_a, cost4_a );
 434             }
 435         }
 436     }
 437     report( "pixel sa8d_satd :" );
 438
 439 #define TEST_PIXEL_X( N ) \
 440     ok = 1; used_asm = 0; \
 441     for( int i = 0; i < 7; i++ ) \
 442     { \
 443         ALIGNED_16( int res_c[4] ) = {0}; \
 444         ALIGNED_16( int res_asm[4] ) = {0}; \
 445         if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
 446         { \
 447             set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \
 448             used_asm = 1; \
 449             for( int j = 0; j < 64; j++ ) \
 450             { \
 451                 pixel *pix2 = pbuf2+j; \
 452                 res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2,   64 ); \
 453                 res_c[1] = pixel_c.sad[i]( pbuf1, 16, pix2+6, 64 ); \
 454                 res_c[2] = pixel_c.sad[i]( pbuf1, 16, pix2+1, 64 ); \
 455                 if( N == 4 ) \
 456                 { \
 457                     res_c[3] = pixel_c.sad[i]( pbuf1, 16, pix2+10, 64 ); \
 458                     call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
 459                 } \
 460                 else \
 461                     call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
 462                 if( memcmp(res_c, res_asm, N*sizeof(int)) ) \
 463                 { \
 464                     ok = 0; \
 465                     fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
 466                              i, res_c[0], res_c[1], res_c[2], res_c[3], \
 467                              res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
 468                 } \
 469                 if( N == 4 ) \
 470                     call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
 471                 else \
 472                     call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
 473             } \
 474         } \
 475     } \
 476     report( "pixel sad_x"#N" :" );
 477
 478     TEST_PIXEL_X(3);
 479     TEST_PIXEL_X(4);
 480
 481 #define TEST_PIXEL_VAR( i ) \
 482     if( pixel_asm.var[i] != pixel_ref.var[i] ) \
 483     { \
 484         set_func_name( "%s_%s", "var", pixel_names[i] ); \
 485         used_asm = 1; \
 486         /* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
 487         call_c1( pixel_c.var[i],   pbuf1,           16 ); \
 488         call_a1( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
 489         uint64_t res_c   = pixel_c.var[i]( pbuf1, 16 ); \
 490         uint64_t res_asm = pixel_asm.var[i]( pbuf1, 16 ); \
 491         if( res_c != res_asm ) \
 492         { \
 493             ok = 0; \
 494             fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
 495         } \
 496         call_c2( pixel_c.var[i],   pbuf1, (intptr_t)16 ); \
 497         call_a2( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
 498     }
 499
 500     ok = 1; used_asm = 0;
 501     TEST_PIXEL_VAR( PIXEL_16x16 );
 502     TEST_PIXEL_VAR( PIXEL_8x16 );
 503     TEST_PIXEL_VAR( PIXEL_8x8 );
 504     report( "pixel var :" );
 505
 506 #define TEST_PIXEL_VAR2( i ) \
 507     if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \
 508     { \
 509         int res_c, res_asm; \
 510         ALIGNED_ARRAY_8( int, ssd_c,  [2] ); \
 511         ALIGNED_ARRAY_8( int, ssd_asm,[2] ); \
 512         set_func_name( "%s_%s", "var2", pixel_names[i] ); \
 513         used_asm = 1; \
 514         res_c   = call_c( pixel_c.var2[i],   pbuf1, pbuf2, ssd_c   ); \
 515         res_asm = call_a( pixel_asm.var2[i], pbuf1, pbuf2, ssd_asm ); \
 516         if( res_c != res_asm || memcmp( ssd_c, ssd_asm, 2*sizeof(int) ) ) \
 517         { \
 518             ok = 0; \
 519             fprintf( stderr, "var2[%d]: {%d, %d, %d} != {%d, %d, %d} [FAILED]\n", i, res_c, ssd_c[0], ssd_c[1], res_asm, ssd_asm[0], ssd_asm[1] ); \
 520         } \
 521     }
 522
 523     ok = 1; used_asm = 0;
 524     TEST_PIXEL_VAR2( PIXEL_8x16 );
 525     TEST_PIXEL_VAR2( PIXEL_8x8 );
 526     report( "pixel var2 :" );
 527
 528     ok = 1; used_asm = 0;
 529     for( int i = 0; i < 4; i++ )
 530         if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] )
 531         {
 532             set_func_name( "hadamard_ac_%s", pixel_names[i] );
 533             used_asm = 1;
 534             for( int j = 0; j < 32; j++ )
 535             {
 536                 pixel *pix = (j&16 ? pbuf1 : pbuf3) + (j&15)*256;
 537                 call_c1( pixel_c.hadamard_ac[i],   pbuf1, (intptr_t)16 );
 538                 call_a1( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
 539                 uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
 540                 uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
 541                 if( rc != ra )
 542                 {
 543                     ok = 0;
 544                     fprintf( stderr, "hadamard_ac[%d]: %d,%d != %d,%d\n", i, (int)rc, (int)(rc>>32), (int)ra, (int)(ra>>32) );
 545                     break;
 546                 }
 547             }
 548             call_c2( pixel_c.hadamard_ac[i],   pbuf1, (intptr_t)16 );
 549             call_a2( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
 550         }
 551     report( "pixel hadamard_ac :" );
 552
 553     // maximize sum
 554     for( int i = 0; i < 32; i++ )
 555         for( int j = 0; j < 16; j++ )
 556             pbuf4[16*i+j] = -((i+j)&1) & PIXEL_MAX;
 557     ok = 1; used_asm = 0;
 558     if( pixel_asm.vsad != pixel_ref.vsad )
 559     {
 560         for( int h = 2; h <= 32; h += 2 )
 561         {
 562             int res_c, res_asm;
 563             set_func_name( "vsad" );
 564             used_asm = 1;
 565             for( int j = 0; j < 2 && ok; j++ )
 566             {
 567                 pixel *p = j ? pbuf4 : pbuf1;
 568                 res_c   = call_c( pixel_c.vsad,   p, (intptr_t)16, h );
 569                 res_asm = call_a( pixel_asm.vsad, p, (intptr_t)16, h );
 570                 if( res_c != res_asm )
 571                 {
 572                     ok = 0;
 573                     fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm );
 574                     break;
 575                 }
 576             }
 577         }
 578     }
 579     report( "pixel vsad :" );
 580
 581     ok = 1; used_asm = 0;
 582     if( pixel_asm.asd8 != pixel_ref.asd8 )
 583     {
 584         set_func_name( "asd8" );
 585         used_asm = 1;
 586         int res_c = call_c( pixel_c.asd8,   pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
 587         int res_a = call_a( pixel_asm.asd8, pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
 588         if( res_c != res_a )
 589         {
 590             ok = 0;
 591             fprintf( stderr, "asd: %d != %d\n", res_c, res_a );
 592         }
 593     }
 594     report( "pixel asd :" );
 595
 596 #define TEST_INTRA_X3( name, i8x8, ... ) \
 597     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
 598     { \
 599         ALIGNED_16( int res_c[4] ); \
 600         ALIGNED_16( int res_asm[4] ); \
 601         set_func_name( #name ); \
 602         used_asm = 1; \
 603         call_c( pixel_c.name, pbuf1+48, i8x8 ? edge : pbuf3+48, res_c ); \
 604         call_a( pixel_asm.name, pbuf1+48, i8x8 ? edge : pbuf3+48, res_asm ); \
 605         if( memcmp(res_c, res_asm, 3 * sizeof(*res_c)) ) \
 606         { \
 607             ok = 0; \
 608             fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \
 609                      res_c[0], res_c[1], res_c[2], \
 610                      res_asm[0], res_asm[1], res_asm[2] ); \
 611         } \
 612     }
 613
 614 #define TEST_INTRA_X9( name, cmp ) \
 615     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
 616     { \
 617         set_func_name( #name ); \
 618         used_asm = 1; \
 619         ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
 620         for( int i=0; i<17; i++ ) \
 621             bitcosts[i] = 9*(i!=8); \
 622         memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
 623         memcpy( pbuf4, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
 624         for( int i=0; i<32; i++ ) \
 625         { \
 626             pixel *fenc = pbuf1+48+i*12; \
 627             pixel *fdec1 = pbuf3+48+i*12; \
 628             pixel *fdec2 = pbuf4+48+i*12; \
 629             int pred_mode = i%9; \
 630             int res_c = INT_MAX; \
 631             for( int j=0; j<9; j++ ) \
 632             { \
 633                 predict_4x4[j]( fdec1 ); \
 634                 int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
 635                 if( cost < (uint16_t)res_c ) \
 636                     res_c = cost + (j<<16); \
 637             } \
 638             predict_4x4[res_c>>16]( fdec1 ); \
 639             int res_a = call_a( pixel_asm.name, fenc, fdec2, bitcosts+8-pred_mode ); \
 640             if( res_c != res_a ) \
 641             { \
 642                 ok = 0; \
 643                 fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
 644                 break; \
 645             } \
 646             if( memcmp(fdec1, fdec2, 4*FDEC_STRIDE*sizeof(pixel)) ) \
 647             { \
 648                 ok = 0; \
 649                 fprintf( stderr, #name" [FAILED]\n" ); \
 650                 for( int j=0; j<16; j++ ) \
 651                     fprintf( stderr, "%02x ", fdec1[(j&3)+(j>>2)*FDEC_STRIDE] ); \
 652                 fprintf( stderr, "\n" ); \
 653                 for( int j=0; j<16; j++ ) \
 654                     fprintf( stderr, "%02x ", fdec2[(j&3)+(j>>2)*FDEC_STRIDE] ); \
 655                 fprintf( stderr, "\n" ); \
 656                 break; \
 657             } \
 658         } \
 659     }
 660
 661 #define TEST_INTRA8_X9( name, cmp ) \
 662     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
 663     { \
 664         set_func_name( #name ); \
 665         used_asm = 1; \
 666         ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
 667         ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ); \
 668         ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ); \
 669         memset( satds_c, 0, 16 * sizeof(*satds_c) ); \
 670         memset( satds_a, 0, 16 * sizeof(*satds_a) ); \
 671         for( int i=0; i<17; i++ ) \
 672             bitcosts[i] = 9*(i!=8); \
 673         for( int i=0; i<32; i++ ) \
 674         { \
 675             pixel *fenc = pbuf1+48+i*12; \
 676             pixel *fdec1 = pbuf3+48+i*12; \
 677             pixel *fdec2 = pbuf4+48+i*12; \
 678             int pred_mode = i%9; \
 679             int res_c = INT_MAX; \
 680             predict_8x8_filter( fdec1, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); \
 681             for( int j=0; j<9; j++ ) \
 682             { \
 683                 predict_8x8[j]( fdec1, edge ); \
 684                 satds_c[j] = pixel_c.cmp[PIXEL_8x8]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
 685                 if( satds_c[j] < (uint16_t)res_c ) \
 686                     res_c = satds_c[j] + (j<<16); \
 687             } \
 688             predict_8x8[res_c>>16]( fdec1, edge ); \
 689             int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \
 690             if( res_c != res_a || memcmp(satds_c, satds_a, 16 * sizeof(*satds_c)) ) \
 691             { \
 692                 ok = 0; \
 693                 fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
 694                 for( int j = 0; j < 9; j++ ) \
 695                     fprintf( stderr, "%5d ", satds_c[j]); \
 696                 fprintf( stderr, "\n" ); \
 697                 for( int j = 0; j < 9; j++ ) \
 698                     fprintf( stderr, "%5d ", satds_a[j]); \
 699                 fprintf( stderr, "\n" ); \
 700                 break; \
 701             } \
 702             for( int j=0; j<8; j++ ) \
 703                 if( memcmp(fdec1+j*FDEC_STRIDE, fdec2+j*FDEC_STRIDE, 8*sizeof(pixel)) ) \
 704                     ok = 0; \
 705             if( !ok ) \
 706             { \
 707                 fprintf( stderr, #name" [FAILED]\n" ); \
 708                 for( int j=0; j<8; j++ ) \
 709                 { \
 710                     for( int k=0; k<8; k++ ) \
 711                         fprintf( stderr, "%02x ", fdec1[k+j*FDEC_STRIDE] ); \
 712                     fprintf( stderr, "\n" ); \
 713                 } \
 714                 fprintf( stderr, "\n" ); \
 715                 for( int j=0; j<8; j++ ) \
 716                 { \
 717                     for( int k=0; k<8; k++ ) \
 718                         fprintf( stderr, "%02x ", fdec2[k+j*FDEC_STRIDE] ); \
 719                     fprintf( stderr, "\n" ); \
 720                 } \
 721                 fprintf( stderr, "\n" ); \
 722                 break; \
 723             } \
 724         } \
 725     }
 726
 727     memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) );
 728     ok = 1; used_asm = 0;
 729     TEST_INTRA_X3( intra_satd_x3_16x16, 0 );
 730     TEST_INTRA_X3( intra_satd_x3_8x16c, 0 );
 731     TEST_INTRA_X3( intra_satd_x3_8x8c, 0 );
 732     TEST_INTRA_X3( intra_sa8d_x3_8x8, 1, edge );
 733     TEST_INTRA_X3( intra_satd_x3_4x4, 0 );
 734     report( "intra satd_x3 :" );
 735     ok = 1; used_asm = 0;
 736     TEST_INTRA_X3( intra_sad_x3_16x16, 0 );
 737     TEST_INTRA_X3( intra_sad_x3_8x16c, 0 );
 738     TEST_INTRA_X3( intra_sad_x3_8x8c, 0 );
 739     TEST_INTRA_X3( intra_sad_x3_8x8, 1, edge );
 740     TEST_INTRA_X3( intra_sad_x3_4x4, 0 );
 741     report( "intra sad_x3 :" );
 742     ok = 1; used_asm = 0;
 743     TEST_INTRA_X9( intra_satd_x9_4x4, satd );
 744     TEST_INTRA8_X9( intra_sa8d_x9_8x8, sa8d );
 745     report( "intra satd_x9 :" );
 746     ok = 1; used_asm = 0;
 747     TEST_INTRA_X9( intra_sad_x9_4x4, sad );
 748     TEST_INTRA8_X9( intra_sad_x9_8x8, sad );
 749     report( "intra sad_x9 :" );
 750
 751     ok = 1; used_asm = 0;
 752     if( pixel_asm.ssd_nv12_core != pixel_ref.ssd_nv12_core )
 753     {
 754         used_asm = 1;
 755         set_func_name( "ssd_nv12" );
 756         uint64_t res_u_c, res_v_c, res_u_a, res_v_a;
 757         for( int w = 8; w <= 360; w += 8 )
 758         {
 759             pixel_c.ssd_nv12_core(   pbuf1, 368, pbuf2, 368, w, 8, &res_u_c, &res_v_c );
 760             pixel_asm.ssd_nv12_core( pbuf1, 368, pbuf2, 368, w, 8, &res_u_a, &res_v_a );
 761             if( res_u_c != res_u_a || res_v_c != res_v_a )
 762             {
 763                 ok = 0;
 764                 fprintf( stderr, "ssd_nv12: %"PRIu64",%"PRIu64" != %"PRIu64",%"PRIu64"\n",
 765                          res_u_c, res_v_c, res_u_a, res_v_a );
 766             }
 767         }
 768         call_c( pixel_c.ssd_nv12_core,   pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_c, &res_v_c );
 769         call_a( pixel_asm.ssd_nv12_core, pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_a, &res_v_a );
 770     }
 771     report( "ssd_nv12 :" );
 772
 773     if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
 774         pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
 775     {
 776         int cnt;
 777         float res_c, res_a;
 778         ALIGNED_16( int sums[5][4] ) = {{0}};
 779         used_asm = ok = 1;
 780         x264_emms();
 781         res_c = x264_pixel_ssim_wxh( &pixel_c,   pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3, &cnt );
 782         res_a = x264_pixel_ssim_wxh( &pixel_asm, pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3, &cnt );
 783         if( fabs( res_c - res_a ) > 1e-6 )
 784         {
 785             ok = 0;
 786             fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
 787         }
 788         set_func_name( "ssim_core" );
 789         call_c( pixel_c.ssim_4x4x2_core,   pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
 790         call_a( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
 791         set_func_name( "ssim_end" );
 792         call_c2( pixel_c.ssim_end4,   sums, sums, 4 );
 793         call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
 794         /* check incorrect assumptions that 32-bit ints are zero-extended to 64-bit */
 795         call_c1( pixel_c.ssim_end4,   sums, sums, 3 );
 796         call_a1( pixel_asm.ssim_end4, sums, sums, 3 );
 797         report( "ssim :" );
 798     }
 799
 800     ok = 1; used_asm = 0;
 801     for( int i = 0; i < 32; i++ )
 802         cost_mv[i] = i*10;
 803     for( int i = 0; i < 100 && ok; i++ )
 804         if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
 805         {
 806             ALIGNED_16( uint16_t sums[72] );
 807             ALIGNED_16( int dc[4] );
 808             ALIGNED_16( int16_t mvs_a[48] );
 809             ALIGNED_16( int16_t mvs_c[48] );
 810             int mvn_a, mvn_c;
 811             int thresh = rand() & 0x3fff;
 812             set_func_name( "esa_ads" );
 813             for( int j = 0; j < 72; j++ )
 814                 sums[j] = rand() & 0x3fff;
 815             for( int j = 0; j < 4; j++ )
 816                 dc[j] = rand() & 0x3fff;
 817             used_asm = 1;
 818             mvn_c = call_c( pixel_c.ads[i&3], dc, sums, 32, cost_mv, mvs_c, 28, thresh );
 819             mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh );
 820             if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
 821             {
 822                 ok = 0;
 823                 printf( "c%d: ", i&3 );
 824                 for( int j = 0; j < mvn_c; j++ )
 825                     printf( "%d ", mvs_c[j] );
 826                 printf( "\na%d: ", i&3 );
 827                 for( int j = 0; j < mvn_a; j++ )
 828                     printf( "%d ", mvs_a[j] );
 829                 printf( "\n\n" );
 830             }
 831         }
 832     report( "esa ads:" );
 833
 834     return ret;
 835 }
 836
 837 static int check_dct( int cpu_ref, int cpu_new )
 838 {
 839     x264_dct_function_t dct_c;
 840     x264_dct_function_t dct_ref;
 841     x264_dct_function_t dct_asm;
 842     x264_quant_function_t qf;
 843     int ret = 0, ok, used_asm, interlace = 0;
 844     ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] );
 845     ALIGNED_ARRAY_64( dctcoef, dct2, [16],[16] );
 846     ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] );
 847     ALIGNED_ARRAY_64( dctcoef, dct8, [4],[64] );
 848     ALIGNED_16( dctcoef dctdc[2][8] );
 849     x264_t h_buf;
 850     x264_t *h = &h_buf;
 851
 852     x264_dct_init( 0, &dct_c );
 853     x264_dct_init( cpu_ref, &dct_ref);
 854     x264_dct_init( cpu_new, &dct_asm );
 855
 856     memset( h, 0, sizeof(*h) );
 857     x264_param_default( &h->param );
 858     h->sps->i_chroma_format_idc = 1;
 859     h->chroma_qp_table = i_chroma_qp_table + 12;
 860     h->param.analyse.i_luma_deadzone[0] = 0;
 861     h->param.analyse.i_luma_deadzone[1] = 0;
 862     h->param.analyse.b_transform_8x8 = 1;
 863     for( int i = 0; i < 6; i++ )
 864         h->sps->scaling_list[i] = x264_cqm_flat16;
 865     x264_cqm_init( h );
 866     x264_quant_init( h, 0, &qf );
 867
 868     /* overflow test cases */
 869     for( int i = 0; i < 5; i++ )
 870     {
 871         pixel *enc = &pbuf3[16*i*FENC_STRIDE];
 872         pixel *dec = &pbuf4[16*i*FDEC_STRIDE];
 873
 874         for( int j = 0; j < 16; j++ )
 875         {
 876             int cond_a = (i < 2) ? 1 : ((j&3) == 0 || (j&3) == (i-1));
 877             int cond_b = (i == 0) ? 1 : !cond_a;
 878             enc[0] = enc[1] = enc[4] = enc[5] = enc[8] = enc[9] = enc[12] = enc[13] = cond_a ? PIXEL_MAX : 0;
 879             enc[2] = enc[3] = enc[6] = enc[7] = enc[10] = enc[11] = enc[14] = enc[15] = cond_b ? PIXEL_MAX : 0;
 880
 881             for( int k = 0; k < 4; k++ )
 882                 dec[k] = PIXEL_MAX - enc[k];
 883
 884             enc += FENC_STRIDE;
 885             dec += FDEC_STRIDE;
 886         }
 887     }
 888
 889 #define TEST_DCT( name, t1, t2, size ) \
 890     if( dct_asm.name != dct_ref.name ) \
 891     { \
 892         set_func_name( #name ); \
 893         used_asm = 1; \
 894         pixel *enc = pbuf3; \
 895         pixel *dec = pbuf4; \
 896         for( int j = 0; j < 5; j++) \
 897         { \
 898             call_c( dct_c.name, t1, &pbuf1[j*64], &pbuf2[j*64] ); \
 899             call_a( dct_asm.name, t2, &pbuf1[j*64], &pbuf2[j*64] ); \
 900             if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
 901             { \
 902                 ok = 0; \
 903                 fprintf( stderr, #name " [FAILED]\n" ); \
 904                 for( int k = 0; k < size; k++ )\
 905                     printf( "%d ", ((dctcoef*)t1)[k] );\
 906                 printf("\n");\
 907                 for( int k = 0; k < size; k++ )\
 908                     printf( "%d ", ((dctcoef*)t2)[k] );\
 909                 printf("\n");\
 910                 break; \
 911             } \
 912             call_c( dct_c.name, t1, enc, dec ); \
 913             call_a( dct_asm.name, t2, enc, dec ); \
 914             if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
 915             { \
 916                 ok = 0; \
 917                 fprintf( stderr, #name " [FAILED] (overflow)\n" ); \
 918                 break; \
 919             } \
 920             enc += 16*FENC_STRIDE; \
 921             dec += 16*FDEC_STRIDE; \
 922         } \
 923     }
 924     ok = 1; used_asm = 0;
 925     TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 );
 926     TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 );
 927     TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 );
 928     TEST_DCT( sub8x16_dct_dc, dctdc[0], dctdc[1], 8 );
 929     TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 );
 930     report( "sub_dct4 :" );
 931
 932     ok = 1; used_asm = 0;
 933     TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64 );
 934     TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*4 );
 935     report( "sub_dct8 :" );
 936 #undef TEST_DCT
 937
 938     // fdct and idct are denormalized by different factors, so quant/dequant
 939     // is needed to force the coefs into the right range.
 940     dct_c.sub16x16_dct( dct4, pbuf1, pbuf2 );
 941     dct_c.sub16x16_dct8( dct8, pbuf1, pbuf2 );
 942     for( int i = 0; i < 16; i++ )
 943     {
 944         qf.quant_4x4( dct4[i], h->quant4_mf[CQM_4IY][20], h->quant4_bias[CQM_4IY][20] );
 945         qf.dequant_4x4( dct4[i], h->dequant4_mf[CQM_4IY], 20 );
 946     }
 947     for( int i = 0; i < 4; i++ )
 948     {
 949         qf.quant_8x8( dct8[i], h->quant8_mf[CQM_8IY][20], h->quant8_bias[CQM_8IY][20] );
 950         qf.dequant_8x8( dct8[i], h->dequant8_mf[CQM_8IY], 20 );
 951     }
 952     x264_cqm_delete( h );
 953
 954 #define TEST_IDCT( name, src ) \
 955     if( dct_asm.name != dct_ref.name ) \
 956     { \
 957         set_func_name( #name ); \
 958         used_asm = 1; \
 959         memcpy( pbuf3, pbuf1, 32*32 * sizeof(pixel) ); \
 960         memcpy( pbuf4, pbuf1, 32*32 * sizeof(pixel) ); \
 961         memcpy( dct1, src, 256 * sizeof(dctcoef) ); \
 962         memcpy( dct2, src, 256 * sizeof(dctcoef) ); \
 963         call_c1( dct_c.name, pbuf3, (void*)dct1 ); \
 964         call_a1( dct_asm.name, pbuf4, (void*)dct2 ); \
 965         if( memcmp( pbuf3, pbuf4, 32*32 * sizeof(pixel) ) ) \
 966         { \
 967             ok = 0; \
 968             fprintf( stderr, #name " [FAILED]\n" ); \
 969         } \
 970         call_c2( dct_c.name, pbuf3, (void*)dct1 ); \
 971         call_a2( dct_asm.name, pbuf4, (void*)dct2 ); \
 972     }
 973     ok = 1; used_asm = 0;
 974     TEST_IDCT( add4x4_idct, dct4 );
 975     TEST_IDCT( add8x8_idct, dct4 );
 976     TEST_IDCT( add8x8_idct_dc, dct4 );
 977     TEST_IDCT( add16x16_idct, dct4 );
 978     TEST_IDCT( add16x16_idct_dc, dct4 );
 979     report( "add_idct4 :" );
 980
 981     ok = 1; used_asm = 0;
 982     TEST_IDCT( add8x8_idct8, dct8 );
 983     TEST_IDCT( add16x16_idct8, dct8 );
 984     report( "add_idct8 :" );
 985 #undef TEST_IDCT
 986
 987 #define TEST_DCTDC( name )\
 988     ok = 1; used_asm = 0;\
 989     if( dct_asm.name != dct_ref.name )\
 990     {\
 991         set_func_name( #name );\
 992         used_asm = 1;\
 993         uint16_t *p = (uint16_t*)buf1;\
 994         for( int i = 0; i < 16 && ok; i++ )\
 995         {\
 996             for( int j = 0; j < 16; j++ )\
 997                 dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
 998                            : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
 999                            : ((*p++)&0x1fff)-0x1000; /* general case */\
1000             memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\
1001             call_c1( dct_c.name, dct1[0] );\
1002             call_a1( dct_asm.name, dct2[0] );\
1003             if( memcmp( dct1, dct2, 16 * sizeof(dctcoef) ) )\
1004                 ok = 0;\
1005         }\
1006         call_c2( dct_c.name, dct1[0] );\
1007         call_a2( dct_asm.name, dct2[0] );\
1008     }\
1009     report( #name " :" );
1010
1011     TEST_DCTDC(  dct4x4dc );
1012     TEST_DCTDC( idct4x4dc );
1013 #undef TEST_DCTDC
1014
1015 #define TEST_DCTDC_CHROMA( name )\
1016     ok = 1; used_asm = 0;\
1017     if( dct_asm.name != dct_ref.name )\
1018     {\
1019         set_func_name( #name );\
1020         used_asm = 1;\
1021         uint16_t *p = (uint16_t*)buf1;\
1022         for( int i = 0; i < 16 && ok; i++ )\
1023         {\
1024             for( int j = 0; j < 8; j++ )\
1025                 dct1[j][0] = !i ? (j^j>>1^j>>2)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
1026                            : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
1027                            : ((*p++)&0x1fff)-0x1000; /* general case */\
1028             memcpy( dct2, dct1, 8*16 * sizeof(dctcoef) );\
1029             call_c1( dct_c.name, dctdc[0], dct1 );\
1030             call_a1( dct_asm.name, dctdc[1], dct2 );\
1031             if( memcmp( dctdc[0], dctdc[1], 8 * sizeof(dctcoef) ) || memcmp( dct1, dct2, 8*16 * sizeof(dctcoef) ) )\
1032             {\
1033                 ok = 0;\
1034                 fprintf( stderr, #name " [FAILED]\n" ); \
1035             }\
1036         }\
1037         call_c2( dct_c.name, dctdc[0], dct1 );\
1038         call_a2( dct_asm.name, dctdc[1], dct2 );\
1039     }\
1040     report( #name " :" );
1041
1042     TEST_DCTDC_CHROMA( dct2x4dc );
1043 #undef TEST_DCTDC_CHROMA
1044
1045     x264_zigzag_function_t zigzag_c[2];
1046     x264_zigzag_function_t zigzag_ref[2];
1047     x264_zigzag_function_t zigzag_asm[2];
1048
1049     ALIGNED_ARRAY_64( dctcoef, level1,[64] );
1050     ALIGNED_ARRAY_64( dctcoef, level2,[64] );
1051
1052 #define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
1053     if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
1054     { \
1055         set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
1056         used_asm = 1; \
1057         for( int i = 0; i < size*size; i++ ) \
1058             dct[i] = i; \
1059         call_c( zigzag_c[interlace].name, t1, dct ); \
1060         call_a( zigzag_asm[interlace].name, t2, dct ); \
1061         if( memcmp( t1, t2, size*size*sizeof(dctcoef) ) ) \
1062         { \
1063             ok = 0; \
1064             for( int i = 0; i < 2; i++ ) \
1065             { \
1066                 dctcoef *d = (dctcoef*)(i ? t2 : t1); \
1067                 for( int j = 0; j < size; j++ ) \
1068                 { \
1069                     for( int k = 0; k < size; k++ ) \
1070                         fprintf( stderr, "%2d ", d[k+j*8] ); \
1071                     fprintf( stderr, "\n" ); \
1072                 } \
1073                 fprintf( stderr, "\n" ); \
1074             } \
1075             fprintf( stderr, #name " [FAILED]\n" ); \
1076         } \
1077     }
1078
1079 #define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
1080     if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
1081     { \
1082         int nz_a, nz_c; \
1083         set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
1084         used_asm = 1; \
1085         memcpy( pbuf3, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
1086         memcpy( pbuf4, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
1087         nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \
1088         nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \
1089         if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE*sizeof(pixel) ) || nz_c != nz_a ) \
1090         { \
1091             ok = 0; \
1092             fprintf( stderr, #name " [FAILED]\n" ); \
1093         } \
1094         call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \
1095         call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \
1096     }
1097
1098 #define TEST_ZIGZAG_SUBAC( name, t1, t2 ) \
1099     if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
1100     { \
1101         int nz_a, nz_c; \
1102         dctcoef dc_a, dc_c; \
1103         set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
1104         used_asm = 1; \
1105         for( int i = 0; i < 2; i++ ) \
1106         { \
1107             memcpy( pbuf3, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
1108             memcpy( pbuf4, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
1109             for( int j = 0; j < 4; j++ ) \
1110             { \
1111                 memcpy( pbuf3 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
1112                 memcpy( pbuf4 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
1113             } \
1114             nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \
1115             nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \
1116             if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \
1117             { \
1118                 ok = 0; \
1119                 fprintf( stderr, #name " [FAILED]\n" ); \
1120                 break; \
1121             } \
1122         } \
1123         call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \
1124         call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \
1125     }
1126
1127 #define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
1128     if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
1129     { \
1130         for( int j = 0; j < 100; j++ ) \
1131         { \
1132             set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
1133             used_asm = 1; \
1134             memcpy(dct, buf1, size*sizeof(dctcoef)); \
1135             for( int i = 0; i < size; i++ ) \
1136                 dct[i] = rand()&0x1F ? 0 : dct[i]; \
1137             memcpy(buf3, buf4, 10); \
1138             call_c( zigzag_c[interlace].name, t1, dct, buf3 ); \
1139             call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \
1140             if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \
1141             { \
1142                 ok = 0; printf("%d: %d %d %d %d\n%d %d %d %d\n\n",memcmp( t1, t2, size*sizeof(dctcoef) ),buf3[0], buf3[1], buf3[8], buf3[9], buf4[0], buf4[1], buf4[8], buf4[9]);break;\
1143             } \
1144         } \
1145     }
1146
1147     x264_zigzag_init( 0, &zigzag_c[0], &zigzag_c[1] );
1148     x264_zigzag_init( cpu_ref, &zigzag_ref[0], &zigzag_ref[1] );
1149     x264_zigzag_init( cpu_new, &zigzag_asm[0], &zigzag_asm[1] );
1150
1151     ok = 1; used_asm = 0;
1152     TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct8[0], 64 );
1153     report( "zigzag_interleave :" );
1154
1155     for( interlace = 0; interlace <= 1; interlace++ )
1156     {
1157         ok = 1; used_asm = 0;
1158         TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, dct8[0], 8 );
1159         TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 4 );
1160         TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
1161         TEST_ZIGZAG_SUB( sub_8x8, level1, level2, 64 );
1162         TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
1163         report( interlace ? "zigzag_field :" : "zigzag_frame :" );
1164     }
1165 #undef TEST_ZIGZAG_SCAN
1166 #undef TEST_ZIGZAG_SUB
1167
1168     return ret;
1169 }
1170
1171 static int check_mc( int cpu_ref, int cpu_new )
1172 {
1173     x264_mc_functions_t mc_c;
1174     x264_mc_functions_t mc_ref;
1175     x264_mc_functions_t mc_a;
1176     x264_pixel_function_t pixf;
1177
1178     pixel *src     = &(pbuf1)[2*64+2];
1179     pixel *src2[4] = { &(pbuf1)[3*64+2], &(pbuf1)[5*64+2],
1180                        &(pbuf1)[7*64+2], &(pbuf1)[9*64+2] };
1181     pixel *dst1    = pbuf3;
1182     pixel *dst2    = pbuf4;
1183
1184     int ret = 0, ok, used_asm;
1185
1186     x264_mc_init( 0, &mc_c, 0 );
1187     x264_mc_init( cpu_ref, &mc_ref, 0 );
1188     x264_mc_init( cpu_new, &mc_a, 0 );
1189     x264_pixel_init( 0, &pixf );
1190
1191 #define MC_TEST_LUMA( w, h ) \
1192         if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
1193         { \
1194             const x264_weight_t *weight = x264_weight_none; \
1195             set_func_name( "mc_luma_%dx%d", w, h ); \
1196             used_asm = 1; \
1197             for( int i = 0; i < 1024; i++ ) \
1198                 pbuf3[i] = pbuf4[i] = 0xCD; \
1199             call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
1200             call_a( mc_a.mc_luma, dst2, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
1201             if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
1202             { \
1203                 fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
1204                 ok = 0; \
1205             } \
1206         } \
1207         if( mc_a.get_ref != mc_ref.get_ref ) \
1208         { \
1209             pixel *ref = dst2; \
1210             intptr_t ref_stride = 32; \
1211             int w_checked = ( ( sizeof(pixel) == 2 && (w == 12 || w == 20)) ? w-2 : w ); \
1212             const x264_weight_t *weight = x264_weight_none; \
1213             set_func_name( "get_ref_%dx%d", w_checked, h ); \
1214             used_asm = 1; \
1215             for( int i = 0; i < 1024; i++ ) \
1216                 pbuf3[i] = pbuf4[i] = 0xCD; \
1217             call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
1218             ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, (intptr_t)64, dx, dy, w, h, weight ); \
1219             for( int i = 0; i < h; i++ ) \
1220                 if( memcmp( dst1+i*32, ref+i*ref_stride, w_checked * sizeof(pixel) ) ) \
1221                 { \
1222                     fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w_checked, h ); \
1223                     ok = 0; \
1224                     break; \
1225                 } \
1226         }
1227
1228 #define MC_TEST_CHROMA( w, h ) \
1229         if( mc_a.mc_chroma != mc_ref.mc_chroma ) \
1230         { \
1231             set_func_name( "mc_chroma_%dx%d", w, h ); \
1232             used_asm = 1; \
1233             for( int i = 0; i < 1024; i++ ) \
1234                 pbuf3[i] = pbuf4[i] = 0xCD; \
1235             call_c( mc_c.mc_chroma, dst1, dst1+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
1236             call_a( mc_a.mc_chroma, dst2, dst2+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
1237             /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
1238             for( int j = 0; j < h; j++ ) \
1239                 for( int i = w; i < 8; i++ ) \
1240                 { \
1241                     dst2[i+j*16+8] = dst1[i+j*16+8]; \
1242                     dst2[i+j*16  ] = dst1[i+j*16  ]; \
1243                 } \
1244             if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
1245             { \
1246                 fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
1247                 ok = 0; \
1248             } \
1249         }
1250     ok = 1; used_asm = 0;
1251     for( int dy = -8; dy < 8; dy++ )
1252         for( int dx = -128; dx < 128; dx++ )
1253         {
1254             if( rand()&15 ) continue; // running all of them is too slow
1255             MC_TEST_LUMA( 20, 18 );
1256             MC_TEST_LUMA( 16, 16 );
1257             MC_TEST_LUMA( 16, 8 );
1258             MC_TEST_LUMA( 12, 10 );
1259             MC_TEST_LUMA( 8, 16 );
1260             MC_TEST_LUMA( 8, 8 );
1261             MC_TEST_LUMA( 8, 4 );
1262             MC_TEST_LUMA( 4, 8 );
1263             MC_TEST_LUMA( 4, 4 );
1264         }
1265     report( "mc luma :" );
1266
1267     ok = 1; used_asm = 0;
1268     for( int dy = -1; dy < 9; dy++ )
1269         for( int dx = -128; dx < 128; dx++ )
1270         {
1271             if( rand()&15 ) continue;
1272             MC_TEST_CHROMA( 8, 8 );
1273             MC_TEST_CHROMA( 8, 4 );
1274             MC_TEST_CHROMA( 4, 8 );
1275             MC_TEST_CHROMA( 4, 4 );
1276             MC_TEST_CHROMA( 4, 2 );
1277             MC_TEST_CHROMA( 2, 4 );
1278             MC_TEST_CHROMA( 2, 2 );
1279         }
1280     report( "mc chroma :" );
1281 #undef MC_TEST_LUMA
1282 #undef MC_TEST_CHROMA
1283
1284 #define MC_TEST_AVG( name, weight ) \
1285 { \
1286     for( int i = 0; i < 12; i++ ) \
1287     { \
1288         memcpy( pbuf3, pbuf1+320, 320 * sizeof(pixel) ); \
1289         memcpy( pbuf4, pbuf1+320, 320 * sizeof(pixel) ); \
1290         if( mc_a.name[i] != mc_ref.name[i] ) \
1291         { \
1292             set_func_name( "%s_%s", #name, pixel_names[i] ); \
1293             used_asm = 1; \
1294             call_c1( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
1295             call_a1( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
1296             if( memcmp( pbuf3, pbuf4, 320 * sizeof(pixel) ) ) \
1297             { \
1298                 ok = 0; \
1299                 fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
1300             } \
1301             call_c2( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
1302             call_a2( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
1303         } \
1304     } \
1305 }
1306
1307     ok = 1, used_asm = 0;
1308     for( int w = -63; w <= 127 && ok; w++ )
1309         MC_TEST_AVG( avg, w );
1310     report( "mc wpredb :" );
1311
1312 #define MC_TEST_WEIGHT( name, weight, aligned ) \
1313     int align_off = (aligned ? 0 : rand()%16); \
1314     for( int i = 1; i <= 5; i++ ) \
1315     { \
1316         ALIGNED_16( pixel buffC[640] ); \
1317         ALIGNED_16( pixel buffA[640] ); \
1318         int j = X264_MAX( i*4, 2 ); \
1319         memset( buffC, 0, 640 * sizeof(pixel) ); \
1320         memset( buffA, 0, 640 * sizeof(pixel) ); \
1321         x264_t ha; \
1322         ha.mc = mc_a; \
1323         /* w12 is the same as w16 in some cases */ \
1324         if( i == 3 && mc_a.name[i] == mc_a.name[i+1] ) \
1325             continue; \
1326         if( mc_a.name[i] != mc_ref.name[i] ) \
1327         { \
1328             set_func_name( "%s_w%d", #name, j ); \
1329             used_asm = 1; \
1330             call_c1( mc_c.weight[i],     buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
1331             mc_a.weight_cache(&ha, &weight); \
1332             call_a1( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
1333             for( int k = 0; k < 16; k++ ) \
1334                 if( memcmp( &buffC[k*32], &buffA[k*32], j * sizeof(pixel) ) ) \
1335                 { \
1336                     ok = 0; \
1337                     fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \
1338                     break; \
1339                 } \
1340             /* omit unlikely high scales for benchmarking */ \
1341             if( (s << (8-d)) < 512 ) \
1342             { \
1343                 call_c2( mc_c.weight[i],     buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
1344                 call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
1345             } \
1346         } \
1347     }
1348
1349     ok = 1; used_asm = 0;
1350
1351     int align_cnt = 0;
1352     for( int s = 0; s <= 127 && ok; s++ )
1353     {
1354         for( int o = -128; o <= 127 && ok; o++ )
1355         {
1356             if( rand() & 2047 ) continue;
1357             for( int d = 0; d <= 7 && ok; d++ )
1358             {
1359                 if( s == 1<<d )
1360                     continue;
1361                 x264_weight_t weight = { .i_scale = s, .i_denom = d, .i_offset = o };
1362                 MC_TEST_WEIGHT( weight, weight, (align_cnt++ % 4) );
1363             }
1364         }
1365
1366     }
1367     report( "mc weight :" );
1368
1369     ok = 1; used_asm = 0;
1370     for( int o = 0; o <= 127 && ok; o++ )
1371     {
1372         int s = 1, d = 0;
1373         if( rand() & 15 ) continue;
1374         x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
1375         MC_TEST_WEIGHT( offsetadd, weight, (align_cnt++ % 4) );
1376     }
1377     report( "mc offsetadd :" );
1378     ok = 1; used_asm = 0;
1379     for( int o = -128; o < 0 && ok; o++ )
1380     {
1381         int s = 1, d = 0;
1382         if( rand() & 15 ) continue;
1383         x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
1384         MC_TEST_WEIGHT( offsetsub, weight, (align_cnt++ % 4) );
1385     }
1386     report( "mc offsetsub :" );
1387
1388     memset( pbuf3, 0, 64*16 );
1389     memset( pbuf4, 0, 64*16 );
1390     ok = 1; used_asm = 0;
1391     for( int height = 8; height <= 16; height += 8 )
1392     {
1393         if( mc_a.store_interleave_chroma != mc_ref.store_interleave_chroma )
1394         {
1395             set_func_name( "store_interleave_chroma" );
1396             used_asm = 1;
1397             call_c( mc_c.store_interleave_chroma, pbuf3, (intptr_t)64, pbuf1, pbuf1+16, height );
1398             call_a( mc_a.store_interleave_chroma, pbuf4, (intptr_t)64, pbuf1, pbuf1+16, height );
1399             if( memcmp( pbuf3, pbuf4, 64*height ) )
1400             {
1401                 ok = 0;
1402                 fprintf( stderr, "store_interleave_chroma FAILED: h=%d\n", height );
1403                 break;
1404             }
1405         }
1406         if( mc_a.load_deinterleave_chroma_fenc != mc_ref.load_deinterleave_chroma_fenc )
1407         {
1408             set_func_name( "load_deinterleave_chroma_fenc" );
1409             used_asm = 1;
1410             call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, (intptr_t)64, height );
1411             call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, (intptr_t)64, height );
1412             if( memcmp( pbuf3, pbuf4, FENC_STRIDE*height ) )
1413             {
1414                 ok = 0;
1415                 fprintf( stderr, "load_deinterleave_chroma_fenc FAILED: h=%d\n", height );
1416                 break;
1417             }
1418         }
1419         if( mc_a.load_deinterleave_chroma_fdec != mc_ref.load_deinterleave_chroma_fdec )
1420         {
1421             set_func_name( "load_deinterleave_chroma_fdec" );
1422             used_asm = 1;
1423             call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, (intptr_t)64, height );
1424             call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, (intptr_t)64, height );
1425             if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*height ) )
1426             {
1427                 ok = 0;
1428                 fprintf( stderr, "load_deinterleave_chroma_fdec FAILED: h=%d\n", height );
1429                 break;
1430             }
1431         }
1432     }
1433     report( "store_interleave :" );
1434
1435     struct plane_spec {
1436         int w, h, src_stride;
1437     } plane_specs[] = { {2,2,2}, {8,6,8}, {20,31,24}, {32,8,40}, {256,10,272}, {504,7,505}, {528,6,528}, {256,10,-256}, {263,9,-264}, {1904,1,0} };
1438     ok = 1; used_asm = 0;
1439     if( mc_a.plane_copy != mc_ref.plane_copy )
1440     {
1441         set_func_name( "plane_copy" );
1442         used_asm = 1;
1443         for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
1444         {
1445             int w = plane_specs[i].w;
1446             int h = plane_specs[i].h;
1447             intptr_t src_stride = plane_specs[i].src_stride;
1448             intptr_t dst_stride = (w + 127) & ~63;
1449             assert( dst_stride * h <= 0x1000 );
1450             pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
1451             memset( pbuf3, 0, 0x1000*sizeof(pixel) );
1452             memset( pbuf4, 0, 0x1000*sizeof(pixel) );
1453             call_c( mc_c.plane_copy, pbuf3, dst_stride, src1, src_stride, w, h );
1454             call_a( mc_a.plane_copy, pbuf4, dst_stride, src1, src_stride, w, h );
1455             for( int y = 0; y < h; y++ )
1456                 if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) )
1457                 {
1458                     ok = 0;
1459                     fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
1460                     break;
1461                 }
1462         }
1463     }
1464
1465     if( mc_a.plane_copy_swap != mc_ref.plane_copy_swap )
1466     {
1467         set_func_name( "plane_copy_swap" );
1468         used_asm = 1;
1469         for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
1470         {
1471             int w = (plane_specs[i].w + 1) >> 1;
1472             int h = plane_specs[i].h;
1473             intptr_t src_stride = plane_specs[i].src_stride;
1474             intptr_t dst_stride = (2*w + 127) & ~63;
1475             assert( dst_stride * h <= 0x1000 );
1476             pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
1477             memset( pbuf3, 0, 0x1000*sizeof(pixel) );
1478             memset( pbuf4, 0, 0x1000*sizeof(pixel) );
1479             call_c( mc_c.plane_copy_swap, pbuf3, dst_stride, src1, src_stride, w, h );
1480             call_a( mc_a.plane_copy_swap, pbuf4, dst_stride, src1, src_stride, w, h );
1481             for( int y = 0; y < h; y++ )
1482                 if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) )
1483                 {
1484                     ok = 0;
1485                     fprintf( stderr, "plane_copy_swap FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
1486                     break;
1487                 }
1488         }
1489     }
1490
1491     if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave )
1492     {
1493         set_func_name( "plane_copy_interleave" );
1494         used_asm = 1;
1495         for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
1496         {
1497             int w = (plane_specs[i].w + 1) >> 1;
1498             int h = plane_specs[i].h;
1499             intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1;
1500             intptr_t dst_stride = (2*w + 127) & ~63;
1501             assert( dst_stride * h <= 0x1000 );
1502             pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
1503             memset( pbuf3, 0, 0x1000*sizeof(pixel) );
1504             memset( pbuf4, 0, 0x1000*sizeof(pixel) );
1505             call_c( mc_c.plane_copy_interleave, pbuf3, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h );
1506             call_a( mc_a.plane_copy_interleave, pbuf4, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h );
1507             for( int y = 0; y < h; y++ )
1508                 if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) )
1509                 {
1510                     ok = 0;
1511                     fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
1512                     break;
1513                 }
1514         }
1515     }
1516
1517     if( mc_a.plane_copy_deinterleave != mc_ref.plane_copy_deinterleave )
1518     {
1519         set_func_name( "plane_copy_deinterleave" );
1520         used_asm = 1;
1521         for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
1522         {
1523             int w = (plane_specs[i].w + 1) >> 1;
1524             int h = plane_specs[i].h;
1525             intptr_t dst_stride = w;
1526             intptr_t src_stride = (2*w + 127) & ~63;
1527             intptr_t offv = (dst_stride*h + 63) & ~31;
1528             memset( pbuf3, 0, 0x1000 );
1529             memset( pbuf4, 0, 0x1000 );
1530             call_c( mc_c.plane_copy_deinterleave, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf1, src_stride, w, h );
1531             call_a( mc_a.plane_copy_deinterleave, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf1, src_stride, w, h );
1532             for( int y = 0; y < h; y++ )
1533                 if( memcmp( pbuf3+y*dst_stride,      pbuf4+y*dst_stride, w ) ||
1534                     memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w ) )
1535                 {
1536                     ok = 0;
1537                     fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
1538                     break;
1539                 }
1540         }
1541     }
1542
1543     if( mc_a.plane_copy_deinterleave_yuyv != mc_ref.plane_copy_deinterleave_yuyv )
1544     {
1545         set_func_name( "plane_copy_deinterleave_yuyv" );
1546         used_asm = 1;
1547         for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
1548         {
1549             int w = (plane_specs[i].w + 1) >> 1;
1550             int h = plane_specs[i].h;
1551             intptr_t dst_stride = ALIGN( w, 32/sizeof(pixel) );
1552             intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1;
1553             intptr_t offv = dst_stride*h;
1554             pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
1555             memset( pbuf3, 0, 0x1000 );
1556             memset( pbuf4, 0, 0x1000 );
1557             /* Skip benchmarking since it's the same as plane_copy_deinterleave(), just verify correctness. */
1558             call_c1( mc_c.plane_copy_deinterleave_yuyv, pbuf3, dst_stride, pbuf3+offv, dst_stride, src1, src_stride, w, h );
1559             call_a1( mc_a.plane_copy_deinterleave_yuyv, pbuf4, dst_stride, pbuf4+offv, dst_stride, src1, src_stride, w, h );
1560             for( int y = 0; y < h; y++ )
1561                 if( memcmp( pbuf3+y*dst_stride,      pbuf4+y*dst_stride,      w*sizeof(pixel) ) ||
1562                     memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(pixel) ) )
1563                 {
1564                     fprintf( stderr, "plane_copy_deinterleave_yuyv FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
1565                     break;
1566                 }
1567         }
1568     }
1569
1570     if( mc_a.plane_copy_deinterleave_rgb != mc_ref.plane_copy_deinterleave_rgb )
1571     {
1572         set_func_name( "plane_copy_deinterleave_rgb" );
1573         used_asm = 1;
1574         for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
1575         {
1576             int w = (plane_specs[i].w + 2) >> 2;
1577             int h = plane_specs[i].h;
1578             intptr_t src_stride = plane_specs[i].src_stride;
1579             intptr_t dst_stride = ALIGN( w, 16 );
1580             intptr_t offv = dst_stride*h + 16;
1581
1582             for( int pw = 3; pw <= 4; pw++ )
1583             {
1584                 memset( pbuf3, 0, 0x1000 );
1585                 memset( pbuf4, 0, 0x1000 );
1586                 call_c( mc_c.plane_copy_deinterleave_rgb, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf3+2*offv, dst_stride, pbuf1, src_stride, pw, w, h );
1587                 call_a( mc_a.plane_copy_deinterleave_rgb, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf4+2*offv, dst_stride, pbuf1, src_stride, pw, w, h );
1588                 for( int y = 0; y < h; y++ )
1589                     if( memcmp( pbuf3+y*dst_stride+0*offv, pbuf4+y*dst_stride+0*offv, w ) ||
1590                         memcmp( pbuf3+y*dst_stride+1*offv, pbuf4+y*dst_stride+1*offv, w ) ||
1591                         memcmp( pbuf3+y*dst_stride+2*offv, pbuf4+y*dst_stride+2*offv, w ) )
1592                     {
1593                         ok = 0;
1594                         fprintf( stderr, "plane_copy_deinterleave_rgb FAILED: w=%d h=%d stride=%d pw=%d\n", w, h, (int)src_stride, pw );
1595                         break;
1596                     }
1597             }
1598         }
1599     }
1600     report( "plane_copy :" );
1601
1602     if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 )
1603     {
1604         set_func_name( "plane_copy_deinterleave_v210" );
1605         ok = 1; used_asm = 1;
1606         for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
1607         {
1608             int w = (plane_specs[i].w + 1) >> 1;
1609             int h = plane_specs[i].h;
1610             intptr_t dst_stride = ALIGN( w, 32 );
1611             intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t);
1612             intptr_t offv = dst_stride*h + 32;
1613             memset( pbuf3, 0, 0x1000 );
1614             memset( pbuf4, 0, 0x1000 );
1615             call_c( mc_c.plane_copy_deinterleave_v210, pbuf3, dst_stride, pbuf3+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
1616             call_a( mc_a.plane_copy_deinterleave_v210, pbuf4, dst_stride, pbuf4+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
1617             for( int y = 0; y < h; y++ )
1618                 if( memcmp( pbuf3+y*dst_stride,      pbuf4+y*dst_stride,      w*sizeof(uint16_t) ) ||
1619                     memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(uint16_t) ) )
1620                 {
1621                     ok = 0;
1622                     fprintf( stderr, "plane_copy_deinterleave_v210 FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
1623                     break;
1624                 }
1625         }
1626         report( "v210 :" );
1627     }
1628
1629     if( mc_a.hpel_filter != mc_ref.hpel_filter )
1630     {
1631         pixel *srchpel = pbuf1+8+2*64;
1632         pixel *dstc[3] = { pbuf3+8, pbuf3+8+16*64, pbuf3+8+32*64 };
1633         pixel *dsta[3] = { pbuf4+8, pbuf4+8+16*64, pbuf4+8+32*64 };
1634         void *tmp = pbuf3+49*64;
1635         set_func_name( "hpel_filter" );
1636         ok = 1; used_asm = 1;
1637         memset( pbuf3, 0, 4096 * sizeof(pixel) );
1638         memset( pbuf4, 0, 4096 * sizeof(pixel) );
1639         call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, (intptr_t)64, 48, 10, tmp );
1640         call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, (intptr_t)64, 48, 10, tmp );
1641         for( int i = 0; i < 3; i++ )
1642             for( int j = 0; j < 10; j++ )
1643                 //FIXME ideally the first pixels would match too, but they aren't actually used
1644                 if( memcmp( dstc[i]+j*64+2, dsta[i]+j*64+2, 43 * sizeof(pixel) ) )
1645                 {
1646                     ok = 0;
1647                     fprintf( stderr, "hpel filter differs at plane %c line %d\n", "hvc"[i], j );
1648                     for( int k = 0; k < 48; k++ )
1649                         printf( "%02x%s", dstc[i][j*64+k], (k+1)&3 ? "" : " " );
1650                     printf( "\n" );
1651                     for( int k = 0; k < 48; k++ )
1652                         printf( "%02x%s", dsta[i][j*64+k], (k+1)&3 ? "" : " " );
1653                     printf( "\n" );
1654                     break;
1655                 }
1656         report( "hpel filter :" );
1657     }
1658
1659     if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
1660     {
1661         pixel *dstc[4] = { pbuf3, pbuf3+1024, pbuf3+2048, pbuf3+3072 };
1662         pixel *dsta[4] = { pbuf4, pbuf4+1024, pbuf4+2048, pbuf4+3072 };
1663         set_func_name( "lowres_init" );
1664         ok = 1; used_asm = 1;
1665         for( int w = 96; w <= 96+24; w += 8 )
1666         {
1667             intptr_t stride = (w*2+31)&~31;
1668             intptr_t stride_lowres = (w+31)&~31;
1669             call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], stride, stride_lowres, w, 8 );
1670             call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], stride, stride_lowres, w, 8 );
1671             for( int i = 0; i < 8; i++ )
1672             {
1673                 for( int j = 0; j < 4; j++ )
1674                     if( memcmp( dstc[j]+i*stride_lowres, dsta[j]+i*stride_lowres, w * sizeof(pixel) ) )
1675                     {
1676                         ok = 0;
1677                         fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
1678                         for( int k = 0; k < w; k++ )
1679                             printf( "%d ", dstc[j][k+i*stride_lowres] );
1680                         printf( "\n" );
1681                         for( int k = 0; k < w; k++ )
1682                             printf( "%d ", dsta[j][k+i*stride_lowres] );
1683                         printf( "\n" );
1684                         break;
1685                     }
1686             }
1687         }
1688         report( "lowres init :" );
1689     }
1690
1691 #define INTEGRAL_INIT( name, size, offset, cmp_len, ... )\
1692     if( mc_a.name != mc_ref.name )\
1693     {\
1694         intptr_t stride = 96;\
1695         set_func_name( #name );\
1696         used_asm = 1;\
1697         memcpy( buf3, buf1, size*2*stride );\
1698         memcpy( buf4, buf1, size*2*stride );\
1699         uint16_t *sum = (uint16_t*)buf3;\
1700         call_c1( mc_c.name, sum+offset, __VA_ARGS__ );\
1701         sum = (uint16_t*)buf4;\
1702         call_a1( mc_a.name, sum+offset, __VA_ARGS__ );\
1703         if( memcmp( buf3+2*offset, buf4+2*offset, cmp_len*2 )\
1704             || (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\
1705             ok = 0;\
1706         call_c2( mc_c.name, sum+offset, __VA_ARGS__ );\
1707         call_a2( mc_a.name, sum+offset, __VA_ARGS__ );\
1708     }
1709     ok = 1; used_asm = 0;
1710     INTEGRAL_INIT( integral_init4h, 2, stride, stride-4, pbuf2, stride );
1711     INTEGRAL_INIT( integral_init8h, 2, stride, stride-8, pbuf2, stride );
1712     INTEGRAL_INIT( integral_init4v, 14, 0, stride-8, sum+9*stride, stride );
1713     INTEGRAL_INIT( integral_init8v, 9, 0, stride-8, stride );
1714     report( "integral init :" );
1715
1716     ok = 1; used_asm = 0;
1717     if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
1718     {
1719         used_asm = 1;
1720         x264_emms();
1721         for( int i = 0; i < 10; i++ )
1722         {
1723             float fps_factor = (rand()&65535) / 65535.0f;
1724             set_func_name( "mbtree_propagate_cost" );
1725             int16_t *dsta = (int16_t*)buf3;
1726             int16_t *dstc = dsta+400;
1727             uint16_t *prop = (uint16_t*)buf1;
1728             uint16_t *intra = (uint16_t*)buf4;
1729             uint16_t *inter = intra+128;
1730             uint16_t *qscale = inter+128;
1731             uint16_t *rnd = (uint16_t*)buf2;
1732             x264_emms();
1733             for( int j = 0; j < 100; j++ )
1734             {
1735                 intra[j]  = *rnd++ & 0x7fff;
1736                 intra[j] += !intra[j];
1737                 inter[j]  = *rnd++ & 0x7fff;
1738                 qscale[j] = *rnd++ & 0x7fff;
1739             }
1740             call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, &fps_factor, 100 );
1741             call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
1742             // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
1743             x264_emms();
1744             for( int j = 0; j < 100 && ok; j++ )
1745             {
1746                 ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
1747                 if( !ok )
1748                     fprintf( stderr, "mbtree_propagate_cost FAILED: %d !~= %d\n", dstc[j], dsta[j] );
1749             }
1750         }
1751     }
1752
1753     if( mc_a.mbtree_propagate_list != mc_ref.mbtree_propagate_list )
1754     {
1755         used_asm = 1;
1756         for( int i = 0; i < 8; i++ )
1757         {
1758             set_func_name( "mbtree_propagate_list" );
1759             x264_t h;
1760             int height = 4;
1761             int width = 128;
1762             int size = width*height;
1763             h.mb.i_mb_stride = width;
1764             h.mb.i_mb_width = width;
1765             h.mb.i_mb_height = height;
1766
1767             uint16_t *ref_costsc = (uint16_t*)buf3 + width;
1768             uint16_t *ref_costsa = (uint16_t*)buf4 + width;
1769             int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + width + size);
1770             int16_t *propagate_amount = (int16_t*)(mvs + width);
1771             uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width);
1772             h.scratch_buffer2 = (uint8_t*)(ref_costsa + width + size);
1773             int bipred_weight = (rand()%63)+1;
1774             int mb_y = rand()&3;
1775             int list = i&1;
1776             for( int j = -width; j < size+width; j++ )
1777                 ref_costsc[j] = ref_costsa[j] = rand()&32767;
1778             for( int j = 0; j < width; j++ )
1779             {
1780                 static const uint8_t list_dist[2][8] = {{0,1,1,1,1,1,1,1},{1,1,3,3,3,3,3,2}};
1781                 for( int k = 0; k < 2; k++ )
1782                     mvs[j][k] = (rand()&127) - 64;
1783                 propagate_amount[j] = rand()&32767;
1784                 lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT;
1785             }
1786
1787             call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
1788             call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
1789
1790             for( int j = -width; j < size+width && ok; j++ )
1791             {
1792                 ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1;
1793                 if( !ok )
1794                     fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] );
1795             }
1796
1797             call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
1798             call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
1799         }
1800     }
1801
1802     static const uint16_t mbtree_fix8_counts[] = { 5, 384, 392, 400, 415 };
1803
1804     if( mc_a.mbtree_fix8_pack != mc_ref.mbtree_fix8_pack )
1805     {
1806         set_func_name( "mbtree_fix8_pack" );
1807         used_asm = 1;
1808         float *fix8_src = (float*)(buf3 + 0x800);
1809         uint16_t *dstc = (uint16_t*)buf3;
1810         uint16_t *dsta = (uint16_t*)buf4;
1811         for( int i = 0; i < ARRAY_ELEMS(mbtree_fix8_counts); i++ )
1812         {
1813             int count = mbtree_fix8_counts[i];
1814
1815             for( int j = 0; j < count; j++ )
1816                 fix8_src[j] = (int16_t)(rand()) / 256.0f;
1817             dsta[count] = 0xAAAA;
1818
1819             call_c( mc_c.mbtree_fix8_pack, dstc, fix8_src, count );
1820             call_a( mc_a.mbtree_fix8_pack, dsta, fix8_src, count );
1821
1822             if( memcmp( dsta, dstc, count * sizeof(uint16_t) ) || dsta[count] != 0xAAAA )
1823             {
1824                 ok = 0;
1825                 fprintf( stderr, "mbtree_fix8_pack FAILED\n" );
1826                 break;
1827             }
1828         }
1829     }
1830
1831     if( mc_a.mbtree_fix8_unpack != mc_ref.mbtree_fix8_unpack )
1832     {
1833         set_func_name( "mbtree_fix8_unpack" );
1834         used_asm = 1;
1835         uint16_t *fix8_src = (uint16_t*)(buf3 + 0x800);
1836         float *dstc = (float*)buf3;
1837         float *dsta = (float*)buf4;
1838         for( int i = 0; i < ARRAY_ELEMS(mbtree_fix8_counts); i++ )
1839         {
1840             int count = mbtree_fix8_counts[i];
1841
1842             for( int j = 0; j < count; j++ )
1843                 fix8_src[j] = rand();
1844             M32( &dsta[count] ) = 0xAAAAAAAA;
1845
1846             call_c( mc_c.mbtree_fix8_unpack, dstc, fix8_src, count );
1847             call_a( mc_a.mbtree_fix8_unpack, dsta, fix8_src, count );
1848
1849             if( memcmp( dsta, dstc, count * sizeof(float) ) || M32( &dsta[count] ) != 0xAAAAAAAA )
1850             {
1851                 ok = 0;
1852                 fprintf( stderr, "mbtree_fix8_unpack FAILED\n" );
1853                 break;
1854             }
1855         }
1856     }
1857     report( "mbtree :" );
1858
1859     if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
1860     {
1861         set_func_name( "memcpy_aligned" );
1862         ok = 1; used_asm = 1;
1863         for( size_t size = 16; size < 512; size += 16 )
1864         {
1865             for( int i = 0; i < size; i++ )
1866                 buf1[i] = rand();
1867             memset( buf4-1, 0xAA, size + 2 );
1868             call_c( mc_c.memcpy_aligned, buf3, buf1, size );
1869             call_a( mc_a.memcpy_aligned, buf4, buf1, size );
1870             if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA )
1871             {
1872                 ok = 0;
1873                 fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size );
1874                 break;
1875             }
1876         }
1877         report( "memcpy aligned :" );
1878     }
1879
1880     if( mc_a.memzero_aligned != mc_ref.memzero_aligned )
1881     {
1882         set_func_name( "memzero_aligned" );
1883         ok = 1; used_asm = 1;
1884         for( size_t size = 128; size < 1024; size += 128 )
1885         {
1886             memset( buf4-1, 0xAA, size + 2 );
1887             call_c( mc_c.memzero_aligned, buf3, size );
1888             call_a( mc_a.memzero_aligned, buf4, size );
1889             if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA )
1890             {
1891                 ok = 0;
1892                 fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size );
1893                 break;
1894             }
1895         }
1896         report( "memzero aligned :" );
1897     }
1898
1899     return ret;
1900 }
1901
1902 static int check_deblock( int cpu_ref, int cpu_new )
1903 {
1904     x264_deblock_function_t db_c;
1905     x264_deblock_function_t db_ref;
1906     x264_deblock_function_t db_a;
1907     int ret = 0, ok = 1, used_asm = 0;
1908     int alphas[36], betas[36];
1909     int8_t tcs[36][4];
1910
1911     x264_deblock_init( 0, &db_c, 0 );
1912     x264_deblock_init( cpu_ref, &db_ref, 0 );
1913     x264_deblock_init( cpu_new, &db_a, 0 );
1914
1915     /* not exactly the real values of a,b,tc but close enough */
1916     for( int i = 35, a = 255, c = 250; i >= 0; i-- )
1917     {
1918         alphas[i] = a << (BIT_DEPTH-8);
1919         betas[i] = (i+1)/2 << (BIT_DEPTH-8);
1920         tcs[i][0] = tcs[i][3] = (c+6)/10 << (BIT_DEPTH-8);
1921         tcs[i][1] = (c+7)/15 << (BIT_DEPTH-8);
1922         tcs[i][2] = (c+9)/20 << (BIT_DEPTH-8);
1923         a = a*9/10;
1924         c = c*9/10;
1925     }
1926
1927 #define TEST_DEBLOCK( name, align, ... ) \
1928     for( int i = 0; i < 36; i++ ) \
1929     { \
1930         intptr_t off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
1931         for( int j = 0; j < 1024; j++ ) \
1932             /* two distributions of random to excersize different failure modes */ \
1933             pbuf3[j] = rand() & (i&1 ? 0xf : PIXEL_MAX ); \
1934         memcpy( pbuf4, pbuf3, 1024 * sizeof(pixel) ); \
1935         if( db_a.name != db_ref.name ) \
1936         { \
1937             set_func_name( #name ); \
1938             used_asm = 1; \
1939             call_c1( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
1940             call_a1( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
1941             if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
1942             { \
1943                 ok = 0; \
1944                 fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
1945                 break; \
1946             } \
1947             call_c2( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
1948             call_a2( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
1949         } \
1950     }
1951
1952     TEST_DEBLOCK( deblock_luma[0], 0, tcs[i] );
1953     TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] );
1954     TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] );
1955     TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] );
1956     TEST_DEBLOCK( deblock_chroma_420_mbaff, 0, tcs[i] );
1957     TEST_DEBLOCK( deblock_chroma_422_mbaff, 0, tcs[i] );
1958     TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] );
1959     TEST_DEBLOCK( deblock_luma_intra[0], 0 );
1960     TEST_DEBLOCK( deblock_luma_intra[1], 1 );
1961     TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 );
1962     TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 );
1963     TEST_DEBLOCK( deblock_chroma_420_intra_mbaff, 0 );
1964     TEST_DEBLOCK( deblock_chroma_422_intra_mbaff, 0 );
1965     TEST_DEBLOCK( deblock_chroma_intra[1], 1 );
1966
1967     if( db_a.deblock_strength != db_ref.deblock_strength )
1968     {
1969         set_func_name( "deblock_strength" );
1970         used_asm = 1;
1971         for( int i = 0; i < 100; i++ )
1972         {
1973             ALIGNED_ARRAY_16( uint8_t, nnz_buf, [X264_SCAN8_SIZE+8] );
1974             uint8_t *nnz = &nnz_buf[8];
1975             ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
1976             ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
1977             ALIGNED_ARRAY_32( uint8_t, bs, [2],[2][8][4] );
1978             memset( bs, 99, sizeof(uint8_t)*2*4*8*2 );
1979             for( int j = 0; j < X264_SCAN8_SIZE; j++ )
1980                 nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
1981             for( int j = 0; j < 2; j++ )
1982                 for( int k = 0; k < X264_SCAN8_LUMA_SIZE; k++ )
1983                 {
1984                     ref[j][k] = ((rand()&3) != 3) ? 0 : (rand() & 31) - 2;
1985                     for( int l = 0; l < 2; l++ )
1986                         mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&16383) - 8192;
1987                 }
1988             call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
1989             call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
1990             if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) )
1991             {
1992                 ok = 0;
1993                 fprintf( stderr, "deblock_strength: [FAILED]\n" );
1994                 for( int j = 0; j < 2; j++ )
1995                 {
1996                     for( int k = 0; k < 2; k++ )
1997                         for( int l = 0; l < 4; l++ )
1998                         {
1999                             for( int m = 0; m < 4; m++ )
2000                                 printf("%d ",bs[j][k][l][m]);
2001                             printf("\n");
2002                         }
2003                     printf("\n");
2004                 }
2005                 break;
2006             }
2007         }
2008     }
2009
2010     report( "deblock :" );
2011
2012     return ret;
2013 }
2014
2015 static int check_quant( int cpu_ref, int cpu_new )
2016 {
2017     x264_quant_function_t qf_c;
2018     x264_quant_function_t qf_ref;
2019     x264_quant_function_t qf_a;
2020     ALIGNED_ARRAY_64( dctcoef, dct1,[64] );
2021     ALIGNED_ARRAY_64( dctcoef, dct2,[64] );
2022     ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
2023     ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
2024     ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] );
2025     int ret = 0, ok, used_asm;
2026     int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
2027     x264_t h_buf;
2028     x264_t *h = &h_buf;
2029     memset( h, 0, sizeof(*h) );
2030     h->sps->i_chroma_format_idc = 1;
2031     x264_param_default( &h->param );
2032     h->chroma_qp_table = i_chroma_qp_table + 12;
2033     h->param.analyse.b_transform_8x8 = 1;
2034
2035     for( int i_cqm = 0; i_cqm < 4; i_cqm++ )
2036     {
2037         if( i_cqm == 0 )
2038         {
2039             for( int i = 0; i < 6; i++ )
2040                 h->sps->scaling_list[i] = x264_cqm_flat16;
2041             h->param.i_cqm_preset = h->sps->i_cqm_preset = X264_CQM_FLAT;
2042         }
2043         else if( i_cqm == 1 )
2044         {
2045             for( int i = 0; i < 6; i++ )
2046                 h->sps->scaling_list[i] = x264_cqm_jvt[i];
2047             h->param.i_cqm_preset = h->sps->i_cqm_preset = X264_CQM_JVT;
2048         }
2049         else
2050         {
2051             int max_scale = BIT_DEPTH < 10 ? 255 : 228;
2052             if( i_cqm == 2 )
2053                 for( int i = 0; i < 64; i++ )
2054                     cqm_buf[i] = 10 + rand() % (max_scale - 9);
2055             else
2056                 for( int i = 0; i < 64; i++ )
2057                     cqm_buf[i] = 1;
2058             for( int i = 0; i < 6; i++ )
2059                 h->sps->scaling_list[i] = cqm_buf;
2060             h->param.i_cqm_preset = h->sps->i_cqm_preset = X264_CQM_CUSTOM;
2061         }
2062
2063         h->param.rc.i_qp_min = 0;
2064         h->param.rc.i_qp_max = QP_MAX_SPEC;
2065         x264_cqm_init( h );
2066         x264_quant_init( h, 0, &qf_c );
2067         x264_quant_init( h, cpu_ref, &qf_ref );
2068         x264_quant_init( h, cpu_new, &qf_a );
2069
2070 #define INIT_QUANT8(j,max) \
2071         { \
2072             static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
2073             for( int i = 0; i < max; i++ ) \
2074             { \
2075                 unsigned int scale = (255*scale1d[(i>>3)&7]*scale1d[i&7])/16; \
2076                 dct1[i] = dct2[i] = (j>>(i>>6))&1 ? (rand()%(2*scale+1))-scale : 0; \
2077             } \
2078         }
2079
2080 #define INIT_QUANT4(j,max) \
2081         { \
2082             static const int scale1d[4] = {4,6,4,6}; \
2083             for( int i = 0; i < max; i++ ) \
2084             { \
2085                 unsigned int scale = 255*scale1d[(i>>2)&3]*scale1d[i&3]; \
2086                 dct1[i] = dct2[i] = (j>>(i>>4))&1 ? (rand()%(2*scale+1))-scale : 0; \
2087             } \
2088         }
2089
2090 #define TEST_QUANT_DC( name, cqm ) \
2091         if( qf_a.name != qf_ref.name ) \
2092         { \
2093             set_func_name( #name ); \
2094             used_asms[0] = 1; \
2095             for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
2096             { \
2097                 for( int j = 0; j < 2; j++ ) \
2098                 { \
2099                     int result_c, result_a; \
2100                     for( int i = 0; i < 16; i++ ) \
2101                         dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
2102                     result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
2103                     result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
2104                     if( memcmp( dct1, dct2, 16*sizeof(dctcoef) ) || result_c != result_a ) \
2105                     { \
2106                         oks[0] = 0; \
2107                         fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
2108                         break; \
2109                     } \
2110                     call_c2( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
2111                     call_a2( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
2112                 } \
2113             } \
2114         }
2115
2116 #define TEST_QUANT( qname, block, type, w, maxj ) \
2117         if( qf_a.qname != qf_ref.qname ) \
2118         { \
2119             set_func_name( #qname ); \
2120             used_asms[0] = 1; \
2121             for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
2122             { \
2123                 for( int j = 0; j < maxj; j++ ) \
2124                 { \
2125                     INIT_QUANT##type(j, w*w) \
2126                     int result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
2127                     int result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
2128                     if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) || result_c != result_a ) \
2129                     { \
2130                         oks[0] = 0; \
2131                         fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
2132                         break; \
2133                     } \
2134                     call_c2( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
2135                     call_a2( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
2136                 } \
2137             } \
2138         }
2139
2140         TEST_QUANT( quant_8x8, CQM_8IY, 8, 8, 2 );
2141         TEST_QUANT( quant_8x8, CQM_8PY, 8, 8, 2 );
2142         TEST_QUANT( quant_4x4, CQM_4IY, 4, 4, 2 );
2143         TEST_QUANT( quant_4x4, CQM_4PY, 4, 4, 2 );
2144         TEST_QUANT( quant_4x4x4, CQM_4IY, 4, 8, 16 );
2145         TEST_QUANT( quant_4x4x4, CQM_4PY, 4, 8, 16 );
2146         TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] );
2147         TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] );
2148
2149 #define TEST_DEQUANT( qname, dqname, block, w ) \
2150         if( qf_a.dqname != qf_ref.dqname ) \
2151         { \
2152             set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
2153             used_asms[1] = 1; \
2154             for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
2155             { \
2156                 INIT_QUANT##w(1, w*w) \
2157                 qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
2158                 memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
2159                 call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
2160                 call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
2161                 if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
2162                 { \
2163                     oks[1] = 0; \
2164                     fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
2165                     break; \
2166                 } \
2167                 call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
2168                 call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
2169             } \
2170         }
2171
2172         TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8IY, 8 );
2173         TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8PY, 8 );
2174         TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 );
2175         TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 );
2176
2177 #define TEST_DEQUANT_DC( qname, dqname, block, w ) \
2178         if( qf_a.dqname != qf_ref.dqname ) \
2179         { \
2180             set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
2181             used_asms[1] = 1; \
2182             for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
2183             { \
2184                 for( int i = 0; i < 16; i++ ) \
2185                     dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \
2186                 qf_c.qname( dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
2187                 memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
2188                 call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
2189                 call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
2190                 if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
2191                 { \
2192                     oks[1] = 0; \
2193                     fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
2194                 } \
2195                 call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
2196                 call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
2197             } \
2198         }
2199
2200         TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 );
2201
2202         if( qf_a.idct_dequant_2x4_dc != qf_ref.idct_dequant_2x4_dc )
2203         {
2204             set_func_name( "idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" );
2205             used_asms[1] = 1;
2206             for( int qp = h->chroma_qp_table[h->param.rc.i_qp_max]; qp >= h->chroma_qp_table[h->param.rc.i_qp_min]; qp-- )
2207             {
2208                 for( int i = 0; i < 8; i++ )
2209                     dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
2210                 qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
2211                 qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
2212                 call_c( qf_c.idct_dequant_2x4_dc, dct1, dct3, h->dequant4_mf[CQM_4IC], qp+3 );
2213                 call_a( qf_a.idct_dequant_2x4_dc, dct1, dct4, h->dequant4_mf[CQM_4IC], qp+3 );
2214                 for( int i = 0; i < 8; i++ )
2215                     if( dct3[i][0] != dct4[i][0] )
2216                     {
2217                         oks[1] = 0;
2218                         fprintf( stderr, "idct_dequant_2x4_dc (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
2219                         break;
2220                     }
2221             }
2222         }
2223
2224         if( qf_a.idct_dequant_2x4_dconly != qf_ref.idct_dequant_2x4_dconly )
2225         {
2226             set_func_name( "idct_dequant_2x4_dconly_%s", i_cqm?"cqm":"flat" );
2227             used_asms[1] = 1;
2228             for( int qp = h->chroma_qp_table[h->param.rc.i_qp_max]; qp >= h->chroma_qp_table[h->param.rc.i_qp_min]; qp-- )
2229             {
2230                 for( int i = 0; i < 8; i++ )
2231                     dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
2232                 qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
2233                 qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
2234                 memcpy( dct2, dct1, 8*sizeof(dctcoef) );
2235                 call_c1( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
2236                 call_a1( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
2237                 if( memcmp( dct1, dct2, 8*sizeof(dctcoef) ) )
2238                 {
2239                     oks[1] = 0;
2240                     fprintf( stderr, "idct_dequant_2x4_dconly (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
2241                     break;
2242                 }
2243                 call_c2( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
2244                 call_a2( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
2245             }
2246         }
2247
2248 #define TEST_OPTIMIZE_CHROMA_DC( optname, size ) \
2249         if( qf_a.optname != qf_ref.optname ) \
2250         { \
2251             set_func_name( #optname ); \
2252             used_asms[2] = 1; \
2253             for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
2254             { \
2255                 int qpdc = qp + (size == 8 ? 3 : 0); \
2256                 int dmf = h->dequant4_mf[CQM_4IC][qpdc%6][0] << qpdc/6; \
2257                 if( dmf > 32*64 ) \
2258                     continue; \
2259                 for( int i = 16;; i <<= 1 ) \
2260                 { \
2261                     int res_c, res_asm; \
2262                     int max = X264_MIN( i, PIXEL_MAX*16 ); \
2263                     for( int j = 0; j < size; j++ ) \
2264                         dct1[j] = rand()%(max*2+1) - max; \
2265                     for( int j = 0; j <= size; j += 4 ) \
2266                         qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \
2267                     memcpy( dct2, dct1, size*sizeof(dctcoef) ); \
2268                     res_c   = call_c1( qf_c.optname, dct1, dmf ); \
2269                     res_asm = call_a1( qf_a.optname, dct2, dmf ); \
2270                     if( res_c != res_asm || memcmp( dct1, dct2, size*sizeof(dctcoef) ) ) \
2271                     { \
2272                         oks[2] = 0; \
2273                         fprintf( stderr, #optname "(qp=%d, res_c=%d, res_asm=%d): [FAILED]\n", qp, res_c, res_asm ); \
2274                     } \
2275                     call_c2( qf_c.optname, dct1, dmf ); \
2276                     call_a2( qf_a.optname, dct2, dmf ); \
2277                     if( i >= PIXEL_MAX*16 ) \
2278                         break; \
2279                 } \
2280             } \
2281         }
2282
2283         TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x2_dc, 4 );
2284         TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x4_dc, 8 );
2285
2286         x264_cqm_delete( h );
2287     }
2288
2289     ok = oks[0]; used_asm = used_asms[0];
2290     report( "quant :" );
2291
2292     ok = oks[1]; used_asm = used_asms[1];
2293     report( "dequant :" );
2294
2295     ok = oks[2]; used_asm = used_asms[2];
2296     report( "optimize chroma dc :" );
2297
2298     ok = 1; used_asm = 0;
2299     if( qf_a.denoise_dct != qf_ref.denoise_dct )
2300     {
2301         used_asm = 1;
2302         for( int size = 16; size <= 64; size += 48 )
2303         {
2304             set_func_name( "denoise_dct" );
2305             memcpy( dct1, buf1, size*sizeof(dctcoef) );
2306             memcpy( dct2, buf1, size*sizeof(dctcoef) );
2307             memcpy( buf3+256, buf3, 256 );
2308             call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3,       (udctcoef*)buf2, size );
2309             call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
2310             if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
2311                 ok = 0;
2312             call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3,       (udctcoef*)buf2, size );
2313             call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
2314         }
2315     }
2316     report( "denoise dct :" );
2317
2318 #define TEST_DECIMATE( decname, w, ac, thresh ) \
2319     if( qf_a.decname != qf_ref.decname ) \
2320     { \
2321         set_func_name( #decname ); \
2322         used_asm = 1; \
2323         for( int i = 0; i < 100; i++ ) \
2324         { \
2325             static const int distrib[16] = {1,1,1,1,1,1,1,1,1,1,1,1,2,3,4};\
2326             static const int zerorate_lut[4] = {3,7,15,31};\
2327             int zero_rate = zerorate_lut[i&3];\
2328             for( int idx = 0; idx < w*w; idx++ ) \
2329             { \
2330                 int sign = (rand()&1) ? -1 : 1; \
2331                 int abs_level = distrib[rand()&15]; \
2332                 if( abs_level == 4 ) abs_level = rand()&0x3fff; \
2333                 int zero = !(rand()&zero_rate); \
2334                 dct1[idx] = zero * abs_level * sign; \
2335             } \
2336             if( ac ) \
2337                 dct1[0] = 0; \
2338             int result_c = call_c( qf_c.decname, dct1 ); \
2339             int result_a = call_a( qf_a.decname, dct1 ); \
2340             if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
2341             { \
2342                 ok = 0; \
2343                 fprintf( stderr, #decname ": [FAILED]\n" ); \
2344                 break; \
2345             } \
2346         } \
2347     }
2348
2349     ok = 1; used_asm = 0;
2350     TEST_DECIMATE( decimate_score64, 8, 0, 6 );
2351     TEST_DECIMATE( decimate_score16, 4, 0, 6 );
2352     TEST_DECIMATE( decimate_score15, 4, 1, 7 );
2353     report( "decimate_score :" );
2354
2355 #define TEST_LAST( last, lastname, size, ac ) \
2356     if( qf_a.last != qf_ref.last ) \
2357     { \
2358         set_func_name( #lastname ); \
2359         used_asm = 1; \
2360         for( int i = 0; i < 100; i++ ) \
2361         { \
2362             int nnz = 0; \
2363             int max = rand() & (size-1); \
2364             memset( dct1, 0, size*sizeof(dctcoef) ); \
2365             for( int idx = ac; idx < max; idx++ ) \
2366                 nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
2367             if( !nnz ) \
2368                 dct1[ac] = 1; \
2369             int result_c = call_c( qf_c.last, dct1+ac ); \
2370             int result_a = call_a( qf_a.last, dct1+ac ); \
2371             if( result_c != result_a ) \
2372             { \
2373                 ok = 0; \
2374                 fprintf( stderr, #lastname ": [FAILED]\n" ); \
2375                 break; \
2376             } \
2377         } \
2378     }
2379
2380     ok = 1; used_asm = 0;
2381     TEST_LAST( coeff_last4              , coeff_last4,   4, 0 );
2382     TEST_LAST( coeff_last8              , coeff_last8,   8, 0 );
2383     TEST_LAST( coeff_last[  DCT_LUMA_AC], coeff_last15, 16, 1 );
2384     TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 16, 0 );
2385     TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 64, 0 );
2386     report( "coeff_last :" );
2387
2388 #define TEST_LEVELRUN( lastname, name, size, ac ) \
2389     if( qf_a.lastname != qf_ref.lastname ) \
2390     { \
2391         set_func_name( #name ); \
2392         used_asm = 1; \
2393         for( int i = 0; i < 100; i++ ) \
2394         { \
2395             x264_run_level_t runlevel_c, runlevel_a; \
2396             int nnz = 0; \
2397             int max = rand() & (size-1); \
2398             memset( dct1, 0, size*sizeof(dctcoef) ); \
2399             memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
2400             memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
2401             for( int idx = ac; idx < max; idx++ ) \
2402                 nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
2403             if( !nnz ) \
2404                 dct1[ac] = 1; \
2405             int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
2406             int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
2407             if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
2408                 runlevel_c.mask != runlevel_a.mask || \
2409                 memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c)) \
2410             { \
2411                 ok = 0; \
2412                 fprintf( stderr, #name ": [FAILED]\n" ); \
2413                 break; \
2414             } \
2415         } \
2416     }
2417
2418     ok = 1; used_asm = 0;
2419     TEST_LEVELRUN( coeff_level_run4              , coeff_level_run4,   4, 0 );
2420     TEST_LEVELRUN( coeff_level_run8              , coeff_level_run8,   8, 0 );
2421     TEST_LEVELRUN( coeff_level_run[  DCT_LUMA_AC], coeff_level_run15, 16, 1 );
2422     TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 16, 0 );
2423     report( "coeff_level_run :" );
2424
2425     return ret;
2426 }
2427
2428 static int check_intra( int cpu_ref, int cpu_new )
2429 {
2430     int ret = 0, ok = 1, used_asm = 0;
2431     ALIGNED_ARRAY_32( pixel, edge,[36] );
2432     ALIGNED_ARRAY_32( pixel, edge2,[36] );
2433     ALIGNED_ARRAY_32( pixel, fdec,[FDEC_STRIDE*20] );
2434     struct
2435     {
2436         x264_predict_t      predict_16x16[4+3];
2437         x264_predict_t      predict_8x8c[4+3];
2438         x264_predict_t      predict_8x16c[4+3];
2439         x264_predict8x8_t   predict_8x8[9+3];
2440         x264_predict_t      predict_4x4[9+3];
2441         x264_predict_8x8_filter_t predict_8x8_filter;
2442     } ip_c, ip_ref, ip_a;
2443
2444     x264_predict_16x16_init( 0, ip_c.predict_16x16 );
2445     x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
2446     x264_predict_8x16c_init( 0, ip_c.predict_8x16c );
2447     x264_predict_8x8_init( 0, ip_c.predict_8x8, &ip_c.predict_8x8_filter );
2448     x264_predict_4x4_init( 0, ip_c.predict_4x4 );
2449
2450     x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
2451     x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
2452     x264_predict_8x16c_init( cpu_ref, ip_ref.predict_8x16c );
2453     x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8, &ip_ref.predict_8x8_filter );
2454     x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );
2455
2456     x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
2457     x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
2458     x264_predict_8x16c_init( cpu_new, ip_a.predict_8x16c );
2459     x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
2460     x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
2461
2462     memcpy( fdec, pbuf1, 32*20 * sizeof(pixel) );\
2463
2464     ip_c.predict_8x8_filter( fdec+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
2465
2466 #define INTRA_TEST( name, dir, w, h, align, bench, ... )\
2467     if( ip_a.name[dir] != ip_ref.name[dir] )\
2468     {\
2469         set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
2470         used_asm = 1;\
2471         memcpy( pbuf3, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
2472         memcpy( pbuf4, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
2473         for( int a = 0; a < (do_bench ? 64/sizeof(pixel) : 1); a += align )\
2474         {\
2475             call_c##bench( ip_c.name[dir], pbuf3+48+a, ##__VA_ARGS__ );\
2476             call_a##bench( ip_a.name[dir], pbuf4+48+a, ##__VA_ARGS__ );\
2477             if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*20 * sizeof(pixel) ) )\
2478             {\
2479                 fprintf( stderr, #name "[%d] :  [FAILED]\n", dir );\
2480                 ok = 0;\
2481                 if( ip_c.name == (void *)ip_c.predict_8x8 )\
2482                 {\
2483                     for( int k = -1; k < 16; k++ )\
2484                         printf( "%2x ", edge[16+k] );\
2485                     printf( "\n" );\
2486                 }\
2487                 for( int j = 0; j < h; j++ )\
2488                 {\
2489                     if( ip_c.name == (void *)ip_c.predict_8x8 )\
2490                         printf( "%2x ", edge[14-j] );\
2491                     for( int k = 0; k < w; k++ )\
2492                         printf( "%2x ", pbuf4[48+k+j*FDEC_STRIDE] );\
2493                     printf( "\n" );\
2494                 }\
2495                 printf( "\n" );\
2496                 for( int j = 0; j < h; j++ )\
2497                 {\
2498                     if( ip_c.name == (void *)ip_c.predict_8x8 )\
2499                         printf( "   " );\
2500                     for( int k = 0; k < w; k++ )\
2501                         printf( "%2x ", pbuf3[48+k+j*FDEC_STRIDE] );\
2502                     printf( "\n" );\
2503                 }\
2504                 break;\
2505             }\
2506         }\
2507     }
2508
2509     for( int i = 0; i < 12; i++ )
2510         INTRA_TEST(   predict_4x4, i,  4,  4,  4, );
2511     for( int i = 0; i < 7; i++ )
2512         INTRA_TEST(  predict_8x8c, i,  8,  8, 16, );
2513     for( int i = 0; i < 7; i++ )
2514         INTRA_TEST( predict_8x16c, i,  8, 16, 16, );
2515     for( int i = 0; i < 7; i++ )
2516         INTRA_TEST( predict_16x16, i, 16, 16, 16, );
2517     for( int i = 0; i < 12; i++ )
2518         INTRA_TEST(   predict_8x8, i,  8,  8,  8, , edge );
2519
2520     set_func_name("intra_predict_8x8_filter");
2521     if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
2522     {
2523         used_asm = 1;
2524         for( int i = 0; i < 32; i++ )
2525         {
2526             if( !(i&7) || ((i&MB_TOPRIGHT) && !(i&MB_TOP)) )
2527                 continue;
2528             int neighbor = (i&24)>>1;
2529             memset( edge,  0, 36*sizeof(pixel) );
2530             memset( edge2, 0, 36*sizeof(pixel) );
2531             call_c( ip_c.predict_8x8_filter, pbuf1+48, edge,  neighbor, i&7 );
2532             call_a( ip_a.predict_8x8_filter, pbuf1+48, edge2, neighbor, i&7 );
2533             if( !(neighbor&MB_TOPLEFT) )
2534                 edge[15] = edge2[15] = 0;
2535             if( memcmp( edge+7, edge2+7, (i&MB_TOPRIGHT ? 26 : i&MB_TOP ? 17 : 8) * sizeof(pixel) ) )
2536             {
2537                 fprintf( stderr, "predict_8x8_filter :  [FAILED] %d %d\n", (i&24)>>1, i&7);
2538                 ok = 0;
2539             }
2540         }
2541     }
2542
2543 #define EXTREMAL_PLANE( w, h ) \
2544     { \
2545         int max[7]; \
2546         for( int j = 0; j < 7; j++ ) \
2547             max[j] = test ? rand()&PIXEL_MAX : PIXEL_MAX; \
2548         fdec[48-1-FDEC_STRIDE] = (i&1)*max[0]; \
2549         for( int j = 0; j < w/2; j++ ) \
2550             fdec[48+j-FDEC_STRIDE] = (!!(i&2))*max[1]; \
2551         for( int j = w/2; j < w-1; j++ ) \
2552             fdec[48+j-FDEC_STRIDE] = (!!(i&4))*max[2]; \
2553         fdec[48+(w-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
2554         for( int j = 0; j < h/2; j++ ) \
2555             fdec[48+j*FDEC_STRIDE-1] = (!!(i&16))*max[4]; \
2556         for( int j = h/2; j < h-1; j++ ) \
2557             fdec[48+j*FDEC_STRIDE-1] = (!!(i&32))*max[5]; \
2558         fdec[48+(h-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
2559     }
2560     /* Extremal test case for planar prediction. */
2561     for( int test = 0; test < 100 && ok; test++ )
2562         for( int i = 0; i < 128 && ok; i++ )
2563         {
2564             EXTREMAL_PLANE(  8,  8 );
2565             INTRA_TEST(  predict_8x8c, I_PRED_CHROMA_P,  8,  8, 64, 1 );
2566             EXTREMAL_PLANE(  8, 16 );
2567             INTRA_TEST( predict_8x16c, I_PRED_CHROMA_P,  8, 16, 64, 1 );
2568             EXTREMAL_PLANE( 16, 16 );
2569             INTRA_TEST( predict_16x16,  I_PRED_16x16_P, 16, 16, 64, 1 );
2570         }
2571     report( "intra pred :" );
2572     return ret;
2573 }
2574
2575 #define DECL_CABAC(cpu) \
2576 static void run_cabac_decision_##cpu( x264_t *h, uint8_t *dst )\
2577 {\
2578     x264_cabac_t cb;\
2579     x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
2580     x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
2581     for( int i = 0; i < 0x1000; i++ )\
2582         x264_cabac_encode_decision_##cpu( &cb, buf1[i]>>1, buf1[i]&1 );\
2583 }\
2584 static void run_cabac_bypass_##cpu( x264_t *h, uint8_t *dst )\
2585 {\
2586     x264_cabac_t cb;\
2587     x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
2588     x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
2589     for( int i = 0; i < 0x1000; i++ )\
2590         x264_cabac_encode_bypass_##cpu( &cb, buf1[i]&1 );\
2591 }\
2592 static void run_cabac_terminal_##cpu( x264_t *h, uint8_t *dst )\
2593 {\
2594     x264_cabac_t cb;\
2595     x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
2596     x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
2597     for( int i = 0; i < 0x1000; i++ )\
2598         x264_cabac_encode_terminal_##cpu( &cb );\
2599 }
2600 DECL_CABAC(c)
2601 #if HAVE_MMX
2602 DECL_CABAC(asm)
2603 #elif defined(ARCH_AARCH64)
2604 DECL_CABAC(asm)
2605 #else
2606 #define run_cabac_decision_asm run_cabac_decision_c
2607 #define run_cabac_bypass_asm run_cabac_bypass_c
2608 #define run_cabac_terminal_asm run_cabac_terminal_c
2609 #endif
2610
2611 extern const uint8_t x264_count_cat_m1[14];
2612
2613 static int check_cabac( int cpu_ref, int cpu_new )
2614 {
2615     int ret = 0, ok = 1, used_asm = 0;
2616     x264_t h;
2617     h.sps->i_chroma_format_idc = 3;
2618
2619     x264_bitstream_function_t bs_ref;
2620     x264_bitstream_function_t bs_a;
2621     x264_bitstream_init( cpu_ref, &bs_ref );
2622     x264_bitstream_init( cpu_new, &bs_a );
2623     x264_quant_init( &h, cpu_new, &h.quantf );
2624     h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4;
2625
2626 /* Reset cabac state to avoid buffer overruns in do_bench() with large BENCH_RUNS values. */
2627 #define GET_CB( i ) (\
2628     x264_cabac_encode_init( &cb[i], bitstream[i], bitstream[i]+0xfff0 ),\
2629     cb[i].f8_bits_encoded = 0, &cb[i] )
2630
2631 #define CABAC_RESIDUAL(name, start, end, rd)\
2632 {\
2633     if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\
2634     {\
2635         used_asm = 1;\
2636         set_func_name( #name );\
2637         for( int i = 0; i < 2; i++ )\
2638         {\
2639             for( intptr_t ctx_block_cat = start; ctx_block_cat <= end; ctx_block_cat++ )\
2640             {\
2641                 for( int j = 0; j < 256; j++ )\
2642                 {\
2643                     ALIGNED_ARRAY_64( dctcoef, dct, [2],[64] );\
2644                     uint8_t bitstream[2][1<<16];\
2645                     static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
2646                     int ac = ctx_ac[ctx_block_cat];\
2647                     int nz = 0;\
2648                     while( !nz )\
2649                     {\
2650                         for( int k = 0; k <= x264_count_cat_m1[ctx_block_cat]; k++ )\
2651                         {\
2652                             /* Very rough distribution that covers possible inputs */\
2653                             int rnd = rand();\
2654                             int coef = !(rnd&3);\
2655                             coef += !(rnd&  15) * (rand()&0x0006);\
2656                             coef += !(rnd&  63) * (rand()&0x0008);\
2657                             coef += !(rnd& 255) * (rand()&0x00F0);\
2658                             coef += !(rnd&1023) * (rand()&0x7F00);\
2659                             nz |= dct[0][ac+k] = dct[1][ac+k] = coef * ((rand()&1) ? 1 : -1);\
2660                         }\
2661                     }\
2662                     h.mb.b_interlaced = i;\
2663                     x264_cabac_t cb[2];\
2664                     x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\
2665                     x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\
2666                     if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\
2667                     call_c1( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
2668                     call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
2669                     ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\
2670                     if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
2671                     if( !ok )\
2672                     {\
2673                         fprintf( stderr, #name " :  [FAILED] ctx_block_cat %d", (int)ctx_block_cat );\
2674                         if( rd && cb[0].f8_bits_encoded != cb[1].f8_bits_encoded )\
2675                             fprintf( stderr, " (%d != %d)", cb[0].f8_bits_encoded, cb[1].f8_bits_encoded );\
2676                         fprintf( stderr, "\n");\
2677                         goto name##fail;\
2678                     }\
2679                     if( (j&15) == 0 )\
2680                     {\
2681                         call_c2( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
2682                         call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
2683                     }\
2684                 }\
2685             }\
2686         }\
2687     }\
2688 }\
2689 name##fail:
2690
2691     CABAC_RESIDUAL( cabac_block_residual, 0, DCT_LUMA_8x8, 0 )
2692     report( "cabac residual:" );
2693
2694     ok = 1; used_asm = 0;
2695     CABAC_RESIDUAL( cabac_block_residual_rd, 0, DCT_LUMA_8x8-1, 1 )
2696     CABAC_RESIDUAL( cabac_block_residual_8x8_rd, DCT_LUMA_8x8, DCT_LUMA_8x8, 1 )
2697     report( "cabac residual rd:" );
2698
2699     if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm )
2700         return ret;
2701     ok = 1; used_asm = 0;
2702     x264_cabac_init( &h );
2703
2704     set_func_name( "cabac_encode_decision" );
2705     memcpy( buf4, buf3, 0x1000 );
2706     call_c( run_cabac_decision_c, &h, buf3 );
2707     call_a( run_cabac_decision_asm, &h, buf4 );
2708     ok = !memcmp( buf3, buf4, 0x1000 );
2709     report( "cabac decision:" );
2710
2711     set_func_name( "cabac_encode_bypass" );
2712     memcpy( buf4, buf3, 0x1000 );
2713     call_c( run_cabac_bypass_c, &h, buf3 );
2714     call_a( run_cabac_bypass_asm, &h, buf4 );
2715     ok = !memcmp( buf3, buf4, 0x1000 );
2716     report( "cabac bypass:" );
2717
2718     set_func_name( "cabac_encode_terminal" );
2719     memcpy( buf4, buf3, 0x1000 );
2720     call_c( run_cabac_terminal_c, &h, buf3 );
2721     call_a( run_cabac_terminal_asm, &h, buf4 );
2722     ok = !memcmp( buf3, buf4, 0x1000 );
2723     report( "cabac terminal:" );
2724
2725     return ret;
2726 }
2727
2728 static int check_bitstream( int cpu_ref, int cpu_new )
2729 {
2730     x264_bitstream_function_t bs_c;
2731     x264_bitstream_function_t bs_ref;
2732     x264_bitstream_function_t bs_a;
2733
2734     int ret = 0, ok = 1, used_asm = 0;
2735
2736     x264_bitstream_init( 0, &bs_c );
2737     x264_bitstream_init( cpu_ref, &bs_ref );
2738     x264_bitstream_init( cpu_new, &bs_a );
2739     if( bs_a.nal_escape != bs_ref.nal_escape )
2740     {
2741         int size = 0x4000;
2742         uint8_t *input = malloc(size+100);
2743         uint8_t *output1 = malloc(size*2);
2744         uint8_t *output2 = malloc(size*2);
2745         used_asm = 1;
2746         set_func_name( "nal_escape" );
2747         for( int i = 0; i < 100; i++ )
2748         {
2749             /* Test corner-case sizes */
2750             int test_size = i < 10 ? i+1 : rand() & 0x3fff;
2751             /* Test 8 different probability distributions of zeros */
2752             for( int j = 0; j < test_size+32; j++ )
2753                 input[j] = (rand()&((1 << ((i&7)+1)) - 1)) * rand();
2754             uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
2755             uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
2756             int size_c = end_c-output1;
2757             int size_a = end_a-output2;
2758             if( size_c != size_a || memcmp( output1, output2, size_c ) )
2759             {
2760                 fprintf( stderr, "nal_escape :  [FAILED] %d %d\n", size_c, size_a );
2761                 ok = 0;
2762                 break;
2763             }
2764         }
2765         for( int j = 0; j < size+32; j++ )
2766             input[j] = rand();
2767         call_c2( bs_c.nal_escape, output1, input, input+size );
2768         call_a2( bs_a.nal_escape, output2, input, input+size );
2769         free(input);
2770         free(output1);
2771         free(output2);
2772     }
2773     report( "nal escape:" );
2774
2775     return ret;
2776 }
2777
/* Runs every check_* group for the given pair of cpu flag sets and returns
 * the sum of their results.
 *
 * The groups are called as separate statements: operands of a single `+`
 * expression may be evaluated in any order, which would leave the order of
 * the printed reports and of the rand() draws up to the compiler. */
static int check_all_funcs( int cpu_ref, int cpu_new )
{
    int ret = 0;
    ret += check_pixel( cpu_ref, cpu_new );
    ret += check_dct( cpu_ref, cpu_new );
    ret += check_mc( cpu_ref, cpu_new );
    ret += check_intra( cpu_ref, cpu_new );
    ret += check_deblock( cpu_ref, cpu_new );
    ret += check_quant( cpu_ref, cpu_new );
    ret += check_cabac( cpu_ref, cpu_new );
    ret += check_bitstream( cpu_ref, cpu_new );
    return ret;
}
2789
2790 static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
2791 {
2792     *cpu_ref = *cpu_new;
2793     *cpu_new |= flags;
2794 #if STACK_ALIGNMENT < 16
2795     *cpu_new |= X264_CPU_STACK_MOD4;
2796 #endif
2797     if( *cpu_new & X264_CPU_SSE2_IS_FAST )
2798         *cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
2799     if( !quiet )
2800         fprintf( stderr, "x264: %s\n", name );
2801     return check_all_funcs( *cpu_ref, *cpu_new );
2802 }
2803
/* Walk through the cpu feature sets supported by the host, one step at a time,
 * testing each step against the previous one via add_flags().
 *
 * Modifier flags (cache-line size, slow shuffle, slow Atom, LZCNT, FMA4) are
 * cleared again right after their step so they do not carry over into the
 * following steps.
 *
 * Returns non-zero if any step reported a failure. */
static int check_all_flags( void )
{
    int failed = 0;
    int ref_flags = 0, new_flags = 0;
    uint32_t detected = x264_cpu_detect();

#if ARCH_X86 || ARCH_X86_64
    /* Pick the widest available SIMD warmup routine before running anything. */
    if( detected & X264_CPU_AVX512 )
        simd_warmup_func = x264_checkasm_warmup_avx512;
    else if( detected & X264_CPU_AVX )
        simd_warmup_func = x264_checkasm_warmup_avx;
#endif
    simd_warmup();

#if HAVE_MMX
    if( detected & X264_CPU_MMX2 )
    {
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_MMX | X264_CPU_MMX2, "MMX" );
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_CACHELINE_64, "MMX Cache64" );
        new_flags &= ~X264_CPU_CACHELINE_64;
#if ARCH_X86
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_CACHELINE_32, "MMX Cache32" );
        new_flags &= ~X264_CPU_CACHELINE_32;
#endif
    }

    if( detected & X264_CPU_SSE )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_SSE, "SSE" );

    if( detected & X264_CPU_SSE2 )
    {
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
        new_flags &= ~X264_CPU_CACHELINE_64;
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" );
        new_flags &= ~X264_CPU_SLOW_SHUFFLE;
    }

    if( detected & X264_CPU_LZCNT )
    {
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_LZCNT, "LZCNT" );
        new_flags &= ~X264_CPU_LZCNT;
    }

    if( detected & X264_CPU_SSE3 )
    {
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
        new_flags &= ~X264_CPU_CACHELINE_64;
    }

    if( detected & X264_CPU_SSSE3 )
    {
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_SSSE3, "SSSE3" );
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
        new_flags &= ~X264_CPU_CACHELINE_64;
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" );
        new_flags &= ~X264_CPU_SLOW_SHUFFLE;
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
        new_flags &= ~(X264_CPU_CACHELINE_64 | X264_CPU_SLOW_ATOM);
        if( detected & X264_CPU_LZCNT )
        {
            failed |= add_flags( &ref_flags, &new_flags, X264_CPU_LZCNT, "SSSE3 LZCNT" );
            new_flags &= ~X264_CPU_LZCNT;
        }
    }

    if( detected & X264_CPU_SSE4 )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_SSE4, "SSE4" );

    if( detected & X264_CPU_SSE42 )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_SSE42, "SSE4.2" );

    if( detected & X264_CPU_AVX )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_AVX, "AVX" );

    if( detected & X264_CPU_XOP )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_XOP, "XOP" );

    if( detected & X264_CPU_FMA4 )
    {
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_FMA4, "FMA4" );
        new_flags &= ~X264_CPU_FMA4;
    }

    if( detected & X264_CPU_FMA3 )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_FMA3, "FMA3" );

    if( detected & X264_CPU_BMI1 )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_BMI1, "BMI1" );

    if( detected & X264_CPU_BMI2 )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_BMI2, "BMI2" );

    if( detected & X264_CPU_AVX2 )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_AVX2, "AVX2" );

    if( detected & X264_CPU_AVX512 )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_AVX512, "AVX512" );
#elif ARCH_PPC
    if( detected & X264_CPU_ALTIVEC )
    {
        /* Single step: AltiVec is compared directly against the C code. */
        fprintf( stderr, "x264: ALTIVEC against C\n" );
        failed = check_all_funcs( 0, X264_CPU_ALTIVEC );
    }
#elif ARCH_ARM
    /* Switch to the NEON call wrapper before any test runs. */
    if( detected & X264_CPU_NEON )
        x264_checkasm_call = x264_checkasm_call_neon;

    if( detected & X264_CPU_ARMV6 )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_ARMV6, "ARMv6" );

    if( detected & X264_CPU_NEON )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_NEON, "NEON" );

    if( detected & X264_CPU_FAST_NEON_MRC )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
#elif ARCH_AARCH64
    if( detected & X264_CPU_ARMV8 )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_ARMV8, "ARMv8" );

    if( detected & X264_CPU_NEON )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_NEON, "NEON" );
#elif ARCH_MIPS
    if( detected & X264_CPU_MSA )
        failed |= add_flags( &ref_flags, &new_flags, X264_CPU_MSA, "MSA" );
#endif
    return failed;
}
2915
2916 static int main_internal( int argc, char **argv )
2917 {
2918 #ifdef _WIN32
2919     /* Disable the Windows Error Reporting dialog */
2920     SetErrorMode( SEM_NOGPFAULTERRORBOX );
2921 #endif
2922
2923     if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
2924     {
2925 #if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS
2926         fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
2927         return 1;
2928 #endif
2929         do_bench = 1;
2930         if( argv[1][7] == '=' )
2931         {
2932             bench_pattern = argv[1]+8;
2933             bench_pattern_len = strlen(bench_pattern);
2934         }
2935         argc--;
2936         argv++;
2937     }
2938
2939     int seed = ( argc > 1 ) ? atoi(argv[1]) : x264_mdate();
2940     fprintf( stderr, "x264: using random seed %u\n", seed );
2941     srand( seed );
2942
2943     buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) );
2944     pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) );
2945     if( !buf1 || !pbuf1 )
2946     {
2947         fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
2948         return -1;
2949     }
2950 #define INIT_POINTER_OFFSETS\
2951     buf2 = buf1 + 0xf00;\
2952     buf3 = buf2 + 0xf00;\
2953     buf4 = buf3 + 0x1000*sizeof(pixel);\
2954     pbuf2 = pbuf1 + 0xf00;\
2955     pbuf3 = (pixel*)buf3;\
2956     pbuf4 = (pixel*)buf4;
2957     INIT_POINTER_OFFSETS;
2958     for( int i = 0; i < 0x1e00; i++ )
2959     {
2960         buf1[i] = rand() & 0xFF;
2961         pbuf1[i] = rand() & PIXEL_MAX;
2962     }
2963     memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );
2964
2965     if( x264_stack_pagealign( check_all_flags, 0 ) )
2966     {
2967         fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
2968         return -1;
2969     }
2970     fprintf( stderr, "x264: All tests passed Yeah :)\n" );
2971     if( do_bench )
2972         print_bench();
2973     return 0;
2974 }
2975
2976 int main( int argc, char **argv )
2977 {
2978     return x264_stack_align( main_internal, argc, argv );
2979 }