/*****************************************************************************
 * dct.c: transform and zigzag
 *****************************************************************************
 * Copyright (C) 2003-2017 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
#include "common.h"
#if HAVE_MMX
#   include "x86/dct.h"
#endif
#if ARCH_PPC
#   include "ppc/dct.h"
#endif
#if ARCH_ARM
#   include "arm/dct.h"
#endif
#if ARCH_AARCH64
#   include "aarch64/dct.h"
#endif
#if ARCH_MIPS
#   include "mips/dct.h"
#endif
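
/* FIX8() (defined in common.h) stores a floating-point constant in Q8 fixed
 * point, so e.g. FIX8(1.0) == 256; the tables below are built from it at
 * compile time. */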

/* the inverse of the scaling factors introduced by 8x8 fdct */
/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
#define W(i) (i==0 ? FIX8(1.0000) :\
              i==1 ? FIX8(0.8859) :\
              i==2 ? FIX8(1.6000) :\
              i==3 ? FIX8(0.9415) :\
              i==4 ? FIX8(1.2651) :\
              i==5 ? FIX8(1.1910) :0)
const uint32_t x264_dct8_weight_tab[64] = {
    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),

    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
};
#undef W
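
/* the corresponding weights for the 4x4 transform */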
#define W(i) (i==0 ? FIX8(1.76777) :\
              i==1 ? FIX8(1.11803) :\
              i==2 ? FIX8(0.70711) :0)
const uint32_t x264_dct4_weight_tab[16] = {
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2),
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2)
};
#undef W

/* inverse squared */
#define W(i) (i==0 ? FIX8(3.125) :\
              i==1 ? FIX8(1.25) :\
              i==2 ? FIX8(0.5) :0)
const uint32_t x264_dct4_weight2_tab[16] = {
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2),
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2)
};
#undef W

#define W(i) (i==0 ? FIX8(1.00000) :\
              i==1 ? FIX8(0.78487) :\
              i==2 ? FIX8(2.56132) :\
              i==3 ? FIX8(0.88637) :\
              i==4 ? FIX8(1.60040) :\
              i==5 ? FIX8(1.41850) :0)
const uint32_t x264_dct8_weight2_tab[64] = {
    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),

    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
};
#undef W
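
/* 4x4 Hadamard transform of the 16 luma DC coefficients (intra 16x16).
 * Rows then columns; the second pass rounds and halves, matching the
 * normalization H.264 applies to the forward DC transform. */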
static void dct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
        d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
        d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
        d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
    }
}
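
/* Inverse of the above. No shift here: normalization is folded into the
 * dequantization that follows. */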
static void idct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = s01 + s23;
        d[i*4+1] = s01 - s23;
        d[i*4+2] = d01 - d23;
        d[i*4+3] = d01 + d23;
    }
}
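
/* 2x4 Hadamard transform of the DC terms of eight 4x4 blocks (4:2:2 chroma
 * DC). The DCs are gathered into dct[] and zeroed in the source blocks so
 * the AC coefficients can be coded separately. */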
static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
{
    int a0 = dct4x4[0][0] + dct4x4[1][0];
    int a1 = dct4x4[2][0] + dct4x4[3][0];
    int a2 = dct4x4[4][0] + dct4x4[5][0];
    int a3 = dct4x4[6][0] + dct4x4[7][0];
    int a4 = dct4x4[0][0] - dct4x4[1][0];
    int a5 = dct4x4[2][0] - dct4x4[3][0];
    int a6 = dct4x4[4][0] - dct4x4[5][0];
    int a7 = dct4x4[6][0] - dct4x4[7][0];
    int b0 = a0 + a1;
    int b1 = a2 + a3;
    int b2 = a4 + a5;
    int b3 = a6 + a7;
    int b4 = a0 - a1;
    int b5 = a2 - a3;
    int b6 = a4 - a5;
    int b7 = a6 - a7;
    dct[0] = b0 + b1;
    dct[1] = b2 + b3;
    dct[2] = b0 - b1;
    dct[3] = b2 - b3;
    dct[4] = b4 - b5;
    dct[5] = b6 - b7;
    dct[6] = b4 + b5;
    dct[7] = b6 + b7;
    dct4x4[0][0] = 0;
    dct4x4[1][0] = 0;
    dct4x4[2][0] = 0;
    dct4x4[3][0] = 0;
    dct4x4[4][0] = 0;
    dct4x4[5][0] = 0;
    dct4x4[6][0] = 0;
    dct4x4[7][0] = 0;
}
static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
                                  pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    for( int y = 0; y < i_size; y++ )
    {
        for( int x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x];
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}
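
/* Forward 4x4 integer transform (the H.264 core transform, basis rows
 * [1 1 1 1] and [2 1 -1 -2]) applied to the residual between the source
 * (FENC_STRIDE) and the prediction (FDEC_STRIDE). */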
static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
{
    dctcoef d[16];
    dctcoef tmp[16];

    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    for( int i = 0; i < 4; i++ )
    {
        int s03 = d[i*4+0] + d[i*4+3];
        int s12 = d[i*4+1] + d[i*4+2];
        int d03 = d[i*4+0] - d[i*4+3];
        int d12 = d[i*4+1] - d[i*4+2];

        tmp[0*4+i] =   s03 +   s12;
        tmp[1*4+i] = 2*d03 +   d12;
        tmp[2*4+i] =   s03 -   s12;
        tmp[3*4+i] =   d03 - 2*d12;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s03 = tmp[i*4+0] + tmp[i*4+3];
        int s12 = tmp[i*4+1] + tmp[i*4+2];
        int d03 = tmp[i*4+0] - tmp[i*4+3];
        int d12 = tmp[i*4+1] - tmp[i*4+2];

        dct[i*4+0] =   s03 +   s12;
        dct[i*4+1] = 2*d03 +   d12;
        dct[i*4+2] =   s03 -   s12;
        dct[i*4+3] =   d03 - 2*d12;
    }
}
static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
{
    sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
    sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
    sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}

static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
{
    sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
    sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
    sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
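
/* Sum of a 4x4 residual block, i.e. its DC coefficient before normalization. */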
static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
{
    int sum = 0;
    for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
        sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
             - pix2[0] - pix2[1] - pix2[2] - pix2[3];
    return sum;
}
static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
{
    dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
    dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
    dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );

    /* 2x2 DC transform */
    int d0 = dct[0] + dct[1];
    int d1 = dct[2] + dct[3];
    int d2 = dct[0] - dct[1];
    int d3 = dct[2] - dct[3];
    dct[0] = d0 + d1;
    dct[1] = d0 - d1;
    dct[2] = d2 + d3;
    dct[3] = d2 - d3;
}
static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
{
    int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
    int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
    int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
    int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
    int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
    int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
    int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
    int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );

    /* 2x4 DC transform */
    int b0 = a0 + a1;
    int b1 = a2 + a3;
    int b2 = a4 + a5;
    int b3 = a6 + a7;
    int b4 = a0 - a1;
    int b5 = a2 - a3;
    int b6 = a4 - a5;
    int b7 = a6 - a7;
    a0 = b0 + b1;
    a1 = b2 + b3;
    a2 = b4 + b5;
    a3 = b6 + b7;
    a4 = b0 - b1;
    a5 = b2 - b3;
    a6 = b4 - b5;
    a7 = b6 - b7;
    dct[0] = a0 + a1;
    dct[1] = a2 + a3;
    dct[2] = a0 - a1;
    dct[3] = a2 - a3;
    dct[4] = a4 - a5;
    dct[5] = a6 - a7;
    dct[6] = a4 + a5;
    dct[7] = a6 + a7;
}
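
/* Inverse 4x4 transform: reconstructs the residual with (x+32)>>6 rounding,
 * adds it to the prediction in p_dst and clips to the valid pixel range. */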
static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
{
    dctcoef d[16];
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s02 =  dct[0*4+i]     +  dct[2*4+i];
        int d02 =  dct[0*4+i]     -  dct[2*4+i];
        int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
        int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];

        tmp[i*4+0] = s02 + s13;
        tmp[i*4+1] = d02 + d13;
        tmp[i*4+2] = d02 - d13;
        tmp[i*4+3] = s02 - s13;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
        int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
        int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
        int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];

        d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
        d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
        d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
        d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
    }

    for( int y = 0; y < 4; y++ )
    {
        for( int x = 0; x < 4; x++ )
            p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
        p_dst += FDEC_STRIDE;
    }
}
static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
{
    add4x4_idct( &p_dst[0], dct[0] );
    add4x4_idct( &p_dst[4], dct[1] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
{
    add8x8_idct( &p_dst[0], &dct[0] );
    add8x8_idct( &p_dst[8], &dct[4] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/
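
/* One pass of the forward 8x8 transform (High profile). SRC/DST are defined
 * by the caller so the same butterfly serves both the row and column passes. */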
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}

static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
{
    dctcoef tmp[64];

    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

#define SRC(x) tmp[x*8+i]
#define DST(x) tmp[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) tmp[i*8+x]
#define DST(x) dct[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
}

static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
{
    sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
    sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
    sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
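
/* One pass of the inverse 8x8 transform. DST takes an expression so the
 * final pass can round, clip and store pixels directly. */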
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}

static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
{
    dct[0] += 32; // rounding for the >>6 at the end

#define SRC(x)     dct[x*8+i]
#define DST(x,rhs) dct[x*8+i] = (rhs)
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST

#define SRC(x)     dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST
}

static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
{
    add8x8_idct8( &dst[0],               dct[0] );
    add8x8_idct8( &dst[8],               dct[1] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}
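
/* DC-only shortcut: when only the DC coefficient is nonzero, the 4x4 idct
 * collapses to adding the constant (dc+32)>>6 to every pixel. */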
static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
{
    dc = (dc + 32) >> 6;
    for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
    {
        p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
        p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
        p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
        p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
    }
}

static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
{
    add4x4_idct_dc( &p_dst[0],               dct[0] );
    add4x4_idct_dc( &p_dst[4],               dct[1] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
{
    for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
    {
        add4x4_idct_dc( &p_dst[ 0], dct[0] );
        add4x4_idct_dc( &p_dst[ 4], dct[1] );
        add4x4_idct_dc( &p_dst[ 8], dct[2] );
        add4x4_idct_dc( &p_dst[12], dct[3] );
    }
}

/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
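/* Fills the dispatch table with the C versions above, then overrides entries
 * with the fastest implementation the cpu flags permit. Illustrative use
 * (the caller-side names here are assumptions, not part of this file):
 *
 *     x264_dct_function_t dctf;
 *     x264_dct_init( cpu_flags, &dctf );
 *     dctf.sub4x4_dct( dct, fenc_plane, fdec_plane );
 */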
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
    dctf->sub4x4_dct    = sub4x4_dct;
    dctf->add4x4_idct   = add4x4_idct;

    dctf->sub8x8_dct     = sub8x8_dct;
    dctf->sub8x8_dct_dc  = sub8x8_dct_dc;
    dctf->add8x8_idct    = add8x8_idct;
    dctf->add8x8_idct_dc = add8x8_idct_dc;

    dctf->sub8x16_dct_dc = sub8x16_dct_dc;

    dctf->sub16x16_dct     = sub16x16_dct;
    dctf->add16x16_idct    = add16x16_idct;
    dctf->add16x16_idct_dc = add16x16_idct_dc;

    dctf->sub8x8_dct8   = sub8x8_dct8;
    dctf->add8x8_idct8  = add8x8_idct8;

    dctf->sub16x16_dct8  = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;

    dctf->dct4x4dc  = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;

    dctf->dct2x4dc = dct2x4dc;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct   = x264_sub4x4_dct_mmx;
        dctf->sub8x8_dct   = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
    }
    if( cpu&X264_CPU_SSE2 )
    {
        dctf->add4x4_idct      = x264_add4x4_idct_sse2;
        dctf->dct4x4dc         = x264_dct4x4dc_sse2;
        dctf->idct4x4dc        = x264_idct4x4dc_sse2;
        dctf->dct2x4dc         = x264_dct2x4dc_sse2;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_sse2;
        dctf->add8x8_idct      = x264_add8x8_idct_sse2;
        dctf->add16x16_idct    = x264_add16x16_idct_sse2;
        dctf->add8x8_idct8     = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8   = x264_add16x16_idct8_sse2;
        dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_sse2;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_sse2;
        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_sse2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
    {
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse4;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
    }
    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct      = x264_add4x4_idct_avx;
        dctf->dct4x4dc         = x264_dct4x4dc_avx;
        dctf->idct4x4dc        = x264_idct4x4dc_avx;
        dctf->dct2x4dc         = x264_dct2x4dc_avx;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
        dctf->add8x8_idct      = x264_add8x8_idct_avx;
        dctf->add16x16_idct    = x264_add16x16_idct_avx;
        dctf->add8x8_idct8     = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8   = x264_add16x16_idct8_avx;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_avx;
        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_avx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct   = x264_add4x4_idct_mmx;
        dctf->idct4x4dc     = x264_idct4x4dc_mmx;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;

#if !ARCH_X86_64
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
#endif
    }

    if( cpu&X264_CPU_MMX2 )
    {
        dctf->dct4x4dc         = x264_dct4x4dc_mmx2;
        dctf->dct2x4dc         = x264_dct2x4dc_mmx2;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_mmx2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
    }

    if( cpu&X264_CPU_SSE2 )
    {
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
        dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;

        if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
        {
            dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
            dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
            dctf->add8x8_idct   = x264_add8x8_idct_sse2;
            dctf->add16x16_idct = x264_add16x16_idct_sse2;
            dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
        }
    }

    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
        if( !(cpu&X264_CPU_SLOW_ATOM) )
        {
            dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
            dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
            dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
            dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
            dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
            if( !(cpu&X264_CPU_SLOW_PSHUFB) )
            {
                dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_ssse3;
                dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
            }
        }
    }

    if( cpu&X264_CPU_SSE4 )
        dctf->add4x4_idct = x264_add4x4_idct_sse4;

    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct      = x264_add4x4_idct_avx;
        dctf->add8x8_idct      = x264_add8x8_idct_avx;
        dctf->add16x16_idct    = x264_add16x16_idct_avx;
        dctf->add8x8_idct8     = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8   = x264_add16x16_idct8_avx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
        dctf->sub8x8_dct       = x264_sub8x8_dct_avx;
        dctf->sub16x16_dct     = x264_sub16x16_dct_avx;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
    }

    if( cpu&X264_CPU_XOP )
    {
        dctf->sub8x8_dct   = x264_sub8x8_dct_xop;
        dctf->sub16x16_dct = x264_sub16x16_dct_xop;
    }

    if( cpu&X264_CPU_AVX2 )
    {
        dctf->add8x8_idct      = x264_add8x8_idct_avx2;
        dctf->add16x16_idct    = x264_add16x16_idct_avx2;
        dctf->sub8x8_dct       = x264_sub8x8_dct_avx2;
        dctf->sub16x16_dct     = x264_sub16x16_dct_avx2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
#if ARCH_X86_64
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx2;
#endif
    }

    if( cpu&X264_CPU_AVX512 )
    {
        dctf->sub4x4_dct     = x264_sub4x4_dct_avx512;
        dctf->sub8x8_dct     = x264_sub8x8_dct_avx512;
        dctf->sub16x16_dct   = x264_sub16x16_dct_avx512;
        dctf->sub8x8_dct_dc  = x264_sub8x8_dct_dc_avx512;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512;
        dctf->add8x8_idct    = x264_add8x8_idct_avx512;
    }
#endif //HAVE_MMX

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;

        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_altivec;

        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;

        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_altivec;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;

        dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
    }
#endif

#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
        dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
        dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
        dctf->dct4x4dc      = x264_dct4x4dc_neon;
        dctf->idct4x4dc     = x264_idct4x4dc_neon;

        dctf->add4x4_idct   = x264_add4x4_idct_neon;
        dctf->add8x8_idct   = x264_add8x8_idct_neon;
        dctf->add16x16_idct = x264_add16x16_idct_neon;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;

        dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
        dctf->add16x16_idct8= x264_add16x16_idct8_neon;
        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
    }
#endif

#if HAVE_MSA
    if( cpu&X264_CPU_MSA )
    {
        dctf->sub4x4_dct       = x264_sub4x4_dct_msa;
        dctf->sub8x8_dct       = x264_sub8x8_dct_msa;
        dctf->sub16x16_dct     = x264_sub16x16_dct_msa;
        dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_msa;
        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_msa;
        dctf->dct4x4dc         = x264_dct4x4dc_msa;
        dctf->idct4x4dc        = x264_idct4x4dc_msa;
        dctf->add4x4_idct      = x264_add4x4_idct_msa;
        dctf->add8x8_idct      = x264_add8x8_idct_msa;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_msa;
        dctf->add16x16_idct    = x264_add16x16_idct_msa;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
        dctf->add8x8_idct8     = x264_add8x8_idct8_msa;
        dctf->add16x16_idct8   = x264_add16x16_idct8_msa;
    }
#endif

#endif // HIGH_BIT_DEPTH
}
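
/* Zigzag scans. ZIG(i,y,x) places the coefficient addressed by (y,x) at
 * position i of the scan; the frame scans are the classic diagonal zigzag,
 * while the field scans use the vertically skewed order H.264 specifies for
 * interlaced coding. */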
#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)

#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)

static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FRAME
}

static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FIELD
}

#undef ZIG
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)

static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
{
    ZIGZAG4_FRAME
}
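
/* In the 4x4 field scan, only positions 2..5 differ from raster order, so
 * the head and tail of the block can be copied verbatim. */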
static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
{
    memcpy( level, dct, 2 * sizeof(dctcoef) );
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
}

#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
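
/* The zigzag_sub functions fuse three steps for transform-bypass blocks
 * (presumably the lossless path): compute the residual directly in scan
 * order, copy the source pixels into the reconstruction, and report whether
 * any level is nonzero. */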
static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}

static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}

#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}

static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}

static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}

static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FRAME
    COPY8x8
    return !!nz;
}

static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FIELD
    COPY8x8
    return !!nz;
}

#undef ZIG
#undef COPY4x4
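
/* CAVLC has no 8x8 coefficient syntax: an 8x8 block is transmitted as four
 * interleaved 4x4 scans, each taking every fourth coefficient. This also
 * derives the nnz flags of the four sub-blocks. */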
static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
{
    for( int i = 0; i < 4; i++ )
    {
        int nz = 0;
        for( int j = 0; j < 16; j++ )
        {
            nz |= src[i+j*4];
            dst[i*16+j] = src[i+j*4];
        }
        nnz[(i&1) + (i>>1)*8] = !!nz;
    }
}
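
/* Same pattern as x264_dct_init: C versions first, for both the progressive
 * and interlaced scan tables, then platform-specific overrides. */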
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
{
    pf_interlaced->scan_8x8   = zigzag_scan_8x8_field;
    pf_progressive->scan_8x8  = zigzag_scan_8x8_frame;
    pf_interlaced->scan_4x4   = zigzag_scan_4x4_field;
    pf_progressive->scan_4x4  = zigzag_scan_4x4_frame;
    pf_interlaced->sub_8x8    = zigzag_sub_8x8_field;
    pf_progressive->sub_8x8   = zigzag_sub_8x8_frame;
    pf_interlaced->sub_4x4    = zigzag_sub_4x4_field;
    pf_progressive->sub_4x4   = zigzag_sub_4x4_frame;
    pf_interlaced->sub_4x4ac  = zigzag_sub_4x4ac_field;
    pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse2;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
#if ARCH_X86_64
    if( cpu&X264_CPU_AVX )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
    }
#endif // ARCH_X86_64
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_avx512;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_avx512;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
    }
#endif // HAVE_MMX
#else
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
    if( cpu&X264_CPU_MMX2 )
    {
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_mmx2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
    }
    if( cpu&X264_CPU_SSE )
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse;
    if( cpu&X264_CPU_SSE2_IS_FAST )
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSSE3 )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_ssse3;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_avx;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_avx;
#if ARCH_X86_64
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
#endif
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
    }
    if( cpu&X264_CPU_XOP )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_xop;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_avx512;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_avx512;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
    }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_altivec;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_altivec;
    }
#endif
#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#if ARCH_AARCH64
        pf_interlaced->scan_4x4   = x264_zigzag_scan_4x4_field_neon;
        pf_interlaced->scan_8x8   = x264_zigzag_scan_8x8_field_neon;
        pf_interlaced->sub_4x4    = x264_zigzag_sub_4x4_field_neon;
        pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_neon;
        pf_interlaced->sub_8x8    = x264_zigzag_sub_8x8_field_neon;
        pf_progressive->scan_8x8  = x264_zigzag_scan_8x8_frame_neon;
        pf_progressive->sub_4x4   = x264_zigzag_sub_4x4_frame_neon;
        pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
        pf_progressive->sub_8x8   = x264_zigzag_sub_8x8_frame_neon;
#endif // ARCH_AARCH64
    }
#endif // HAVE_ARMV6 || ARCH_AARCH64
#endif // HIGH_BIT_DEPTH

    pf_interlaced->interleave_8x8_cavlc =
    pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#if HAVE_MMX
#if HIGH_BIT_DEPTH
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
    }
#else
    if( cpu&X264_CPU_MMX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
    }
    if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }

    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }

    if( cpu&X264_CPU_AVX2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
    }
#endif // HIGH_BIT_DEPTH
#endif
#if !HIGH_BIT_DEPTH
#if ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
    }
#endif // ARCH_AARCH64

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_altivec;
    }
#endif // HAVE_ALTIVEC

#if HAVE_MSA
    if( cpu&X264_CPU_MSA )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa;
    }
#endif
#endif // !HIGH_BIT_DEPTH
}