/*****************************************************************************
 * dct.c: transform and zigzag
 *****************************************************************************
 * Copyright (C) 2003-2017 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
#include "common.h"
#if HAVE_MMX
#   include "x86/dct.h"
#endif
#if ARCH_PPC
#   include "ppc/dct.h"
#endif
#if ARCH_ARM
#   include "arm/dct.h"
#endif
#if ARCH_AARCH64
#   include "aarch64/dct.h"
#endif
#if ARCH_MIPS
#   include "mips/dct.h"
#endif
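
/* FIX8() (defined in common.h) stores a floating-point constant in Q8 fixed
 * point, so e.g. FIX8(1.0) == 256; the tables below are built from it at
 * compile time. */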

/* the inverse of the scaling factors introduced by 8x8 fdct */
/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
#define W(i) (i==0 ? FIX8(1.0000) :\
              i==1 ? FIX8(0.8859) :\
              i==2 ? FIX8(1.6000) :\
              i==3 ? FIX8(0.9415) :\
              i==4 ? FIX8(1.2651) :\
              i==5 ? FIX8(1.1910) :0)
const uint32_t x264_dct8_weight_tab[64] = {
    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),

    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
};
#undef W
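
/* the corresponding weights for the 4x4 transform */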
#define W(i) (i==0 ? FIX8(1.76777) :\
              i==1 ? FIX8(1.11803) :\
              i==2 ? FIX8(0.70711) :0)
const uint32_t x264_dct4_weight_tab[16] = {
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2),
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2)
};
#undef W

/* inverse squared */
#define W(i) (i==0 ? FIX8(3.125) :\
              i==1 ? FIX8(1.25) :\
              i==2 ? FIX8(0.5) :0)
const uint32_t x264_dct4_weight2_tab[16] = {
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2),
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2)
};
#undef W

#define W(i) (i==0 ? FIX8(1.00000) :\
              i==1 ? FIX8(0.78487) :\
              i==2 ? FIX8(2.56132) :\
              i==3 ? FIX8(0.88637) :\
              i==4 ? FIX8(1.60040) :\
              i==5 ? FIX8(1.41850) :0)
const uint32_t x264_dct8_weight2_tab[64] = {
    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),

    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
};
#undef W
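
/* 4x4 Hadamard transform of the 16 luma DC coefficients (intra 16x16).
 * Rows then columns; the second pass rounds and halves, matching the
 * normalization H.264 applies to the forward DC transform. */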
static void dct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
        d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
        d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
        d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
    }
}
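
/* Inverse of the above. No shift here: normalization is folded into the
 * dequantization that follows. */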
static void idct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = s01 + s23;
        d[i*4+1] = s01 - s23;
        d[i*4+2] = d01 - d23;
        d[i*4+3] = d01 + d23;
    }
}
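
/* 2x4 Hadamard transform of the DC terms of eight 4x4 blocks (4:2:2 chroma
 * DC). The DCs are gathered into dct[] and zeroed in the source blocks so
 * the AC coefficients can be coded separately. */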
static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
{
    int a0 = dct4x4[0][0] + dct4x4[1][0];
    int a1 = dct4x4[2][0] + dct4x4[3][0];
    int a2 = dct4x4[4][0] + dct4x4[5][0];
    int a3 = dct4x4[6][0] + dct4x4[7][0];
    int a4 = dct4x4[0][0] - dct4x4[1][0];
    int a5 = dct4x4[2][0] - dct4x4[3][0];
    int a6 = dct4x4[4][0] - dct4x4[5][0];
    int a7 = dct4x4[6][0] - dct4x4[7][0];
    int b0 = a0 + a1;
    int b1 = a2 + a3;
    int b2 = a4 + a5;
    int b3 = a6 + a7;
    int b4 = a0 - a1;
    int b5 = a2 - a3;
    int b6 = a4 - a5;
    int b7 = a6 - a7;
    dct[0] = b0 + b1;
    dct[1] = b2 + b3;
    dct[2] = b0 - b1;
    dct[3] = b2 - b3;
    dct[4] = b4 - b5;
    dct[5] = b6 - b7;
    dct[6] = b4 + b5;
    dct[7] = b6 + b7;
    dct4x4[0][0] = 0;
    dct4x4[1][0] = 0;
    dct4x4[2][0] = 0;
    dct4x4[3][0] = 0;
    dct4x4[4][0] = 0;
    dct4x4[5][0] = 0;
    dct4x4[6][0] = 0;
    dct4x4[7][0] = 0;
}
static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
                                  pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    for( int y = 0; y < i_size; y++ )
    {
        for( int x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x];
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}
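
/* Forward 4x4 integer transform (the H.264 core transform, basis rows
 * [1 1 1 1] and [2 1 -1 -2]) applied to the residual between the source
 * (FENC_STRIDE) and the prediction (FDEC_STRIDE). */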
static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
{
    dctcoef d[16];
    dctcoef tmp[16];

    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    for( int i = 0; i < 4; i++ )
    {
        int s03 = d[i*4+0] + d[i*4+3];
        int s12 = d[i*4+1] + d[i*4+2];
        int d03 = d[i*4+0] - d[i*4+3];
        int d12 = d[i*4+1] - d[i*4+2];

        tmp[0*4+i] =   s03 +   s12;
        tmp[1*4+i] = 2*d03 +   d12;
        tmp[2*4+i] =   s03 -   s12;
        tmp[3*4+i] =   d03 - 2*d12;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s03 = tmp[i*4+0] + tmp[i*4+3];
        int s12 = tmp[i*4+1] + tmp[i*4+2];
        int d03 = tmp[i*4+0] - tmp[i*4+3];
        int d12 = tmp[i*4+1] - tmp[i*4+2];

        dct[i*4+0] =   s03 +   s12;
        dct[i*4+1] = 2*d03 +   d12;
        dct[i*4+2] =   s03 -   s12;
        dct[i*4+3] =   d03 - 2*d12;
    }
}
static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
{
    sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
    sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
    sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}

static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
{
    sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
    sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
    sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
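
/* Sum of a 4x4 residual block, i.e. its DC coefficient before normalization. */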
static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
{
    int sum = 0;
    for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
        sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
             - pix2[0] - pix2[1] - pix2[2] - pix2[3];
    return sum;
}
static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
{
    dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
    dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
    dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );

    /* 2x2 DC transform */
    int d0 = dct[0] + dct[1];
    int d1 = dct[2] + dct[3];
    int d2 = dct[0] - dct[1];
    int d3 = dct[2] - dct[3];
    dct[0] = d0 + d1;
    dct[1] = d0 - d1;
    dct[2] = d2 + d3;
    dct[3] = d2 - d3;
}
static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
{
    int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
    int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
    int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
    int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
    int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
    int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
    int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
    int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );

    /* 2x4 DC transform */
    int b0 = a0 + a1;
    int b1 = a2 + a3;
    int b2 = a4 + a5;
    int b3 = a6 + a7;
    int b4 = a0 - a1;
    int b5 = a2 - a3;
    int b6 = a4 - a5;
    int b7 = a6 - a7;
    a0 = b0 + b1;
    a1 = b2 + b3;
    a2 = b4 + b5;
    a3 = b6 + b7;
    a4 = b0 - b1;
    a5 = b2 - b3;
    a6 = b4 - b5;
    a7 = b6 - b7;
    dct[0] = a0 + a1;
    dct[1] = a2 + a3;
    dct[2] = a0 - a1;
    dct[3] = a2 - a3;
    dct[4] = a4 - a5;
    dct[5] = a6 - a7;
    dct[6] = a4 + a5;
    dct[7] = a6 + a7;
}
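
/* Inverse 4x4 transform: reconstructs the residual with (x+32)>>6 rounding,
 * adds it to the prediction in p_dst and clips to the valid pixel range. */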
static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
{
    dctcoef d[16];
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s02 =  dct[0*4+i]     +  dct[2*4+i];
        int d02 =  dct[0*4+i]     -  dct[2*4+i];
        int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
        int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];

        tmp[i*4+0] = s02 + s13;
        tmp[i*4+1] = d02 + d13;
        tmp[i*4+2] = d02 - d13;
        tmp[i*4+3] = s02 - s13;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
        int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
        int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
        int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];

        d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
        d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
        d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
        d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
    }

    for( int y = 0; y < 4; y++ )
    {
        for( int x = 0; x < 4; x++ )
            p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
        p_dst += FDEC_STRIDE;
    }
}
static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
{
    add4x4_idct( &p_dst[0], dct[0] );
    add4x4_idct( &p_dst[4], dct[1] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
{
    add8x8_idct( &p_dst[0], &dct[0] );
    add8x8_idct( &p_dst[8], &dct[4] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/
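
/* One pass of the forward 8x8 transform (High profile). SRC/DST are defined
 * by the caller so the same butterfly serves both the row and column passes. */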
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}

static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
{
    dctcoef tmp[64];

    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

#define SRC(x) tmp[x*8+i]
#define DST(x) tmp[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) tmp[i*8+x]
#define DST(x) dct[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
}

static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
{
    sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
    sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
    sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
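
/* One pass of the inverse 8x8 transform. DST takes an expression so the
 * final pass can round, clip and store pixels directly. */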
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}

static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
{
    dct[0] += 32; // rounding for the >>6 at the end

#define SRC(x)     dct[x*8+i]
#define DST(x,rhs) dct[x*8+i] = (rhs)
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST

#define SRC(x)     dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST
}

static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
{
    add8x8_idct8( &dst[0],               dct[0] );
    add8x8_idct8( &dst[8],               dct[1] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}
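
/* DC-only shortcut: when only the DC coefficient is nonzero, the 4x4 idct
 * collapses to adding the constant (dc+32)>>6 to every pixel. */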
static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
{
    dc = (dc + 32) >> 6;
    for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
    {
        p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
        p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
        p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
        p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
    }
}

static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
{
    add4x4_idct_dc( &p_dst[0],               dct[0] );
    add4x4_idct_dc( &p_dst[4],               dct[1] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
{
    for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
    {
        add4x4_idct_dc( &p_dst[ 0], dct[0] );
        add4x4_idct_dc( &p_dst[ 4], dct[1] );
        add4x4_idct_dc( &p_dst[ 8], dct[2] );
        add4x4_idct_dc( &p_dst[12], dct[3] );
    }
}

/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
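/* Fills the dispatch table with the C versions above, then overrides entries
 * with the fastest implementation the cpu flags permit. Illustrative use
 * (the caller-side names here are assumptions, not part of this file):
 *
 *     x264_dct_function_t dctf;
 *     x264_dct_init( cpu_flags, &dctf );
 *     dctf.sub4x4_dct( dct, fenc_plane, fdec_plane );
 */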
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
    dctf->sub4x4_dct    = sub4x4_dct;
    dctf->add4x4_idct   = add4x4_idct;

    dctf->sub8x8_dct     = sub8x8_dct;
    dctf->sub8x8_dct_dc  = sub8x8_dct_dc;
    dctf->add8x8_idct    = add8x8_idct;
    dctf->add8x8_idct_dc = add8x8_idct_dc;

    dctf->sub8x16_dct_dc = sub8x16_dct_dc;

    dctf->sub16x16_dct     = sub16x16_dct;
    dctf->add16x16_idct    = add16x16_idct;
    dctf->add16x16_idct_dc = add16x16_idct_dc;

    dctf->sub8x8_dct8   = sub8x8_dct8;
    dctf->add8x8_idct8  = add8x8_idct8;

    dctf->sub16x16_dct8  = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;

    dctf->dct4x4dc  = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;

    dctf->dct2x4dc = dct2x4dc;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct   = x264_sub4x4_dct_mmx;
        dctf->sub8x8_dct   = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
    }
    if( cpu&X264_CPU_SSE2 )
    {
        dctf->add4x4_idct      = x264_add4x4_idct_sse2;
        dctf->dct4x4dc         = x264_dct4x4dc_sse2;
        dctf->idct4x4dc        = x264_idct4x4dc_sse2;
        dctf->dct2x4dc         = x264_dct2x4dc_sse2;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_sse2;
        dctf->add8x8_idct      = x264_add8x8_idct_sse2;
        dctf->add16x16_idct    = x264_add16x16_idct_sse2;
        dctf->add8x8_idct8     = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8   = x264_add16x16_idct8_sse2;
        dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_sse2;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_sse2;
        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_sse2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
    {
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse4;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
    }
    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct      = x264_add4x4_idct_avx;
        dctf->dct4x4dc         = x264_dct4x4dc_avx;
        dctf->idct4x4dc        = x264_idct4x4dc_avx;
        dctf->dct2x4dc         = x264_dct2x4dc_avx;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
        dctf->add8x8_idct      = x264_add8x8_idct_avx;
        dctf->add16x16_idct    = x264_add16x16_idct_avx;
        dctf->add8x8_idct8     = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8   = x264_add16x16_idct8_avx;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_avx;
        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_avx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct   = x264_add4x4_idct_mmx;
        dctf->idct4x4dc     = x264_idct4x4dc_mmx;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;

#if !ARCH_X86_64
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
#endif
    }

    if( cpu&X264_CPU_MMX2 )
    {
        dctf->dct4x4dc         = x264_dct4x4dc_mmx2;
        dctf->dct2x4dc         = x264_dct2x4dc_mmx2;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_mmx2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
    }

    if( cpu&X264_CPU_SSE2 )
    {
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
        dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;

        if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
        {
            dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
            dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
            dctf->add8x8_idct   = x264_add8x8_idct_sse2;
            dctf->add16x16_idct = x264_add16x16_idct_sse2;
            dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
        }
    }

    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
        if( !(cpu&X264_CPU_SLOW_ATOM) )
        {
            dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
            dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
            dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
            dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
            dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
            if( !(cpu&X264_CPU_SLOW_PSHUFB) )
            {
                dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_ssse3;
                dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
            }
        }
    }

    if( cpu&X264_CPU_SSE4 )
        dctf->add4x4_idct = x264_add4x4_idct_sse4;

    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct      = x264_add4x4_idct_avx;
        dctf->add8x8_idct      = x264_add8x8_idct_avx;
        dctf->add16x16_idct    = x264_add16x16_idct_avx;
        dctf->add8x8_idct8     = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8   = x264_add16x16_idct8_avx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
        dctf->sub8x8_dct       = x264_sub8x8_dct_avx;
        dctf->sub16x16_dct     = x264_sub16x16_dct_avx;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
    }

    if( cpu&X264_CPU_XOP )
    {
        dctf->sub8x8_dct   = x264_sub8x8_dct_xop;
        dctf->sub16x16_dct = x264_sub16x16_dct_xop;
    }

    if( cpu&X264_CPU_AVX2 )
    {
        dctf->add8x8_idct      = x264_add8x8_idct_avx2;
        dctf->add16x16_idct    = x264_add16x16_idct_avx2;
        dctf->sub8x8_dct       = x264_sub8x8_dct_avx2;
        dctf->sub16x16_dct     = x264_sub16x16_dct_avx2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
#if ARCH_X86_64
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx2;
#endif
    }

    if( cpu&X264_CPU_AVX512 )
    {
        dctf->sub4x4_dct     = x264_sub4x4_dct_avx512;
        dctf->sub8x8_dct     = x264_sub8x8_dct_avx512;
        dctf->sub16x16_dct   = x264_sub16x16_dct_avx512;
        dctf->sub8x8_dct_dc  = x264_sub8x8_dct_dc_avx512;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512;
        dctf->add8x8_idct    = x264_add8x8_idct_avx512;
    }
#endif //HAVE_MMX

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;

        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_altivec;

        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;

        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_altivec;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;

        dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
    }
#endif

#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
        dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
        dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
        dctf->dct4x4dc      = x264_dct4x4dc_neon;
        dctf->idct4x4dc     = x264_idct4x4dc_neon;

        dctf->add4x4_idct   = x264_add4x4_idct_neon;
        dctf->add8x8_idct   = x264_add8x8_idct_neon;
        dctf->add16x16_idct = x264_add16x16_idct_neon;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;

        dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
        dctf->add16x16_idct8= x264_add16x16_idct8_neon;
        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
    }
#endif

#if HAVE_MSA
    if( cpu&X264_CPU_MSA )
    {
        dctf->sub4x4_dct       = x264_sub4x4_dct_msa;
        dctf->sub8x8_dct       = x264_sub8x8_dct_msa;
        dctf->sub16x16_dct     = x264_sub16x16_dct_msa;
        dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_msa;
        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_msa;
        dctf->dct4x4dc         = x264_dct4x4dc_msa;
        dctf->idct4x4dc        = x264_idct4x4dc_msa;
        dctf->add4x4_idct      = x264_add4x4_idct_msa;
        dctf->add8x8_idct      = x264_add8x8_idct_msa;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_msa;
        dctf->add16x16_idct    = x264_add16x16_idct_msa;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
        dctf->add8x8_idct8     = x264_add8x8_idct8_msa;
        dctf->add16x16_idct8   = x264_add16x16_idct8_msa;
    }
#endif

#endif // HIGH_BIT_DEPTH
}
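
/* Zigzag scans. ZIG(i,y,x) places the coefficient addressed by (y,x) at
 * position i of the scan; the frame scans are the classic diagonal zigzag,
 * while the field scans use the vertically skewed order H.264 specifies for
 * interlaced coding. */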
#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)

#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)

static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FRAME
}

static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FIELD
}

#undef ZIG
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)

static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
{
    ZIGZAG4_FRAME
}
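
/* In the 4x4 field scan, only positions 2..5 differ from raster order, so
 * the head and tail of the block can be copied verbatim. */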
static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
{
    memcpy( level, dct, 2 * sizeof(dctcoef) );
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
}

#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
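
/* The zigzag_sub functions fuse three steps for transform-bypass blocks
 * (presumably the lossless path): compute the residual directly in scan
 * order, copy the source pixels into the reconstruction, and report whether
 * any level is nonzero. */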
static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}

static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}

#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}

static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}

static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}

static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FRAME
    COPY8x8
    return !!nz;
}

static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FIELD
    COPY8x8
    return !!nz;
}

#undef ZIG
#undef COPY4x4
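
/* CAVLC has no 8x8 coefficient syntax: an 8x8 block is transmitted as four
 * interleaved 4x4 scans, each taking every fourth coefficient. This also
 * derives the nnz flags of the four sub-blocks. */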
static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
{
    for( int i = 0; i < 4; i++ )
    {
        int nz = 0;
        for( int j = 0; j < 16; j++ )
        {
            nz |= src[i+j*4];
            dst[i*16+j] = src[i+j*4];
        }
        nnz[(i&1) + (i>>1)*8] = !!nz;
    }
}
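
/* Same pattern as x264_dct_init: C versions first, for both the progressive
 * and interlaced scan tables, then platform-specific overrides. */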
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
{
    pf_interlaced->scan_8x8   = zigzag_scan_8x8_field;
    pf_progressive->scan_8x8  = zigzag_scan_8x8_frame;
    pf_interlaced->scan_4x4   = zigzag_scan_4x4_field;
    pf_progressive->scan_4x4  = zigzag_scan_4x4_frame;
    pf_interlaced->sub_8x8    = zigzag_sub_8x8_field;
    pf_progressive->sub_8x8   = zigzag_sub_8x8_frame;
    pf_interlaced->sub_4x4    = zigzag_sub_4x4_field;
    pf_progressive->sub_4x4   = zigzag_sub_4x4_frame;
    pf_interlaced->sub_4x4ac  = zigzag_sub_4x4ac_field;
    pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse2;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
#if ARCH_X86_64
    if( cpu&X264_CPU_AVX )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
    }
#endif // ARCH_X86_64
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_avx512;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_avx512;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
    }
#endif // HAVE_MMX
#else
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
    if( cpu&X264_CPU_MMX2 )
    {
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_mmx2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
    }
    if( cpu&X264_CPU_SSE )
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse;
    if( cpu&X264_CPU_SSE2_IS_FAST )
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSSE3 )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_ssse3;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_avx;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_avx;
#if ARCH_X86_64
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
#endif
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
    }
    if( cpu&X264_CPU_XOP )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_xop;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_avx512;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_avx512;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
    }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_altivec;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_altivec;
    }
#endif
#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#if ARCH_AARCH64
        pf_interlaced->scan_4x4   = x264_zigzag_scan_4x4_field_neon;
        pf_interlaced->scan_8x8   = x264_zigzag_scan_8x8_field_neon;
        pf_interlaced->sub_4x4    = x264_zigzag_sub_4x4_field_neon;
        pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_neon;
        pf_interlaced->sub_8x8    = x264_zigzag_sub_8x8_field_neon;
        pf_progressive->scan_8x8  = x264_zigzag_scan_8x8_frame_neon;
        pf_progressive->sub_4x4   = x264_zigzag_sub_4x4_frame_neon;
        pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
        pf_progressive->sub_8x8   = x264_zigzag_sub_8x8_frame_neon;
#endif // ARCH_AARCH64
    }
#endif // HAVE_ARMV6 || ARCH_AARCH64
#endif // HIGH_BIT_DEPTH

    pf_interlaced->interleave_8x8_cavlc =
    pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#if HAVE_MMX
#if HIGH_BIT_DEPTH
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
    }
#else
    if( cpu&X264_CPU_MMX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
    }
    if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }

    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }

    if( cpu&X264_CPU_AVX2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
    }
#endif // HIGH_BIT_DEPTH
#endif
#if !HIGH_BIT_DEPTH
#if ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
    }
#endif // ARCH_AARCH64

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_altivec;
    }
#endif // HAVE_ALTIVEC

#if HAVE_MSA
    if( cpu&X264_CPU_MSA )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa;
    }
#endif
#endif // !HIGH_BIT_DEPTH
}