/*****************************************************************************
 * dct.c: transform and zigzag
 *****************************************************************************
 * Copyright (C) 2003-2017 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
#include "common.h"
#if HAVE_MMX
#   include "x86/dct.h"
#endif
#if ARCH_PPC
#   include "ppc/dct.h"
#endif
#if ARCH_ARM
#   include "arm/dct.h"
#endif
#if ARCH_AARCH64
#   include "aarch64/dct.h"
#endif
#if ARCH_MIPS
#   include "mips/dct.h"
#endif

/* the inverse of the scaling factors introduced by 8x8 fdct */
/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
#define W(i) (i==0 ? FIX8(1.0000) :\
              i==1 ? FIX8(0.8859) :\
              i==2 ? FIX8(1.6000) :\
              i==3 ? FIX8(0.9415) :\
              i==4 ? FIX8(1.2651) :\
              i==5 ? FIX8(1.1910) :0)
const uint32_t x264_dct8_weight_tab[64] = {
    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),

    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
};
#undef W
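
/* Note: FIX8() is x264's 8-bit fixed-point conversion from common.h
 * (approximately round(f*256)), so e.g. W(0) == FIX8(1.0000) == 256. */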

#define W(i) (i==0 ? FIX8(1.76777) :\
              i==1 ? FIX8(1.11803) :\
              i==2 ? FIX8(0.70711) :0)
const uint32_t x264_dct4_weight_tab[16] = {
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2),
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2)
};
#undef W

#define W(i) (i==0 ? FIX8(3.125) :\
              i==1 ? FIX8(1.25) :\
              i==2 ? FIX8(0.5) :0)
const uint32_t x264_dct4_weight2_tab[16] = {
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2),
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2)
};
#undef W
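
/* Each entry of x264_dct4_weight2_tab is the square of the corresponding
 * x264_dct4_weight_tab entry (1.76777^2 = 3.125, 1.11803^2 = 1.25,
 * 0.70711^2 = 0.5), consistent with the same per-axis scaling being
 * applied once horizontally and once vertically. */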

#define W(i) (i==0 ? FIX8(1.00000) :\
              i==1 ? FIX8(0.78487) :\
              i==2 ? FIX8(2.56132) :\
              i==3 ? FIX8(0.88637) :\
              i==4 ? FIX8(1.60040) :\
              i==5 ? FIX8(1.41850) :0)
const uint32_t x264_dct8_weight2_tab[64] = {
    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),

    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
};
#undef W
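
/* dct4x4dc/idct4x4dc below implement the 4x4 Hadamard transform that
 * H.264 applies to the 16 luma DC coefficients of an intra-16x16
 * macroblock; the forward version folds in a (x+1)>>1 normalization. */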

static void dct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
        d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
        d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
        d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
    }
}

static void idct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = s01 + s23;
        d[i*4+1] = s01 - s23;
        d[i*4+2] = d01 - d23;
        d[i*4+3] = d01 + d23;
    }
}
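
/* dct2x4dc gathers the DC coefficient of eight 4x4 blocks (the 4:2:2
 * chroma case) and applies a 2x4 Hadamard, zeroing the per-block DCs in
 * place so the 4x4 blocks can then be coded as AC-only. */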

static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
{
    int a0 = dct4x4[0][0] + dct4x4[1][0];
    int a1 = dct4x4[2][0] + dct4x4[3][0];
    int a2 = dct4x4[4][0] + dct4x4[5][0];
    int a3 = dct4x4[6][0] + dct4x4[7][0];
    int a4 = dct4x4[0][0] - dct4x4[1][0];
    int a5 = dct4x4[2][0] - dct4x4[3][0];
    int a6 = dct4x4[4][0] - dct4x4[5][0];
    int a7 = dct4x4[6][0] - dct4x4[7][0];
    int b0 = a0 + a1;
    int b1 = a2 + a3;
    int b2 = a4 + a5;
    int b3 = a6 + a7;
    int b4 = a0 - a1;
    int b5 = a2 - a3;
    int b6 = a4 - a5;
    int b7 = a6 - a7;
    dct[0] = b0 + b1;
    dct[1] = b2 + b3;
    dct[2] = b0 - b1;
    dct[3] = b2 - b3;
    dct[4] = b4 - b5;
    dct[5] = b6 - b7;
    dct[6] = b4 + b5;
    dct[7] = b6 + b7;
    dct4x4[0][0] = 0;
    dct4x4[1][0] = 0;
    dct4x4[2][0] = 0;
    dct4x4[3][0] = 0;
    dct4x4[4][0] = 0;
    dct4x4[5][0] = 0;
    dct4x4[6][0] = 0;
    dct4x4[7][0] = 0;
}
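
/* In x264, pix1 is normally the source macroblock cache (FENC_STRIDE)
 * and pix2 the reconstructed prediction (FDEC_STRIDE), which is why the
 * two strides are passed independently of the block size. */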

static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
                                  pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    for( int y = 0; y < i_size; y++ )
    {
        for( int x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x];
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}

static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
{
    dctcoef d[16];
    dctcoef tmp[16];

    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    for( int i = 0; i < 4; i++ )
    {
        int s03 = d[i*4+0] + d[i*4+3];
        int s12 = d[i*4+1] + d[i*4+2];
        int d03 = d[i*4+0] - d[i*4+3];
        int d12 = d[i*4+1] - d[i*4+2];

        tmp[0*4+i] =   s03 +   s12;
        tmp[1*4+i] = 2*d03 +   d12;
        tmp[2*4+i] =   s03 -   s12;
        tmp[3*4+i] =   d03 - 2*d12;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s03 = tmp[i*4+0] + tmp[i*4+3];
        int s12 = tmp[i*4+1] + tmp[i*4+2];
        int d03 = tmp[i*4+0] - tmp[i*4+3];
        int d12 = tmp[i*4+1] - tmp[i*4+2];

        dct[i*4+0] =   s03 +   s12;
        dct[i*4+1] = 2*d03 +   d12;
        dct[i*4+2] =   s03 -   s12;
        dct[i*4+3] =   d03 - 2*d12;
    }
}
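
/* The two butterfly passes above are the H.264 forward 4x4 integer
 * transform, i.e. rows then columns of
 *     [ 1  1  1  1 ]
 *     [ 2  1 -1 -2 ]
 *     [ 1 -1 -1  1 ]
 *     [ 1 -2  2 -1 ]
 * with the norm correction deferred to quantization. */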

static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
{
    sub4x4_dct( dct[0], &pix1[0],               &pix2[0]               );
    sub4x4_dct( dct[1], &pix1[4],               &pix2[4]               );
    sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}

static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
{
    sub8x8_dct( &dct[ 0], &pix1[0],               &pix2[0]               );
    sub8x8_dct( &dct[ 4], &pix1[8],               &pix2[8]               );
    sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}

static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
{
    int sum = 0;
    for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
        sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
             - pix2[0] - pix2[1] - pix2[2] - pix2[3];
    return sum;
}
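
/* sub8x8_dct_dc below reduces each 4x4 sub-block of the residual to its
 * DC sum, then applies the 2x2 Hadamard used for 4:2:0 chroma DC. */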

static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
{
    dct[0] = sub4x4_dct_dc( &pix1[0],               &pix2[0]               );
    dct[1] = sub4x4_dct_dc( &pix1[4],               &pix2[4]               );
    dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );

    /* 2x2 DC transform */
    int d0 = dct[0] + dct[1];
    int d1 = dct[2] + dct[3];
    int d2 = dct[0] - dct[1];
    int d3 = dct[2] - dct[3];
    dct[0] = d0 + d1;
    dct[1] = d2 + d3;
    dct[2] = d0 - d1;
    dct[3] = d2 - d3;
}

static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
{
    int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
    int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
    int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
    int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
    int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
    int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
    int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
    int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );

    /* 2x4 DC transform */
    int b0 = a0 + a1;
    int b1 = a2 + a3;
    int b2 = a4 + a5;
    int b3 = a6 + a7;
    int b4 = a0 - a1;
    int b5 = a2 - a3;
    int b6 = a4 - a5;
    int b7 = a6 - a7;
    dct[0] = b0 + b1;
    dct[1] = b2 + b3;
    dct[2] = b0 - b1;
    dct[3] = b2 - b3;
    dct[4] = b4 - b5;
    dct[5] = b6 - b7;
    dct[6] = b4 + b5;
    dct[7] = b6 + b7;
}

static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
{
    dctcoef d[16];
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s02 =  dct[0*4+i]     +  dct[2*4+i];
        int d02 =  dct[0*4+i]     -  dct[2*4+i];
        int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
        int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];

        tmp[i*4+0] = s02 + s13;
        tmp[i*4+1] = d02 + d13;
        tmp[i*4+2] = d02 - d13;
        tmp[i*4+3] = s02 - s13;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
        int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
        int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
        int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];

        d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
        d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
        d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
        d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
    }

    for( int y = 0; y < 4; y++ )
    {
        for( int x = 0; x < 4; x++ )
            p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
        p_dst += FDEC_STRIDE;
    }
}
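
/* The >>1 taps above are the +/-1/2 coefficients of the H.264 inverse
 * 4x4 transform, and ( x + 32 ) >> 6 is the spec's final descaling and
 * rounding before the result is added back to the prediction. */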

static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
{
    add4x4_idct( &p_dst[0],               dct[0] );
    add4x4_idct( &p_dst[4],               dct[1] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
{
    add8x8_idct( &p_dst[0],               &dct[0]  );
    add8x8_idct( &p_dst[8],               &dct[4]  );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8]  );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}

/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/

#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
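
/* DCT8_1D is the 8-point butterfly of the High profile 8x8 integer
 * transform, factored into an even half (a0-a3, from the s* sums) and an
 * odd half (a4-a7, from the d* differences); the >>1 and >>2 terms
 * realize the non-dyadic basis weights without multiplies. */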

static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
{
    dctcoef tmp[64];

    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

#define SRC(x) tmp[x*8+i]
#define DST(x) tmp[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) tmp[i*8+x]
#define DST(x) dct[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
}

static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
{
    sub8x8_dct8( dct[0], &pix1[0],               &pix2[0]               );
    sub8x8_dct8( dct[1], &pix1[8],               &pix2[8]               );
    sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}

#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}

static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
{
    dct[0] += 32; // rounding for the >>6 at the end

#define SRC(x)     dct[x*8+i]
#define DST(x,rhs) dct[x*8+i] = (rhs)
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST

#define SRC(x)     dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST
}

static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
{
    add8x8_idct8( &dst[0],               dct[0] );
    add8x8_idct8( &dst[8],               dct[1] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}

static inline void add4x4_idct_dc( pixel *p_dst, dctcoef dc )
{
    dc = ( dc + 32 ) >> 6;
    for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
    {
        p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
        p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
        p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
        p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
    }
}

static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
{
    add4x4_idct_dc( &p_dst[0],               dct[0] );
    add4x4_idct_dc( &p_dst[4],               dct[1] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
{
    for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
    {
        add4x4_idct_dc( &p_dst[ 0], dct[0] );
        add4x4_idct_dc( &p_dst[ 4], dct[1] );
        add4x4_idct_dc( &p_dst[ 8], dct[2] );
        add4x4_idct_dc( &p_dst[12], dct[3] );
    }
}

/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
    dctf->sub4x4_dct    = sub4x4_dct;
    dctf->add4x4_idct   = add4x4_idct;

    dctf->sub8x8_dct    = sub8x8_dct;
    dctf->sub8x8_dct_dc = sub8x8_dct_dc;
    dctf->add8x8_idct   = add8x8_idct;
    dctf->add8x8_idct_dc = add8x8_idct_dc;

    dctf->sub8x16_dct_dc = sub8x16_dct_dc;

    dctf->sub16x16_dct  = sub16x16_dct;
    dctf->add16x16_idct = add16x16_idct;
    dctf->add16x16_idct_dc = add16x16_idct_dc;

    dctf->sub8x8_dct8   = sub8x8_dct8;
    dctf->add8x8_idct8  = add8x8_idct8;

    dctf->sub16x16_dct8  = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;

    dctf->dct4x4dc  = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;

    dctf->dct2x4dc = dct2x4dc;
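
    /* The C reference functions above are installed first; the SIMD
     * blocks below overwrite them in increasing ISA order, so the most
     * capable supported implementation ends up winning. */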

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct   = x264_sub4x4_dct_mmx;
        dctf->sub8x8_dct   = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
    }
    if( cpu&X264_CPU_SSE2 )
    {
        dctf->add4x4_idct      = x264_add4x4_idct_sse2;
        dctf->dct4x4dc         = x264_dct4x4dc_sse2;
        dctf->idct4x4dc        = x264_idct4x4dc_sse2;
        dctf->dct2x4dc         = x264_dct2x4dc_sse2;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_sse2;
        dctf->add8x8_idct      = x264_add8x8_idct_sse2;
        dctf->add16x16_idct    = x264_add16x16_idct_sse2;
        dctf->add8x8_idct8     = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8   = x264_add16x16_idct8_sse2;
        dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_sse2;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_sse2;
        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_sse2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
    {
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse4;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
    }
    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct      = x264_add4x4_idct_avx;
        dctf->dct4x4dc         = x264_dct4x4dc_avx;
        dctf->idct4x4dc        = x264_idct4x4dc_avx;
        dctf->dct2x4dc         = x264_dct2x4dc_avx;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
        dctf->add8x8_idct      = x264_add8x8_idct_avx;
        dctf->add16x16_idct    = x264_add16x16_idct_avx;
        dctf->add8x8_idct8     = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8   = x264_add16x16_idct8_avx;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_avx;
        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_avx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct   = x264_add4x4_idct_mmx;
        dctf->idct4x4dc     = x264_idct4x4dc_mmx;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;

#if !ARCH_X86_64
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;

        dctf->sub8x8_dct8    = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8  = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8   = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8 = x264_add16x16_idct8_mmx;
#endif
    }

    if( cpu&X264_CPU_MMX2 )
    {
        dctf->dct4x4dc         = x264_dct4x4dc_mmx2;
        dctf->dct2x4dc         = x264_dct2x4dc_mmx2;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_mmx2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
    }

    if( cpu&X264_CPU_SSE2 )
    {
        dctf->sub8x8_dct8    = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8  = x264_sub16x16_dct8_sse2;
        dctf->sub8x8_dct_dc  = x264_sub8x8_dct_dc_sse2;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
        dctf->add8x8_idct8   = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;

        if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
        {
            dctf->sub8x8_dct       = x264_sub8x8_dct_sse2;
            dctf->sub16x16_dct     = x264_sub16x16_dct_sse2;
            dctf->add8x8_idct      = x264_add8x8_idct_sse2;
            dctf->add16x16_idct    = x264_add16x16_idct_sse2;
            dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
        }
    }

    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
        if( !(cpu&X264_CPU_SLOW_ATOM) )
        {
            dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
            dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
            dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
            dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
            dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
            if( !(cpu&X264_CPU_SLOW_PSHUFB) )
            {
                dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_ssse3;
                dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
            }
        }
    }

    if( cpu&X264_CPU_SSE4 )
        dctf->add4x4_idct = x264_add4x4_idct_sse4;

    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct      = x264_add4x4_idct_avx;
        dctf->add8x8_idct      = x264_add8x8_idct_avx;
        dctf->add16x16_idct    = x264_add16x16_idct_avx;
        dctf->add8x8_idct8     = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8   = x264_add16x16_idct8_avx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
        dctf->sub8x8_dct       = x264_sub8x8_dct_avx;
        dctf->sub16x16_dct     = x264_sub16x16_dct_avx;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
    }

    if( cpu&X264_CPU_XOP )
    {
        dctf->sub8x8_dct   = x264_sub8x8_dct_xop;
        dctf->sub16x16_dct = x264_sub16x16_dct_xop;
    }

    if( cpu&X264_CPU_AVX2 )
    {
        dctf->add8x8_idct      = x264_add8x8_idct_avx2;
        dctf->add16x16_idct    = x264_add16x16_idct_avx2;
        dctf->sub8x8_dct       = x264_sub8x8_dct_avx2;
        dctf->sub16x16_dct     = x264_sub16x16_dct_avx2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
#if ARCH_X86_64
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx2;
#endif
    }

    if( cpu&X264_CPU_AVX512 )
    {
        dctf->sub4x4_dct     = x264_sub4x4_dct_avx512;
        dctf->sub8x8_dct     = x264_sub8x8_dct_avx512;
        dctf->sub16x16_dct   = x264_sub16x16_dct_avx512;
        dctf->sub8x8_dct_dc  = x264_sub8x8_dct_dc_avx512;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512;
        dctf->add8x8_idct    = x264_add8x8_idct_avx512;
    }
#endif // HAVE_MMX

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;

        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_altivec;

        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;

        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_altivec;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;

        dctf->add8x8_idct8   = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8 = x264_add16x16_idct8_altivec;
    }
#endif

#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        dctf->sub4x4_dct       = x264_sub4x4_dct_neon;
        dctf->sub8x8_dct       = x264_sub8x8_dct_neon;
        dctf->sub16x16_dct     = x264_sub16x16_dct_neon;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_neon;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
        dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_neon;
        dctf->dct4x4dc         = x264_dct4x4dc_neon;
        dctf->idct4x4dc        = x264_idct4x4dc_neon;

        dctf->add4x4_idct   = x264_add4x4_idct_neon;
        dctf->add8x8_idct   = x264_add8x8_idct_neon;
        dctf->add16x16_idct = x264_add16x16_idct_neon;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;

        dctf->add8x8_idct8   = x264_add8x8_idct8_neon;
        dctf->add16x16_idct8 = x264_add16x16_idct8_neon;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_neon;
    }
#endif

#if HAVE_MSA
    if( cpu&X264_CPU_MSA )
    {
        dctf->sub4x4_dct       = x264_sub4x4_dct_msa;
        dctf->sub8x8_dct       = x264_sub8x8_dct_msa;
        dctf->sub16x16_dct     = x264_sub16x16_dct_msa;
        dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_msa;
        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_msa;
        dctf->dct4x4dc         = x264_dct4x4dc_msa;
        dctf->idct4x4dc        = x264_idct4x4dc_msa;
        dctf->add4x4_idct      = x264_add4x4_idct_msa;
        dctf->add8x8_idct      = x264_add8x8_idct_msa;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_msa;
        dctf->add16x16_idct    = x264_add16x16_idct_msa;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
        dctf->add8x8_idct8     = x264_add8x8_idct8_msa;
        dctf->add16x16_idct8   = x264_add16x16_idct8_msa;
    }
#endif
#endif // HIGH_BIT_DEPTH
}

#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)

#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
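
/* The field variants reorder the scan to visit vertically adjacent
 * coefficients sooner, matching the stronger vertical correlation of
 * interlaced residuals (scan orders per the H.264 spec). */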

static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FRAME
}

static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FIELD
}

#undef ZIG
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)

static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
{
    ZIGZAG4_FRAME
}

static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
{
    memcpy( level, dct, 2 * sizeof(dctcoef) );
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
}

#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );

static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}

static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}

#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}

static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}

static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}

static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FRAME
    COPY8x8
    return !!nz;
}

static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FIELD
    COPY8x8
    return !!nz;
}

static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
{
    for( int i = 0; i < 4; i++ )
    {
        int nz = 0;
        for( int j = 0; j < 16; j++ )
        {
            nz |= src[i+j*4];
            dst[i*16+j] = src[i+j*4];
        }
        nnz[(i&1) + (i>>1)*8] = !!nz;
    }
}
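
/* CAVLC codes an 8x8 transform block as four interleaved 4x4 scans:
 * dst[i*16..i*16+15] collects every 4th coefficient starting at offset i,
 * and each scan's nonzero flag is stored in the 4x4-grid nnz layout,
 * hence the (i&1) + (i>>1)*8 index. */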

void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
{
    pf_interlaced->scan_8x8   = zigzag_scan_8x8_field;
    pf_progressive->scan_8x8  = zigzag_scan_8x8_frame;
    pf_interlaced->scan_4x4   = zigzag_scan_4x4_field;
    pf_progressive->scan_4x4  = zigzag_scan_4x4_frame;
    pf_interlaced->sub_8x8    = zigzag_sub_8x8_field;
    pf_progressive->sub_8x8   = zigzag_sub_8x8_frame;
    pf_interlaced->sub_4x4    = zigzag_sub_4x4_field;
    pf_progressive->sub_4x4   = zigzag_sub_4x4_frame;
    pf_interlaced->sub_4x4ac  = zigzag_sub_4x4ac_field;
    pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse2;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
#if ARCH_X86_64
    if( cpu&X264_CPU_AVX )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
    }
#endif // ARCH_X86_64
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_avx512;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_avx512;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
    if( cpu&X264_CPU_MMX2 )
    {
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_mmx2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
    }
    if( cpu&X264_CPU_SSE )
        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse;
    if( cpu&X264_CPU_SSE2_IS_FAST )
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSSE3 )
    {
        pf_interlaced->sub_4x4    = x264_zigzag_sub_4x4_field_ssse3;
        pf_progressive->sub_4x4   = x264_zigzag_sub_4x4_frame_ssse3;
        pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_ssse3;
        pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_ssse3;
        pf_progressive->scan_8x8  = x264_zigzag_scan_8x8_frame_ssse3;
        if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->sub_4x4    = x264_zigzag_sub_4x4_field_avx;
        pf_progressive->sub_4x4   = x264_zigzag_sub_4x4_frame_avx;
#if ARCH_X86_64
        pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_avx;
        pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_avx;
#endif
        pf_progressive->scan_4x4  = x264_zigzag_scan_4x4_frame_avx;
    }
    if( cpu&X264_CPU_XOP )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_xop;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_avx512;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_avx512;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
    }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_altivec;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_altivec;
    }
#endif
#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#if ARCH_AARCH64
        pf_interlaced->scan_4x4   = x264_zigzag_scan_4x4_field_neon;
        pf_interlaced->scan_8x8   = x264_zigzag_scan_8x8_field_neon;
        pf_interlaced->sub_4x4    = x264_zigzag_sub_4x4_field_neon;
        pf_interlaced->sub_4x4ac  = x264_zigzag_sub_4x4ac_field_neon;
        pf_interlaced->sub_8x8    = x264_zigzag_sub_8x8_field_neon;
        pf_progressive->scan_8x8  = x264_zigzag_scan_8x8_frame_neon;
        pf_progressive->sub_4x4   = x264_zigzag_sub_4x4_frame_neon;
        pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
        pf_progressive->sub_8x8   = x264_zigzag_sub_8x8_frame_neon;
#endif // ARCH_AARCH64
    }
#endif // HAVE_ARMV6 || ARCH_AARCH64
#endif // HIGH_BIT_DEPTH

    pf_interlaced->interleave_8x8_cavlc =
    pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#if HAVE_MMX
#if HIGH_BIT_DEPTH
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
    }
#else
    if( cpu&X264_CPU_MMX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
    }
    if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }
    if( cpu&X264_CPU_AVX2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
    }
    if( cpu&X264_CPU_AVX512 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
    }
#endif // HIGH_BIT_DEPTH
#endif // HAVE_MMX
#if ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
    }
#endif // ARCH_AARCH64
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_altivec;
    }
#endif // HAVE_ALTIVEC
1148 if( cpu
&X264_CPU_MSA
)
1150 pf_progressive
->scan_4x4
= x264_zigzag_scan_4x4_frame_msa
;
1153 #endif // !HIGH_BIT_DEPTH