1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; Word (16-bit) constant tables referenced by the prediction routines below.
; pw_m32101234: the eight word values -3..4 in ascending order.
35 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
; pw_pixel_max: eight copies of (1 << 10) - 1 = 1023.
37 pw_pixel_max: times
8 dw ((1 << 10)-1)
; pw_512: eight copies of 512 (= 1 << 9).
38 pw_512: times
8 dw 512
44 ; macro arguments, in order: dest, left, right, src
45 ; output: %1 (dest) = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
46 %macro PRED4x4_LOWPASS
4
52 ;-----------------------------------------------------------------------------
53 ; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
54 ;-----------------------------------------------------------------------------
56 cglobal pred4x4_down_right_10
, 3, 3
60 movhps m2
, [r0
+r2
*1-8]
65 PALIGNR m3
, m1
, 10, m1
66 movhps m4
, [r1
+r2
*1-8]
67 PALIGNR m0
, m3
, m4
, 14, m4
68 movhps m4
, [r1
+r2
*2-8]
69 PALIGNR m2
, m0
, m4
, 14, m4
70 PRED4x4_LOWPASS m0
, m2
, m3
, m0
88 ;-----------------------------------------------------------------------------
89 ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
90 ;-----------------------------------------------------------------------------
92 cglobal pred4x4_vertical_right_10
, 3, 3, 6
95 movq m5
, [r0
] ; ........t3t2t1t0
97 PALIGNR m0
, m5
, m1
, 14, m1
; ......t3t2t1t0lt
99 movhps m1
, [r0
+r2
*1-8]
100 PALIGNR m0
, m1
, 14, m1
; ....t3t2t1t0ltl0
101 movhps m2
, [r0
+r2
*2-8]
102 PALIGNR m1
, m0
, m2
, 14, m2
; ..t3t2t1t0ltl0l1
103 movhps m3
, [r1
+r2
*1-8]
104 PALIGNR m2
, m1
, m3
, 14, m3
; t3t2t1t0ltl0l1l2
105 PRED4x4_LOWPASS m1
, m0
, m2
, m1
110 PALIGNR m5
, m0
, 14, m2
113 PALIGNR m1
, m0
, 14, m0
125 ;-----------------------------------------------------------------------------
126 ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
127 ;-----------------------------------------------------------------------------
129 cglobal pred4x4_horizontal_down_10
, 3, 3
132 movq m0
, [r0
-8] ; lt ..
134 pslldq m0
, 2 ; t2 t1 t0 lt .. .. .. ..
135 movq m1
, [r1
+r2
*2-8] ; l3
137 punpcklwd m1
, m3
; l2 l3
138 movq m2
, [r0
+r2
*2-8] ; l1
140 punpcklwd m2
, m3
; l0 l1
141 punpckhdq m1
, m2
; l0 l1 l2 l3
142 punpckhqdq m1
, m0
; t2 t1 t0 lt l0 l1 l2 l3
143 psrldq m0
, m1
, 4 ; .. .. t2 t1 t0 lt l0 l1
144 psrldq m3
, m1
, 2 ; .. t2 t1 t0 lt l0 l1 l2
146 PRED4x4_LOWPASS m3
, m1
, m0
, m3
149 PALIGNR m3
, m5
, 12, m4
165 ;-----------------------------------------------------------------------------
166 ; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
167 ;-----------------------------------------------------------------------------
168 %macro HADDD
2 ; sum junk
186 cglobal pred4x4_dc_10
, 3, 3
190 paddw m2
, [r0
+r2
*2-8]
191 paddw m2
, [r1
+r2
*1-8]
192 paddw m2
, [r1
+r2
*2-8]
206 ;-----------------------------------------------------------------------------
207 ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
208 ;-----------------------------------------------------------------------------
210 cglobal pred4x4_down_left_10
, 3, 3
216 pshufhw m2
, m2
, 10100100b
217 PRED4x4_LOWPASS m0
, m3
, m2
, m0
234 ;-----------------------------------------------------------------------------
235 ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
236 ;-----------------------------------------------------------------------------
238 cglobal pred4x4_vertical_left_10
, 3, 3
245 PRED4x4_LOWPASS m0
, m1
, m2
, m0
261 ;-----------------------------------------------------------------------------
262 ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
263 ;-----------------------------------------------------------------------------
265 cglobal pred4x4_horizontal_up_10
, 3, 3
269 punpckhwd m0
, [r0
+r2
*2-8]
271 punpckhwd m1
, [r1
+r2
*2-8]
276 pshufw m2
, m0
, 11111001b
280 pshufw m5
, m0
, 11111110b
281 PRED4x4_LOWPASS m1
, m0
, m5
, m1
295 ;-----------------------------------------------------------------------------
296 ; void pred8x8_vertical(pixel *src, int stride)
297 ;-----------------------------------------------------------------------------
299 cglobal pred8x8_vertical_10
, 2, 2
311 ;-----------------------------------------------------------------------------
312 ; void pred8x8_horizontal(pixel *src, int stride)
313 ;-----------------------------------------------------------------------------
315 cglobal pred8x8_horizontal_10
, 2, 3
331 ;-----------------------------------------------------------------------------
332 ; void pred8x8_dc(pixel *src, int stride)
333 ;-----------------------------------------------------------------------------
335 ; sort of a hack, but it works
345 cglobal pred8x8_dc_10
, 2, 6
355 pshufw m2
, m0
, 00001110b
356 pshufw m3
, m1
, 00001110b
366 movzx r2d
, word [r0
+r1
*1-2]
367 movzx r3d
, word [r0
+r1
*2-2]
369 movzx r3d
, word [r0
+r5
*1-2]
371 movzx r3d
, word [r4
-2]
375 movzx r2d
, word [r4
+r1
*1-2]
376 movzx r3d
, word [r4
+r1
*2-2]
378 movzx r3d
, word [r4
+r5
*1-2]
380 movzx r3d
, word [r4
+r1
*4-2]
385 punpckldq m0
, m2
; s0, s1, s2, s3
386 %1 m3
, m0
, 11110110b ; s2, s1, s3, s3
387 %1 m0
, m0
, 01110100b ; s0, s1, s3, s1
390 pavgw m0
, m4
; s0+s2, s1, s3, s1+s3
393 pshufd m3
, m0
, 11111010b
418 ;-----------------------------------------------------------------------------
419 ; void pred8x8_top_dc(pixel *src, int stride)
420 ;-----------------------------------------------------------------------------
422 cglobal pred8x8_top_dc_10
, 2, 4
445 ;-----------------------------------------------------------------------------
446 ; void pred8x8_plane(pixel *src, int stride)
447 ;-----------------------------------------------------------------------------
449 cglobal pred8x8_plane_10
, 2, 7, 7
454 pmaddwd m2
, [pw_m32101234
]
462 psllw m0
, 4 ; 16*(src[7*stride-1] + src[-stride+7])
463 movzx r4d
, word [r3
+r1
*1-2] ; src[4*stride-1]
464 movzx r5d
, word [r0
+r2
*1-2] ; src[2*stride-1]
466 movzx r6d
, word [r3
+r1
*2-2] ; src[5*stride-1]
467 movzx r5d
, word [r0
+r1
*2-2] ; src[1*stride-1]
470 movzx r5d
, word [r3
+r2
*1-2] ; src[6*stride-1]
471 movzx r6d
, word [r0
+r1
*1-2] ; src[0*stride-1]
475 movzx r6d
, word [r3
+r1
*4-2] ; src[7*stride-1]
476 movzx r5d
, word [r0
+r1
*0-2] ; src[ -stride-1]
485 mova m3
, [pw_pixel_max
]
490 pmullw m2
, [pw_m32101234
] ; b
491 pmullw m5
, m4
, [pw_m3
] ; c
508 ;-----------------------------------------------------------------------------
509 ; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
510 ;-----------------------------------------------------------------------------
511 %macro PRED8x8L_128_DC
0
512 cglobal pred8x8l_128_dc_10
, 4, 4
513 mova m0
, [pw_512
] ; (1<<(BIT_DEPTH-1))
532 ;-----------------------------------------------------------------------------
533 ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
534 ;-----------------------------------------------------------------------------
535 %macro PRED8x8L_TOP_DC
0
536 cglobal pred8x8l_top_dc_10
, 4, 4, 6
544 pinsrw m1
, [r0
+r1
], 0
545 pinsrw m2
, [r0
+r2
+14], 7
548 PRED4x4_LOWPASS m0
, m2
, m1
, m0
569 ;-----------------------------------------------------------------------------
570 ; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
571 ;-----------------------------------------------------------------------------
572 ;TODO: see if scalar is faster
574 cglobal pred8x8l_dc_10
, 4, 6, 6
578 mova m0
, [r0
+r3
*2-16]
579 punpckhwd m0
, [r0
+r3
*1-16]
580 mova m1
, [r4
+r3
*0-16]
581 punpckhwd m1
, [r0
+r5
*1-16]
583 mova m2
, [r4
+r3
*2-16]
584 punpckhwd m2
, [r4
+r3
*1-16]
585 mova m3
, [r4
+r3
*4-16]
586 punpckhwd m3
, [r4
+r5
*1-16]
595 pinsrw m1
, [r0
+r1
], 0
596 pinsrw m2
, [r0
+r2
+14], 7
601 pshuflw m4
, m4
, 11100101b
602 pinsrw m5
, [r0
+r1
-2], 7
603 PRED4x4_LOWPASS m3
, m4
, m5
, m3
604 PRED4x4_LOWPASS m0
, m2
, m1
, m0
626 ;-----------------------------------------------------------------------------
627 ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
628 ;-----------------------------------------------------------------------------
629 %macro PRED8x8L_VERTICAL
0
630 cglobal pred8x8l_vertical_10
, 4, 4, 6
638 pinsrw m1
, [r0
+r1
], 0
639 pinsrw m2
, [r0
+r2
+14], 7
642 PRED4x4_LOWPASS m0
, m2
, m1
, m0
659 ;-----------------------------------------------------------------------------
660 ; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
661 ;-----------------------------------------------------------------------------
662 %macro PRED8x8L_HORIZONTAL
0
663 cglobal pred8x8l_horizontal_10
, 4, 4, 5
669 punpckhwd m0
, [r0
+r1
-16]
670 mova m1
, [r0
+r3
*2-16]
671 punpckhwd m1
, [r0
+r3
*1-16]
675 mova m2
, [r2
+r3
*0-16]
676 punpckhwd m2
, [r0
+r1
-16]
677 mova m3
, [r2
+r3
*2-16]
678 punpckhwd m3
, [r2
+r3
*1-16]
681 PALIGNR m4
, m3
, [r2
+r1
-16], 14, m0
683 pshuflw m0
, m0
, 11100101b
684 PRED4x4_LOWPASS m4
, m3
, m0
, m4
713 ;-----------------------------------------------------------------------------
714 ; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
715 ;-----------------------------------------------------------------------------
716 %macro PRED8x8L_DOWN_LEFT
0
717 cglobal pred8x8l_down_left_10
, 4, 4, 7
725 pinsrw m1
, [r0
+r1
], 0
726 pinsrw m2
, [r0
+r2
+14], 7
727 PRED4x4_LOWPASS m6
, m2
, m1
, m3
728 jz .fix_tr
; flags from shr r2d
731 PALIGNR m2
, m1
, m3
, 14, m3
732 pshufhw m5
, m5
, 10100100b
733 PRED4x4_LOWPASS m1
, m2
, m5
, m1
738 PALIGNR m2
, m1
, m6
, 2, m0
739 PALIGNR m3
, m1
, m6
, 14, m0
740 PALIGNR m5
, m1
, 2, m0
742 PRED4x4_LOWPASS m6
, m4
, m2
, m6
743 PRED4x4_LOWPASS m1
, m3
, m5
, m1
745 PALIGNR m1
, m6
, 14, m2
748 PALIGNR m1
, m6
, 14, m2
751 PALIGNR m1
, m6
, 14, m2
754 PALIGNR m1
, m6
, 14, m2
757 PALIGNR m1
, m6
, 14, m2
760 PALIGNR m1
, m6
, 14, m2
763 PALIGNR m1
, m6
, 14, m6
779 ;-----------------------------------------------------------------------------
780 ; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
781 ;-----------------------------------------------------------------------------
782 %macro PRED8x8L_DOWN_RIGHT
0
783 ; standard forbids this when has_topleft is false
785 cglobal pred8x8l_down_right_10
, 4, 5, 8
789 mova m0
, [r0
+r3
*1-16]
790 punpckhwd m0
, [r0
+r3
*0-16]
791 mova m1
, [r0
+r1
*1-16]
792 punpckhwd m1
, [r0
+r3
*2-16]
794 mova m2
, [r4
+r3
*1-16]
795 punpckhwd m2
, [r4
+r3
*0-16]
796 mova m3
, [r4
+r1
*1-16]
797 punpckhwd m3
, [r4
+r3
*2-16]
800 mova m0
, [r4
+r3
*4-16]
802 PALIGNR m4
, m3
, m0
, 14, m0
803 PALIGNR m1
, m3
, 2, m2
805 pshuflw m0
, m0
, 11100101b
806 PRED4x4_LOWPASS m6
, m1
, m4
, m3
807 PRED4x4_LOWPASS m4
, m3
, m0
, m4
813 pinsrw m2
, [r0
+r2
+14], 7
814 PRED4x4_LOWPASS m3
, m2
, m1
, m3
815 PALIGNR m2
, m3
, m6
, 2, m0
816 PALIGNR m5
, m3
, m6
, 14, m0
818 PRED4x4_LOWPASS m6
, m4
, m2
, m6
819 PRED4x4_LOWPASS m3
, m5
, m7
, m3
821 PALIGNR m3
, m6
, 14, m2
824 PALIGNR m3
, m6
, 14, m2
827 PALIGNR m3
, m6
, 14, m2
830 PALIGNR m3
, m6
, 14, m2
833 PALIGNR m3
, m6
, 14, m2
836 PALIGNR m3
, m6
, 14, m2
839 PALIGNR m3
, m6
, 14, m6
851 ;-----------------------------------------------------------------------------
852 ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
853 ;-----------------------------------------------------------------------------
854 %macro PRED8x8L_VERTICAL_RIGHT
0
855 ; as with pred8x8l_down_right: the standard forbids this mode when has_topleft is false
856 cglobal pred8x8l_vertical_right_10
, 4, 5, 7
860 mova m0
, [r0
+r3
*1-16]
861 punpckhwd m0
, [r0
+r3
*0-16]
862 mova m1
, [r0
+r1
*1-16]
863 punpckhwd m1
, [r0
+r3
*2-16]
865 mova m2
, [r4
+r3
*1-16]
866 punpckhwd m2
, [r4
+r3
*0-16]
867 mova m3
, [r4
+r1
*1-16]
868 punpckhwd m3
, [r4
+r3
*2-16]
871 mova m0
, [r4
+r3
*4-16]
873 PALIGNR m4
, m3
, m0
, 14, m0
874 PALIGNR m1
, m3
, 2, m2
875 PRED4x4_LOWPASS m3
, m1
, m4
, m3
881 pinsrw m5
, [r0
+r2
+14], 7
882 PRED4x4_LOWPASS m2
, m5
, m1
, m2
883 PALIGNR m6
, m2
, m3
, 12, m1
884 PALIGNR m5
, m2
, m3
, 14, m0
885 PRED4x4_LOWPASS m0
, m6
, m2
, m5
891 PRED4x4_LOWPASS m1
, m3
, m6
, m1
892 PALIGNR m2
, m1
, 14, m4
895 PALIGNR m0
, m1
, 14, m3
898 PALIGNR m2
, m1
, 14, m4
901 PALIGNR m0
, m1
, 14, m3
904 PALIGNR m2
, m1
, 14, m4
907 PALIGNR m0
, m1
, 14, m1
913 PRED8x8L_VERTICAL_RIGHT
915 PRED8x8L_VERTICAL_RIGHT
917 PRED8x8L_VERTICAL_RIGHT
919 ;-----------------------------------------------------------------------------
920 ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
921 ;-----------------------------------------------------------------------------
922 %macro PRED8x8L_HORIZONTAL_UP
0
923 cglobal pred8x8l_horizontal_up_10
, 4, 4, 6
924 mova m0
, [r0
+r3
*0-16]
925 punpckhwd m0
, [r0
+r3
*1-16]
930 mova m4
, [r0
+r1
*1-16]
933 mova m1
, [r0
+r3
*2-16]
934 punpckhwd m1
, [r0
+r1
*1-16]
936 mova m2
, [r2
+r3
*0-16]
937 punpckhwd m2
, [r2
+r3
*1-16]
938 mova m3
, [r2
+r3
*2-16]
939 punpckhwd m3
, [r2
+r1
*1-16]
942 PALIGNR m1
, m0
, m4
, 14, m4
944 pshufhw m2
, m2
, 10100100b
945 PRED4x4_LOWPASS m0
, m1
, m2
, m0
948 pshufhw m1
, m1
, 10100100b
949 pshufhw m2
, m2
, 01010100b
951 PRED4x4_LOWPASS m1
, m2
, m0
, m1
956 pshufd m0
, m5
, 11111001b
957 pshufd m1
, m5
, 11111110b
958 pshufd m2
, m5
, 11111111b
962 PALIGNR m2
, m5
, m4
, 4, m0
963 PALIGNR m3
, m5
, m4
, 8, m1
964 PALIGNR m5
, m5
, m4
, 12, m4
972 PRED8x8L_HORIZONTAL_UP
974 PRED8x8L_HORIZONTAL_UP
976 PRED8x8L_HORIZONTAL_UP
979 ;-----------------------------------------------------------------------------
980 ; void pred16x16_vertical(pixel *src, int stride)
981 ;-----------------------------------------------------------------------------
991 %macro PRED16x16_VERTICAL
0
992 cglobal pred16x16_vertical_10
, 2, 3
1002 MOV16 r0
+r1
*1, m0
, m1
, m2
, m3
1003 MOV16 r0
+r1
*2, m0
, m1
, m2
, m3
1015 ;-----------------------------------------------------------------------------
1016 ; void pred16x16_horizontal(pixel *src, int stride)
1017 ;-----------------------------------------------------------------------------
1018 %macro PRED16x16_HORIZONTAL
0
1019 cglobal pred16x16_horizontal_10
, 2, 3
1022 movd m0
, [r0
+r1
*0-4]
1023 movd m1
, [r0
+r1
*1-4]
1026 MOV16 r0
+r1
*0, m0
, m0
, m0
, m0
1027 MOV16 r0
+r1
*1, m1
, m1
, m1
, m1
1035 PRED16x16_HORIZONTAL
1037 PRED16x16_HORIZONTAL
1039 ;-----------------------------------------------------------------------------
1040 ; void pred16x16_dc(pixel *src, int stride)
1041 ;-----------------------------------------------------------------------------
1042 %macro PRED16x16_DC
0
1043 cglobal pred16x16_dc_10
, 2, 6
1047 paddw m0
, [r0
+mmsize
]
1055 movzx r3d
, word [r0
]
1056 movzx r4d
, word [r0
+r1
]
1059 movzx r2d
, word [r0
]
1061 movzx r2d
, word [r0
+r1
]
1072 MOV16 r5
+r1
*0, m0
, m0
, m0
, m0
1073 MOV16 r5
+r1
*1, m0
, m0
, m0
, m0
1085 ;-----------------------------------------------------------------------------
1086 ; void pred16x16_top_dc(pixel *src, int stride)
1087 ;-----------------------------------------------------------------------------
1088 %macro PRED16x16_TOP_DC
0
1089 cglobal pred16x16_top_dc_10
, 2, 3
1092 paddw m0
, [r0
+mmsize
]
1104 MOV16 r0
+r1
*1, m0
, m0
, m0
, m0
1105 MOV16 r0
+r1
*2, m0
, m0
, m0
, m0
1117 ;-----------------------------------------------------------------------------
1118 ; void pred16x16_left_dc(pixel *src, int stride)
1119 ;-----------------------------------------------------------------------------
1120 %macro PRED16x16_LEFT_DC
0
1121 cglobal pred16x16_left_dc_10
, 2, 6
1125 movzx r3d
, word [r0
]
1126 movzx r4d
, word [r0
+r1
]
1129 movzx r2d
, word [r0
]
1131 movzx r2d
, word [r0
+r1
]
1141 MOV16 r5
+r1
*0, m0
, m0
, m0
, m0
1142 MOV16 r5
+r1
*1, m0
, m0
, m0
, m0
1154 ;-----------------------------------------------------------------------------
1155 ; void pred16x16_128_dc(pixel *src, int stride)
1156 ;-----------------------------------------------------------------------------
1157 %macro PRED16x16_128_DC
0
1158 cglobal pred16x16_128_dc_10
, 2,3
1162 MOV16 r0
+r1
*0, m0
, m0
, m0
, m0
1163 MOV16 r0
+r1
*1, m0
, m0
, m0
, m0