1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512DQ
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=ALL,AVX512VBMI
7 define <64 x i8> @shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<64 x i8> %a) {
8 ; ALL-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
10 ; ALL-NEXT: vpsrld $16, %xmm0, %xmm0
12 %b = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
16 define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
17 ; AVX512F-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
19 ; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
20 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
21 ; AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
22 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
25 ; AVX512BW-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
27 ; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
30 ; AVX512DQ-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
32 ; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
33 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
34 ; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
35 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
38 ; AVX512VBMI-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
39 ; AVX512VBMI: # %bb.0:
40 ; AVX512VBMI-NEXT: vpslldq {{.*#+}} zmm0 = zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
41 ; AVX512VBMI-NEXT: retq
42 %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
43 ret <64 x i8> %shuffle
46 define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz(<64 x i8> %a, <64 x i8> %b) {
47 ; AVX512F-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
49 ; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
50 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
51 ; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
52 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
55 ; AVX512BW-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
57 ; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
60 ; AVX512DQ-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
62 ; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
63 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
64 ; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
65 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
68 ; AVX512VBMI-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
69 ; AVX512VBMI: # %bb.0:
70 ; AVX512VBMI-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
71 ; AVX512VBMI-NEXT: retq
72 %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64>
73 ret <64 x i8> %shuffle
76 define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
77 ; AVX512F-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
79 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
80 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
81 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
82 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
83 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
86 ; AVX512BW-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
88 ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
91 ; AVX512DQ-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
93 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
94 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
95 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
96 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
97 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
100 ; AVX512VBMI-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
101 ; AVX512VBMI: # %bb.0:
102 ; AVX512VBMI-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
103 ; AVX512VBMI-NEXT: retq
104 %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
105 ret <64 x i8> %shuffle
109 define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
110 ; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
112 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
113 ; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0
116 ; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
118 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
119 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
120 ; AVX512BW-NEXT: retq
122 ; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
124 ; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0]
125 ; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0
126 ; AVX512DQ-NEXT: retq
128 ; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
129 ; AVX512VBMI: # %bb.0:
130 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
131 ; AVX512VBMI-NEXT: vpandq %zmm1, %zmm0, %zmm0
132 ; AVX512VBMI-NEXT: retq
133 %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
134 ret <64 x i8> %shuffle
137 define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<64 x i8> %a, <64 x i8> %b) {
138 ; AVX512F-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
140 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
141 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
144 ; AVX512BW-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
146 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
147 ; AVX512BW-NEXT: retq
149 ; AVX512DQ-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
151 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
152 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
153 ; AVX512DQ-NEXT: retq
155 ; AVX512VBMI-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
156 ; AVX512VBMI: # %bb.0:
157 ; AVX512VBMI-NEXT: vpbroadcastb %xmm0, %zmm0
158 ; AVX512VBMI-NEXT: retq
159 %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
160 ret <64 x i8> %shuffle
163 define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) {
164 ; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
166 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
167 ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm2
168 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
169 ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0
170 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
171 ; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
174 ; AVX512BW-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
176 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48]
177 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
178 ; AVX512BW-NEXT: retq
180 ; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
182 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
183 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
184 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
185 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
186 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
187 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
188 ; AVX512DQ-NEXT: retq
190 ; AVX512VBMI-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
191 ; AVX512VBMI: # %bb.0:
192 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
193 ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
194 ; AVX512VBMI-NEXT: retq
195 %shuffle = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
196 ret <64 x i8> %shuffle
200 define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_18_19_20_21_22_23_16_17_26_27_28_29_30_31_24_25_34_35_36_37_38_39_32_33_42_43_44_45_46_47_40_41_50_51_52_53_54_55_48_49_58_59_60_61_62_63_56_57(<64 x i8> %a) {
201 ; ALL-LABEL: shuffle_v64i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_18_19_20_21_22_23_16_17_26_27_28_29_30_31_24_25_34_35_36_37_38_39_32_33_42_43_44_45_46_47_40_41_50_51_52_53_54_55_48_49_58_59_60_61_62_63_56_57:
203 ; ALL-NEXT: vprolq $48, %zmm0, %zmm0
205 %shuffle = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 24, i32 25, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 32, i32 33, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 40, i32 41, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 48, i32 49, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 56, i32 57>
206 ret <64 x i8> %shuffle
209 define <64 x i8> @insert_dup_mem_v64i8_i32(i32* %ptr) {
210 ; AVX512F-LABEL: insert_dup_mem_v64i8_i32:
212 ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
213 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
216 ; AVX512BW-LABEL: insert_dup_mem_v64i8_i32:
218 ; AVX512BW-NEXT: vpbroadcastb (%rdi), %zmm0
219 ; AVX512BW-NEXT: retq
221 ; AVX512DQ-LABEL: insert_dup_mem_v64i8_i32:
223 ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
224 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
225 ; AVX512DQ-NEXT: retq
227 ; AVX512VBMI-LABEL: insert_dup_mem_v64i8_i32:
228 ; AVX512VBMI: # %bb.0:
229 ; AVX512VBMI-NEXT: vpbroadcastb (%rdi), %zmm0
230 ; AVX512VBMI-NEXT: retq
231 %tmp = load i32, i32* %ptr, align 4
232 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
233 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
234 %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> zeroinitializer
238 define <64 x i8> @insert_dup_mem_v64i8_sext_i8(i8* %ptr) {
239 ; AVX512F-LABEL: insert_dup_mem_v64i8_sext_i8:
241 ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
242 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
245 ; AVX512BW-LABEL: insert_dup_mem_v64i8_sext_i8:
247 ; AVX512BW-NEXT: vpbroadcastb (%rdi), %zmm0
248 ; AVX512BW-NEXT: retq
250 ; AVX512DQ-LABEL: insert_dup_mem_v64i8_sext_i8:
252 ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
253 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
254 ; AVX512DQ-NEXT: retq
256 ; AVX512VBMI-LABEL: insert_dup_mem_v64i8_sext_i8:
257 ; AVX512VBMI: # %bb.0:
258 ; AVX512VBMI-NEXT: vpbroadcastb (%rdi), %zmm0
259 ; AVX512VBMI-NEXT: retq
260 %tmp = load i8, i8* %ptr, align 1
261 %tmp1 = sext i8 %tmp to i32
262 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
263 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
264 %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <64 x i32> zeroinitializer
268 define <64 x i8> @insert_dup_elt1_mem_v64i8_i32(i32* %ptr) {
269 ; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_i32:
271 ; AVX512F-NEXT: vpbroadcastb 1(%rdi), %ymm0
272 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
275 ; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_i32:
277 ; AVX512BW-NEXT: vpbroadcastb 1(%rdi), %zmm0
278 ; AVX512BW-NEXT: retq
280 ; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_i32:
282 ; AVX512DQ-NEXT: vpbroadcastb 1(%rdi), %ymm0
283 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
284 ; AVX512DQ-NEXT: retq
286 ; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_i32:
287 ; AVX512VBMI: # %bb.0:
288 ; AVX512VBMI-NEXT: vpbroadcastb 1(%rdi), %zmm0
289 ; AVX512VBMI-NEXT: retq
290 %tmp = load i32, i32* %ptr, align 4
291 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
292 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
293 %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
297 define <64 x i8> @insert_dup_elt3_mem_v64i8_i32(i32* %ptr) {
298 ; AVX512F-LABEL: insert_dup_elt3_mem_v64i8_i32:
300 ; AVX512F-NEXT: vpbroadcastb 3(%rdi), %ymm0
301 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
304 ; AVX512BW-LABEL: insert_dup_elt3_mem_v64i8_i32:
306 ; AVX512BW-NEXT: vpbroadcastb 3(%rdi), %zmm0
307 ; AVX512BW-NEXT: retq
309 ; AVX512DQ-LABEL: insert_dup_elt3_mem_v64i8_i32:
311 ; AVX512DQ-NEXT: vpbroadcastb 3(%rdi), %ymm0
312 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
313 ; AVX512DQ-NEXT: retq
315 ; AVX512VBMI-LABEL: insert_dup_elt3_mem_v64i8_i32:
316 ; AVX512VBMI: # %bb.0:
317 ; AVX512VBMI-NEXT: vpbroadcastb 3(%rdi), %zmm0
318 ; AVX512VBMI-NEXT: retq
319 %tmp = load i32, i32* %ptr, align 4
320 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
321 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
322 %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
326 define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) {
327 ; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
329 ; AVX512F-NEXT: movsbl (%rdi), %eax
330 ; AVX512F-NEXT: shrl $8, %eax
331 ; AVX512F-NEXT: vmovd %eax, %xmm0
332 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
333 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
336 ; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
338 ; AVX512BW-NEXT: movsbl (%rdi), %eax
339 ; AVX512BW-NEXT: shrl $8, %eax
340 ; AVX512BW-NEXT: vpbroadcastb %eax, %zmm0
341 ; AVX512BW-NEXT: retq
343 ; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
345 ; AVX512DQ-NEXT: movsbl (%rdi), %eax
346 ; AVX512DQ-NEXT: shrl $8, %eax
347 ; AVX512DQ-NEXT: vmovd %eax, %xmm0
348 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
349 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
350 ; AVX512DQ-NEXT: retq
352 ; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
353 ; AVX512VBMI: # %bb.0:
354 ; AVX512VBMI-NEXT: movsbl (%rdi), %eax
355 ; AVX512VBMI-NEXT: shrl $8, %eax
356 ; AVX512VBMI-NEXT: vpbroadcastb %eax, %zmm0
357 ; AVX512VBMI-NEXT: retq
358 %tmp = load i8, i8* %ptr, align 1
359 %tmp1 = sext i8 %tmp to i32
360 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
361 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
362 %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <64 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
366 define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz(<64 x i8> %a) {
367 ; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
369 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
370 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
371 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
372 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
375 ; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
377 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
378 ; AVX512BW-NEXT: retq
380 ; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
382 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
383 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
384 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
385 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
386 ; AVX512DQ-NEXT: retq
388 ; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
389 ; AVX512VBMI: # %bb.0:
390 ; AVX512VBMI-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
391 ; AVX512VBMI-NEXT: retq
392 %shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 65, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 66, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 67, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 68, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 69, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 70, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 71, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
393 ret <64 x i8> %shuffle
396 define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz(<64 x i8> %a) {
397 ; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
399 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
400 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
401 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
402 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
405 ; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
407 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
408 ; AVX512BW-NEXT: retq
410 ; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
412 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
413 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
414 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
415 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
416 ; AVX512DQ-NEXT: retq
418 ; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
419 ; AVX512VBMI: # %bb.0:
420 ; AVX512VBMI-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
421 ; AVX512VBMI-NEXT: retq
422 %shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 0, i32 0, i32 65, i32 0, i32 0, i32 0, i32 66, i32 0, i32 0, i32 0, i32 67, i32 0, i32 0, i32 0, i32 68, i32 0, i32 0, i32 0, i32 69, i32 0, i32 0, i32 0, i32 70, i32 0, i32 0, i32 0, i32 71, i32 0, i32 0, i32 0, i32 72, i32 0, i32 0, i32 0, i32 73, i32 0, i32 0, i32 0, i32 74, i32 0, i32 0, i32 0, i32 75, i32 0, i32 0, i32 0, i32 76, i32 0, i32 0, i32 0, i32 77, i32 0, i32 0, i32 0, i32 78, i32 0, i32 0, i32 0, i32 79, i32 0, i32 0, i32 0>
423 ret <64 x i8> %shuffle
426 define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz(<64 x i8> %a) {
427 ; AVX512F-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
429 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
430 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
431 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
432 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
435 ; AVX512BW-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
437 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
438 ; AVX512BW-NEXT: retq
440 ; AVX512DQ-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
442 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
443 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
444 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
445 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
446 ; AVX512DQ-NEXT: retq
448 ; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
449 ; AVX512VBMI: # %bb.0:
450 ; AVX512VBMI-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
451 ; AVX512VBMI-NEXT: retq
452 %shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 65, i32 0, i32 66, i32 0, i32 67, i32 0, i32 68, i32 0, i32 69, i32 0, i32 70, i32 0, i32 71, i32 0, i32 72, i32 0, i32 73, i32 0, i32 74, i32 0, i32 75, i32 0, i32 76, i32 0, i32 77, i32 0, i32 78, i32 0, i32 79, i32 0, i32 80, i32 0, i32 81, i32 0, i32 82, i32 0, i32 83, i32 0, i32 84, i32 0, i32 85, i32 0, i32 86, i32 0, i32 87, i32 0, i32 88, i32 0, i32 89, i32 0, i32 90, i32 0, i32 91, i32 0, i32 92, i32 0, i32 93, i32 0, i32 94, i32 0, i32 95, i32 0>
453 ret <64 x i8> %shuffle
456 define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz(<64 x i8> %a) {
457 ; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
459 ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
460 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
461 ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
462 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
463 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
464 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
465 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
468 ; AVX512BW-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
470 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
471 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zmm0[13],zero,zmm0[11],zero,zmm0[9],zero,zmm0[7],zero,zmm0[5],zero,zmm0[3],zero,zmm0[1],zero,zmm0[31],zero,zmm0[29],zero,zmm0[27],zero,zmm0[25],zero,zmm0[23],zero,zmm0[21],zero,zmm0[19],zero,zmm0[17],zero,zmm0[47],zero,zmm0[45],zero,zmm0[43],zero,zmm0[41],zero,zmm0[39],zero,zmm0[37],zero,zmm0[35],zero,zmm0[33],zero,zmm0[63],zero,zmm0[61],zero,zmm0[59],zero,zmm0[57],zero,zmm0[55],zero,zmm0[53],zero,zmm0[51],zero,zmm0[49],zero
472 ; AVX512BW-NEXT: retq
474 ; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
476 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
477 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
478 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
479 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
480 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
481 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
482 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
483 ; AVX512DQ-NEXT: retq
485 ; AVX512VBMI-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
486 ; AVX512VBMI: # %bb.0:
487 ; AVX512VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
488 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,65,61,67,59,69,57,71,55,73,53,75,51,77,49,79,47,81,45,83,43,85,41,87,39,89,37,91,35,93,33,95,31,97,29,99,27,101,25,103,23,105,21,107,19,109,17,111,15,113,13,115,11,117,9,119,7,121,5,123,3,125,1,127]
489 ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
490 ; AVX512VBMI-NEXT: retq
491 %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 63, i32 64, i32 61, i32 64, i32 59, i32 64, i32 57, i32 64, i32 55, i32 64, i32 53, i32 64, i32 51, i32 64, i32 49, i32 64, i32 47, i32 64, i32 45, i32 64, i32 43, i32 64, i32 41, i32 64, i32 39, i32 64, i32 37, i32 64, i32 35, i32 64, i32 33, i32 64, i32 31, i32 64, i32 29, i32 64, i32 27, i32 64, i32 25, i32 64, i32 23, i32 64, i32 21, i32 64, i32 19, i32 64, i32 17, i32 64, i32 15, i32 64, i32 13, i32 64, i32 11, i32 64, i32 9, i32 64, i32 7, i32 64, i32 5, i32 64, i32 3, i32 64, i32 1, i32 64>
492 ret <64 x i8> %shuffle
495 define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) {
496 ; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
498 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
499 ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
500 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
501 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
502 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
503 ; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2
504 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
505 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
506 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
507 ; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0
508 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
511 ; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
513 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
514 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
515 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
516 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
517 ; AVX512BW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
518 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
519 ; AVX512BW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
520 ; AVX512BW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
521 ; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
522 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
523 ; AVX512BW-NEXT: retq
525 ; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
527 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
528 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
529 ; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
530 ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
531 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
532 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
533 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
534 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
535 ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
536 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
537 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
538 ; AVX512DQ-NEXT: retq
540 ; AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
541 ; AVX512VBMI: # %bb.0:
542 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,64,61,66,59,68,57,70,55,72,53,74,51,76,49,78,47,80,45,82,43,84,41,86,39,88,37,90,35,92,33,94,31,96,29,98,27,100,25,102,23,104,21,106,19,108,17,110,15,112,13,114,11,116,9,118,7,120,5,122,3,124,1,126]
543 ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
544 ; AVX512VBMI-NEXT: retq
545 %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 63, i32 64, i32 61, i32 66, i32 59, i32 68, i32 57, i32 70, i32 55, i32 72, i32 53, i32 74, i32 51, i32 76, i32 49, i32 78, i32 47, i32 80, i32 45, i32 82, i32 43, i32 84, i32 41, i32 86, i32 39, i32 88, i32 37, i32 90, i32 35, i32 92, i32 33, i32 94, i32 31, i32 96, i32 29, i32 98, i32 27, i32 100, i32 25, i32 102, i32 23, i32 104, i32 21, i32 106, i32 19, i32 108, i32 17, i32 110, i32 15, i32 112, i32 13, i32 114, i32 11, i32 116, i32 9, i32 118, i32 7, i32 120, i32 5, i32 122, i32 3, i32 124, i32 1, i32 126>
546 ret <64 x i8> %shuffle
549 define <64 x i8> @shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind {
550 ; AVX512F-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
552 ; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0
553 ; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1
554 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
555 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
556 ; AVX512F-NEXT: vpackssdw %ymm2, %ymm3, %ymm2
557 ; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
558 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
561 ; AVX512BW-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
563 ; AVX512BW-NEXT: vpsrad $25, %zmm0, %zmm0
564 ; AVX512BW-NEXT: vpsrad $25, %zmm1, %zmm1
565 ; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
566 ; AVX512BW-NEXT: retq
568 ; AVX512DQ-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
570 ; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0
571 ; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1
572 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
573 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
574 ; AVX512DQ-NEXT: vpackssdw %ymm2, %ymm3, %ymm2
575 ; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
576 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
577 ; AVX512DQ-NEXT: retq
579 ; AVX512VBMI-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
580 ; AVX512VBMI: # %bb.0:
581 ; AVX512VBMI-NEXT: vpsrad $25, %zmm0, %zmm0
582 ; AVX512VBMI-NEXT: vpsrad $25, %zmm1, %zmm1
583 ; AVX512VBMI-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
584 ; AVX512VBMI-NEXT: retq
585 %1 = ashr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
586 %2 = ashr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
587 %3 = bitcast <16 x i32> %1 to <64 x i8>
588 %4 = bitcast <16 x i32> %2 to <64 x i8>
589 %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 60, i32 61, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125>
593 define <64 x i8> @shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124(<16 x i32> %a0, <16 x i32> %a1) nounwind {
594 ; AVX512F-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
596 ; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0
597 ; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1
598 ; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm2
599 ; AVX512F-NEXT: vpacksswb %ymm2, %ymm2, %ymm2
600 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
601 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
602 ; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
603 ; AVX512F-NEXT: vpacksswb %ymm0, %ymm0, %ymm0
604 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
607 ; AVX512BW-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
609 ; AVX512BW-NEXT: vpsrad $25, %zmm0, %zmm0
610 ; AVX512BW-NEXT: vpsrad $25, %zmm1, %zmm1
611 ; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
612 ; AVX512BW-NEXT: vpacksswb %zmm0, %zmm0, %zmm0
613 ; AVX512BW-NEXT: retq
615 ; AVX512DQ-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
617 ; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0
618 ; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1
619 ; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm2
620 ; AVX512DQ-NEXT: vpacksswb %ymm2, %ymm2, %ymm2
621 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
622 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
623 ; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
624 ; AVX512DQ-NEXT: vpacksswb %ymm0, %ymm0, %ymm0
625 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
626 ; AVX512DQ-NEXT: retq
628 ; AVX512VBMI-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
629 ; AVX512VBMI: # %bb.0:
630 ; AVX512VBMI-NEXT: vpsrad $25, %zmm0, %zmm0
631 ; AVX512VBMI-NEXT: vpsrad $25, %zmm1, %zmm1
632 ; AVX512VBMI-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
633 ; AVX512VBMI-NEXT: vpacksswb %zmm0, %zmm0, %zmm0
634 ; AVX512VBMI-NEXT: retq
635 %1 = ashr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
636 %2 = ashr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
637 %3 = bitcast <16 x i32> %1 to <64 x i8>
638 %4 = bitcast <16 x i32> %2 to <64 x i8>
639 %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 64, i32 68, i32 72, i32 76, i32 0, i32 4, i32 8, i32 12, i32 64, i32 68, i32 72, i32 76, i32 16, i32 20, i32 24, i32 28, i32 80, i32 84, i32 88, i32 92, i32 16, i32 20, i32 24, i32 28, i32 80, i32 84, i32 88, i32 92, i32 32, i32 36, i32 40, i32 44, i32 96, i32 100, i32 104, i32 108, i32 32, i32 36, i32 40, i32 44, i32 96, i32 100, i32 104, i32 108, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124>
643 define <64 x i8> @shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind {
644 ; AVX512F-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
646 ; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0
647 ; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1
648 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
649 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
650 ; AVX512F-NEXT: vpackusdw %ymm2, %ymm3, %ymm2
651 ; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
652 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
655 ; AVX512BW-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
657 ; AVX512BW-NEXT: vpsrld $25, %zmm0, %zmm0
658 ; AVX512BW-NEXT: vpsrld $25, %zmm1, %zmm1
659 ; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
660 ; AVX512BW-NEXT: retq
662 ; AVX512DQ-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
664 ; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0
665 ; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1
666 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
667 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
668 ; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm3, %ymm2
669 ; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
670 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
671 ; AVX512DQ-NEXT: retq
673 ; AVX512VBMI-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
674 ; AVX512VBMI: # %bb.0:
675 ; AVX512VBMI-NEXT: vpsrld $25, %zmm0, %zmm0
676 ; AVX512VBMI-NEXT: vpsrld $25, %zmm1, %zmm1
677 ; AVX512VBMI-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
678 ; AVX512VBMI-NEXT: retq
679 %1 = lshr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
680 %2 = lshr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
681 %3 = bitcast <16 x i32> %1 to <64 x i8>
682 %4 = bitcast <16 x i32> %2 to <64 x i8>
683 %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 60, i32 61, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125>
687 define <64 x i8> @shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124(<16 x i32> %a0, <16 x i32> %a1) nounwind {
688 ; AVX512F-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
690 ; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0
691 ; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1
692 ; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm2
693 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm2, %ymm2
694 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
695 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
696 ; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
697 ; AVX512F-NEXT: vpackuswb %ymm0, %ymm0, %ymm0
698 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
701 ; AVX512BW-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
703 ; AVX512BW-NEXT: vpsrld $25, %zmm0, %zmm0
704 ; AVX512BW-NEXT: vpsrld $25, %zmm1, %zmm1
705 ; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
706 ; AVX512BW-NEXT: vpackuswb %zmm0, %zmm0, %zmm0
707 ; AVX512BW-NEXT: retq
709 ; AVX512DQ-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
711 ; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0
712 ; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1
713 ; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm2
714 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm2, %ymm2
715 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
716 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
717 ; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
718 ; AVX512DQ-NEXT: vpackuswb %ymm0, %ymm0, %ymm0
719 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
720 ; AVX512DQ-NEXT: retq
722 ; AVX512VBMI-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
723 ; AVX512VBMI: # %bb.0:
724 ; AVX512VBMI-NEXT: vpsrld $25, %zmm0, %zmm0
725 ; AVX512VBMI-NEXT: vpsrld $25, %zmm1, %zmm1
726 ; AVX512VBMI-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
727 ; AVX512VBMI-NEXT: vpackuswb %zmm0, %zmm0, %zmm0
728 ; AVX512VBMI-NEXT: retq
729 %1 = lshr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
730 %2 = lshr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
731 %3 = bitcast <16 x i32> %1 to <64 x i8>
732 %4 = bitcast <16 x i32> %2 to <64 x i8>
733 %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 64, i32 68, i32 72, i32 76, i32 0, i32 4, i32 8, i32 12, i32 64, i32 68, i32 72, i32 76, i32 16, i32 20, i32 24, i32 28, i32 80, i32 84, i32 88, i32 92, i32 16, i32 20, i32 24, i32 28, i32 80, i32 84, i32 88, i32 92, i32 32, i32 36, i32 40, i32 44, i32 96, i32 100, i32 104, i32 108, i32 32, i32 36, i32 40, i32 44, i32 96, i32 100, i32 104, i32 108, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124>
737 define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
738 ; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
740 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
741 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
742 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
743 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
744 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
745 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
746 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
747 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
748 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
749 ; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7]
752 ; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
754 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
755 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
756 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,6,7],zmm1[2,3,6,7]
757 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm3
758 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14>
759 ; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm3
760 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[0,1,4,5]
761 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
762 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
763 ; AVX512BW-NEXT: vpshufb %ymm5, %ymm1, %ymm1
764 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
765 ; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm2
766 ; AVX512BW-NEXT: vpshufb %ymm5, %ymm0, %ymm0
767 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
768 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
769 ; AVX512BW-NEXT: retq
771 ; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
773 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
774 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
775 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
776 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
777 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
778 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
779 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
780 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
781 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
782 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7]
783 ; AVX512DQ-NEXT: retq
785 ; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
786 ; AVX512VBMI: # %bb.0:
787 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63,65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127]
788 ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
789 ; AVX512VBMI-NEXT: retq
790 %1 = lshr <32 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
791 %2 = lshr <32 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
792 %3 = bitcast <32 x i16> %1 to <64 x i8>
793 %4 = bitcast <32 x i16> %2 to <64 x i8>
794 %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
798 define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
799 ; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
801 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm2
802 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
803 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
804 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm3
805 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
806 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
807 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
808 ; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
809 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
812 ; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
814 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
815 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
816 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
817 ; AVX512BW-NEXT: retq
819 ; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
821 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm2
822 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
823 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
824 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm3
825 ; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
826 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
827 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
828 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
829 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
830 ; AVX512DQ-NEXT: retq
832 ; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
833 ; AVX512VBMI: # %bb.0:
834 ; AVX512VBMI-NEXT: vpsrlw $8, %zmm0, %zmm0
835 ; AVX512VBMI-NEXT: vpsrlw $8, %zmm1, %zmm1
836 ; AVX512VBMI-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
837 ; AVX512VBMI-NEXT: retq
838 %1 = lshr <32 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
839 %2 = lshr <32 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
840 %3 = bitcast <32 x i16> %1 to <64 x i8>
841 %4 = bitcast <32 x i16> %2 to <64 x i8>
842 %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>