1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
5 ; trunc(concat(x,y)) -> pack
; Interleaves the 128-bit (4 x i32) chunks of two arithmetically shifted
; vectors and then truncates the combined <32 x i32> value to <32 x i16>.
; After ashr by 17 (and by 23) every element already fits in a signed i16,
; so the truncation discards only copies of the sign bit.
; The recorded checks show a vpermi2q + vpmovdw sequence, not a pack instruction.
7 define <32 x i16> @trunc_concat_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
8 ; AVX512-LABEL: trunc_concat_packssdw_512:
10 ; AVX512-NEXT: vpsrad $17, %zmm0, %zmm0
11 ; AVX512-NEXT: vpsrad $23, %zmm1, %zmm1
12 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
13 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
14 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
15 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
16 ; AVX512-NEXT: vpmovdw %zmm3, %ymm0
17 ; AVX512-NEXT: vpmovdw %zmm2, %ymm1
18 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
20 %1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
21 %2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
22 %3 = shufflevector <16 x i32> %1, <16 x i32> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
23 %4 = trunc <32 x i32> %3 to <32 x i16>
; Unsigned counterpart of the signed dword test: %a0 is logically shifted
; right by 17 and %a1 is masked with 15, so every element is a small
; non-negative value that fits in 16 bits. The 128-bit (4 x i32) chunks of
; the two vectors are interleaved and the result is truncated to <32 x i16>.
; The recorded checks show a vpermi2q + vpmovdw sequence, not a pack instruction.
27 define <32 x i16> @trunc_concat_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
28 ; AVX512-LABEL: trunc_concat_packusdw_512:
30 ; AVX512-NEXT: vpsrld $17, %zmm0, %zmm0
31 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
32 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
33 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
34 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
35 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
36 ; AVX512-NEXT: vpmovdw %zmm3, %ymm0
37 ; AVX512-NEXT: vpmovdw %zmm2, %ymm1
38 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
40 %1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
41 %2 = and <16 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
42 %3 = shufflevector <16 x i32> %1, <16 x i32> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
43 %4 = trunc <32 x i32> %3 to <32 x i16>
; Word-to-byte variant: %a0 is arithmetically shifted right by 15 (each
; element becomes 0 or -1) and %a1 is masked with 1 (each element becomes
; 0 or 1). The 128-bit (8 x i16) chunks of the two vectors are interleaved
; and the combined <64 x i16> value is truncated to <64 x i8>.
; Two check sets are recorded. In the AVX512F run the word shift is done on
; 256-bit halves and the truncation goes through vpmovzxwd + vpmovdb; in the
; AVX512BW run the shift is a single 512-bit op and the truncation is vpmovwb.
47 define <64 x i8> @trunc_concat_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
48 ; AVX512F-LABEL: trunc_concat_packsswb_512:
50 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2
51 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
52 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0
53 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
54 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
55 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11]
56 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
57 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
58 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
59 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
60 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
61 ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm1
62 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
63 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
64 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
65 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
66 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
67 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
68 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
69 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
70 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
71 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
74 ; AVX512BW-LABEL: trunc_concat_packsswb_512:
76 ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm0
77 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
78 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
79 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
80 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
81 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
82 ; AVX512BW-NEXT: vpmovwb %zmm3, %ymm0
83 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm1
84 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
86 %1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
87 %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
88 %3 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
89 %4 = trunc <64 x i16> %3 to <64 x i8>
; Unsigned word-to-byte variant: %a0 is logically shifted right by 15 and
; %a1 is masked with 1, so every element of both inputs is 0 or 1. The
; 128-bit (8 x i16) chunks of the two vectors are interleaved and the
; combined <64 x i16> value is truncated to <64 x i8>.
; Two check sets are recorded. In the AVX512F run the word shift is done on
; 256-bit halves and the truncation goes through vpmovzxwd + vpmovdb; in the
; AVX512BW run the shift is a single 512-bit op and the truncation is vpmovwb.
93 define <64 x i8> @trunc_concat_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
94 ; AVX512F-LABEL: trunc_concat_packuswb_512:
96 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm2
97 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
98 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
99 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
100 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
101 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11]
102 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
103 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
104 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
105 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
106 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
107 ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm1
108 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
109 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
110 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
111 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
112 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
113 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
114 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
115 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
116 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
117 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
120 ; AVX512BW-LABEL: trunc_concat_packuswb_512:
122 ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm0
123 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
124 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
125 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
126 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
127 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
128 ; AVX512BW-NEXT: vpmovwb %zmm3, %ymm0
129 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm1
130 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
131 ; AVX512BW-NEXT: retq
132 %1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
133 %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
134 %3 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
135 %4 = trunc <64 x i16> %3 to <64 x i8>
139 ; concat(trunc(x),trunc(y)) -> pack
; Same inputs as the signed dword test in the trunc(concat) group, but with
; the operations in the opposite order: each arithmetically shifted vector
; is truncated to <16 x i16> first, and the 64-bit (4 x i16) chunks of the
; two truncated vectors are then interleaved into a <32 x i16> value.
; The recorded checks show two vpmovdw truncations followed by a single
; vpermi2q that performs the interleave.
141 define <32 x i16> @concat_trunc_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
142 ; AVX512-LABEL: concat_trunc_packssdw_512:
144 ; AVX512-NEXT: vpsrad $17, %zmm0, %zmm0
145 ; AVX512-NEXT: vpsrad $23, %zmm1, %zmm1
146 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
147 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
148 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
149 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
150 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
151 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
153 %1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
154 %2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
155 %3 = trunc <16 x i32> %1 to <16 x i16>
156 %4 = trunc <16 x i32> %2 to <16 x i16>
157 %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
; Unsigned dword variant of the truncate-then-interleave pattern: both
; inputs are logically shifted right (by 17 and by 23), each is truncated to
; <16 x i16>, and the 64-bit (4 x i16) chunks of the two truncated vectors
; are interleaved into a <32 x i16> value.
; The recorded checks show two vpmovdw truncations followed by a single
; vpermi2q that performs the interleave.
161 define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
162 ; AVX512-LABEL: concat_trunc_packusdw_512:
164 ; AVX512-NEXT: vpsrld $17, %zmm0, %zmm0
165 ; AVX512-NEXT: vpsrld $23, %zmm1, %zmm1
166 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
167 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
168 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
169 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
170 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
171 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
173 %1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
174 %2 = lshr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
175 %3 = trunc <16 x i32> %1 to <16 x i16>
176 %4 = trunc <16 x i32> %2 to <16 x i16>
177 %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
; Word-to-byte truncate-then-interleave pattern: %a0 is arithmetically
; shifted right by 15 (0 or -1 per element), %a1 is masked with 1, each is
; truncated to <32 x i8>, and the 64-bit (8 x i8) chunks of the two
; truncated vectors are interleaved into a <64 x i8> value.
; In the AVX512F run the shifted operand is narrowed with a 256-bit
; vpacksswb while the masked operand goes through vpmovzxwd + vpmovdb, and
; the final vpermi2q uses index vector [0,8,2,9,5,14,7,15]. In the AVX512BW
; run both operands are narrowed with vpmovwb and the index vector is
; [0,8,1,9,6,14,7,15].
181 define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
182 ; AVX512F-LABEL: concat_trunc_packsswb_512:
184 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
185 ; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
186 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0
187 ; AVX512F-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
188 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
189 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
190 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
191 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
192 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
193 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
194 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
195 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
196 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
197 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,2,9,5,14,7,15]
198 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
201 ; AVX512BW-LABEL: concat_trunc_packsswb_512:
203 ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm0
204 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
205 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
206 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
207 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
208 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
209 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
210 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
211 ; AVX512BW-NEXT: retq
212 %1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
213 %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
214 %3 = trunc <32 x i16> %1 to <32 x i8>
215 %4 = trunc <32 x i16> %2 to <32 x i8>
216 %5 = shufflevector <32 x i8> %3, <32 x i8> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; Unsigned word-to-byte truncate-then-interleave pattern: %a0 is logically
; shifted right by 15 and %a1 is masked with 1 (both 0 or 1 per element),
; each is truncated to <32 x i8>, and the 64-bit (8 x i8) chunks of the two
; truncated vectors are interleaved into a <64 x i8> value.
; In the AVX512F run the shifted operand is narrowed with a 256-bit
; vpackuswb while the masked operand goes through vpmovzxwd + vpmovdb, and
; the final vpermi2q uses index vector [0,8,2,9,5,14,7,15]. In the AVX512BW
; run both operands are narrowed with vpmovwb and the index vector is
; [0,8,1,9,6,14,7,15].
220 define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
221 ; AVX512F-LABEL: concat_trunc_packuswb_512:
223 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
224 ; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm2
225 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
226 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
227 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
228 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
229 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
230 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
231 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
232 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
233 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
234 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
235 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
236 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,2,9,5,14,7,15]
237 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
240 ; AVX512BW-LABEL: concat_trunc_packuswb_512:
242 ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm0
243 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
244 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
245 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
246 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
247 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
248 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
249 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
250 ; AVX512BW-NEXT: retq
251 %1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
252 %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
253 %3 = trunc <32 x i16> %1 to <32 x i8>
254 %4 = trunc <32 x i16> %2 to <32 x i8>
255 %5 = shufflevector <32 x i8> %3, <32 x i8> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; Calls the 256-bit llvm.x86.avx2.packssdw intrinsic twice (on %a0/%a1 and
; on %a2/%a3) and concatenates the two <16 x i16> results with an identity
; shuffle mask (indices 0..31) into a <32 x i16> value.
; In the AVX512F run the checks keep two 256-bit vpackssdw ops and join the
; results with vinserti64x4. In the AVX512BW run the inputs are first
; widened to 512 bits with vinserti64x4 and a single 512-bit vpackssdw is
; used.
259 define <32 x i16> @concat_packsswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
260 ; AVX512F-LABEL: concat_packsswd_int_2x256:
262 ; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
263 ; AVX512F-NEXT: vpackssdw %ymm3, %ymm2, %ymm1
264 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
267 ; AVX512BW-LABEL: concat_packsswd_int_2x256:
269 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
270 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
271 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
272 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
273 ; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
274 ; AVX512BW-NEXT: retq
275 %lo = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
276 %hi = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
277 %res = shufflevector <16 x i16> %lo, <16 x i16> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; Declaration of the intrinsic called by @concat_packsswd_int_2x256: it takes
; two <8 x i32> operands and returns a <16 x i16> value.
280 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
; Calls the 256-bit llvm.x86.avx2.packusdw intrinsic twice (on %a0/%a1 and
; on %a2/%a3) and concatenates the two <16 x i16> results with an identity
; shuffle mask (indices 0..31) into a <32 x i16> value.
; In the AVX512F run the checks keep two 256-bit vpackusdw ops and join the
; results with vinserti64x4. In the AVX512BW run the inputs are first
; widened to 512 bits with vinserti64x4 and a single 512-bit vpackusdw is
; used.
282 define <32 x i16> @concat_packuswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
283 ; AVX512F-LABEL: concat_packuswd_int_2x256:
285 ; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
286 ; AVX512F-NEXT: vpackusdw %ymm3, %ymm2, %ymm1
287 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
290 ; AVX512BW-LABEL: concat_packuswd_int_2x256:
292 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
293 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
294 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
295 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
296 ; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
297 ; AVX512BW-NEXT: retq
298 %lo = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
299 %hi = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a2, <8 x i32> %a3)
300 %res = shufflevector <16 x i16> %lo, <16 x i16> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; Declaration of the intrinsic called by @concat_packuswd_int_2x256: it takes
; two <8 x i32> operands and returns a <16 x i16> value.
303 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)