; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW

; trunc(concat(x,y)) -> pack
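; The shift/mask amounts below narrow each input into signed/unsigned i16
; range, and the shuffle masks interleave the operands per 128-bit lane to
; match the semantics of VPACKSSDW/VPACKUSDW. Per the current autogenerated
; assertions, the 512-bit cases are still lowered with vpermi2q plus
; truncation rather than pack instructions.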

define <32 x i16> @trunc_concat_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512-LABEL: trunc_concat_packssdw_512:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %zmm0, %zmm0
; AVX512-NEXT:    vpsrad $23, %zmm1, %zmm1
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-NEXT:    vpmovdw %zmm3, %ymm0
; AVX512-NEXT:    vpmovdw %zmm2, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = shufflevector <16 x i32> %1, <16 x i32> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  %4 = trunc <32 x i32> %3 to <32 x i16>
  ret <32 x i16> %4
}

define <32 x i16> @trunc_concat_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512-LABEL: trunc_concat_packusdw_512:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %zmm0, %zmm0
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-NEXT:    vpmovdw %zmm3, %ymm0
; AVX512-NEXT:    vpmovdw %zmm2, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = and <16 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %3 = shufflevector <16 x i32> %1, <16 x i32> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  %4 = trunc <32 x i32> %3 to <32 x i16>
  ret <32 x i16> %4
}
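
; For the i16 -> i8 tests the subtargets diverge: 512-bit vpsraw/vpsrlw and
; the vpmovwb truncation require AVX512BW, so the AVX512F-only lowering
; splits into 256-bit halves and truncates via vpmovzxwd + vpmovdb instead.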

define <64 x i8> @trunc_concat_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512F-LABEL: trunc_concat_packsswb_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_concat_packsswb_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT:    vpmovwb %zmm3, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %4 = trunc <64 x i16> %3 to <64 x i8>
  ret <64 x i8> %4
}

define <64 x i8> @trunc_concat_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512F-LABEL: trunc_concat_packuswb_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_concat_packuswb_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $15, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT:    vpmovwb %zmm3, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %4 = trunc <64 x i16> %3 to <64 x i8>
  ret <64 x i8> %4
}

; concat(trunc(x),trunc(y)) -> pack
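; Same patterns as above, except the operands are truncated to the narrow
; element type first and the interleaving shuffle operates on the truncated
; values; the desired lowering is again a PACKSS/PACKUS sequence.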

define <32 x i16> @concat_trunc_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512-LABEL: concat_trunc_packssdw_512:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %zmm0, %zmm0
; AVX512-NEXT:    vpsrad $23, %zmm1, %zmm1
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512-NEXT:    retq
  %1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = trunc <16 x i32> %1 to <16 x i16>
  %4 = trunc <16 x i32> %2 to <16 x i16>
  %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i16> %5
}

define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512-LABEL: concat_trunc_packusdw_512:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %zmm0, %zmm0
; AVX512-NEXT:    vpsrld $23, %zmm1, %zmm1
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512-NEXT:    retq
  %1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = lshr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = trunc <16 x i32> %1 to <16 x i16>
  %4 = trunc <16 x i32> %2 to <16 x i16>
  %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i16> %5
}
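
; As in the trunc(concat()) byte tests, AVX512BW truncates with vpmovwb
; directly, while the AVX512F-only path goes through vpmovzxwd + vpmovdb on
; each 256-bit half before the final vpermi2q interleave.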

define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512F-LABEL: concat_trunc_packsswb_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_trunc_packsswb_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
  %1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = trunc <32 x i16> %1 to <32 x i8>
  %4 = trunc <32 x i16> %2 to <32 x i8>
  %5 = shufflevector <32 x i8> %3, <32 x i8> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  ret <64 x i8> %5
}

define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512F-LABEL: concat_trunc_packuswb_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_trunc_packuswb_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $15, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
  %1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = trunc <32 x i16> %1 to <32 x i8>
  %4 = trunc <32 x i16> %2 to <32 x i8>
  %5 = shufflevector <32 x i8> %3, <32 x i8> %4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  ret <64 x i8> %5
}