1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
9 ; trunc(concat(x,y)) -> pack
; Concatenate-then-truncate, signed dword case: %a0 is arithmetically shifted
; right by 17 and %a1 is masked with 15, the two <4 x i32> halves are joined
; and the <8 x i32> result is truncated to <8 x i16>.
; After the shift/mask every lane already fits in a signed 16-bit value, and
; every run configuration below is expected to emit a single (v)packssdw.
; The assertions are autogenerated; regenerate them instead of hand-editing.
11 define <8 x i16> @trunc_concat_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
12 ; SSE-LABEL: trunc_concat_packssdw_128:
14 ; SSE-NEXT: psrad $17, %xmm0
15 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
16 ; SSE-NEXT: packssdw %xmm1, %xmm0
19 ; AVX1-LABEL: trunc_concat_packssdw_128:
21 ; AVX1-NEXT: vpsrad $17, %xmm0, %xmm0
22 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
23 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
26 ; AVX2-LABEL: trunc_concat_packssdw_128:
28 ; AVX2-NEXT: vpsrad $17, %xmm0, %xmm0
29 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
30 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
31 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
34 ; AVX512-LABEL: trunc_concat_packssdw_128:
36 ; AVX512-NEXT: vpsrad $17, %xmm0, %xmm0
37 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
38 ; AVX512-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; The shuffle mask 0..7 is the identity, i.e. a plain concatenation of %1 and %2.
40 %1 = ashr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
41 %2 = and <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
42 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
43 %4 = trunc <8 x i32> %3 to <8 x i16>
; Concatenate-then-truncate, unsigned dword case: %a0 is logically shifted
; right by 17 and %a1 is masked with 15, so every lane is non-negative and
; below 2^15 before the <8 x i32> to <8 x i16> truncate.
; The SSE2 run expects packssdw, while the SSE4 and AVX runs expect
; (v)packusdw. Presumably packssdw is usable on SSE2 because the lanes also
; fit in a signed 16-bit value - confirm against the X86 lowering code.
; The assertions are autogenerated; regenerate them instead of hand-editing.
47 define <8 x i16> @trunc_concat_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
48 ; SSE2-LABEL: trunc_concat_packusdw_128:
50 ; SSE2-NEXT: psrld $17, %xmm0
51 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
52 ; SSE2-NEXT: packssdw %xmm1, %xmm0
55 ; SSE4-LABEL: trunc_concat_packusdw_128:
57 ; SSE4-NEXT: psrld $17, %xmm0
58 ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
59 ; SSE4-NEXT: packusdw %xmm1, %xmm0
62 ; AVX1-LABEL: trunc_concat_packusdw_128:
64 ; AVX1-NEXT: vpsrld $17, %xmm0, %xmm0
65 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
66 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
69 ; AVX2-LABEL: trunc_concat_packusdw_128:
71 ; AVX2-NEXT: vpsrld $17, %xmm0, %xmm0
72 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
73 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
74 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
77 ; AVX512-LABEL: trunc_concat_packusdw_128:
79 ; AVX512-NEXT: vpsrld $17, %xmm0, %xmm0
80 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
81 ; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; The shuffle mask 0..7 is the identity, i.e. a plain concatenation of %1 and %2.
83 %1 = lshr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
84 %2 = and <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
85 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
86 %4 = trunc <8 x i32> %3 to <8 x i16>
; Concatenate-then-truncate, signed word case: %a0 is arithmetically shifted
; right by 15 (each i16 lane becomes 0 or -1) and %a1 is masked with 1 (each
; lane becomes 0 or 1); the halves are joined and truncated to <16 x i8>.
; Both the SSE and the AVX check groups expect a single (v)packsswb.
; The assertions are autogenerated; regenerate them instead of hand-editing.
90 define <16 x i8> @trunc_concat_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
91 ; SSE-LABEL: trunc_concat_packsswb_128:
93 ; SSE-NEXT: psraw $15, %xmm0
94 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
95 ; SSE-NEXT: packsswb %xmm1, %xmm0
98 ; AVX-LABEL: trunc_concat_packsswb_128:
100 ; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
101 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
102 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; The shuffle mask 0..15 is the identity, i.e. a plain concatenation of %1 and %2.
104 %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
105 %2 = and <8 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
106 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
107 %4 = trunc <16 x i16> %3 to <16 x i8>
; Concatenate-then-truncate, unsigned word case: %a0 is logically shifted
; right by 15 and %a1 is masked with 1, so every i16 lane is 0 or 1 before the
; halves are joined and truncated to <16 x i8>.
; Both the SSE and the AVX check groups expect a single (v)packuswb.
; The assertions are autogenerated; regenerate them instead of hand-editing.
111 define <16 x i8> @trunc_concat_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
112 ; SSE-LABEL: trunc_concat_packuswb_128:
114 ; SSE-NEXT: psrlw $15, %xmm0
115 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
116 ; SSE-NEXT: packuswb %xmm1, %xmm0
119 ; AVX-LABEL: trunc_concat_packuswb_128:
121 ; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
122 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
123 ; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; The shuffle mask 0..15 is the identity, i.e. a plain concatenation of %1 and %2.
125 %1 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
126 %2 = and <8 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
127 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
128 %4 = trunc <16 x i16> %3 to <16 x i8>
132 ; concat(trunc(x),trunc(y)) -> pack
; Truncate-then-concatenate, signed dword case: %a0 is arithmetically shifted
; right by 17 and %a1 is masked with 15, each <4 x i32> half is truncated to
; <4 x i16> on its own, and the two halves are then joined into <8 x i16>.
; The SSE4, AVX1 and AVX512 runs expect a single (v)packssdw of both operands.
; The SSE2 and AVX2 runs instead expect one pack per operand followed by
; (v)punpcklqdq to merge the low halves.
; NOTE(review): the split sequence on SSE2 and AVX2 looks like a missed fold
; relative to the other configurations - confirm whether that is intended.
; The assertions are autogenerated; regenerate them instead of hand-editing.
134 define <8 x i16> @concat_trunc_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
135 ; SSE2-LABEL: concat_trunc_packssdw_128:
137 ; SSE2-NEXT: psrad $17, %xmm0
138 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
139 ; SSE2-NEXT: packssdw %xmm0, %xmm0
140 ; SSE2-NEXT: packuswb %xmm1, %xmm1
141 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
144 ; SSE4-LABEL: concat_trunc_packssdw_128:
146 ; SSE4-NEXT: psrad $17, %xmm0
147 ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
148 ; SSE4-NEXT: packssdw %xmm1, %xmm0
151 ; AVX1-LABEL: concat_trunc_packssdw_128:
153 ; AVX1-NEXT: vpsrad $17, %xmm0, %xmm0
154 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
155 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
158 ; AVX2-LABEL: concat_trunc_packssdw_128:
160 ; AVX2-NEXT: vpsrad $17, %xmm0, %xmm0
161 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
162 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
163 ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
164 ; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1
165 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
168 ; AVX512-LABEL: concat_trunc_packssdw_128:
170 ; AVX512-NEXT: vpsrad $17, %xmm0, %xmm0
171 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
172 ; AVX512-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; Each half is narrowed first; the identity mask 0..7 then concatenates %3 and %4.
174 %1 = ashr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
175 %2 = and <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
176 %3 = trunc <4 x i32> %1 to <4 x i16>
177 %4 = trunc <4 x i32> %2 to <4 x i16>
178 %5 = shufflevector <4 x i16> %3, <4 x i16> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Truncate-then-concatenate, unsigned dword case: %a0 is logically shifted
; right by 17 and %a1 is masked with 15, each <4 x i32> half is truncated to
; <4 x i16> on its own, and the two halves are then joined into <8 x i16>.
; The SSE4, AVX1, AVX2 and AVX512 runs expect a single (v)packusdw.
; The SSE2 run instead expects packssdw on %a0, packuswb on %a1 and a
; punpcklqdq to merge the low halves.
; NOTE(review): the split sequence on SSE2 looks like a missed fold relative
; to the other configurations - confirm whether that is intended.
; The assertions are autogenerated; regenerate them instead of hand-editing.
182 define <8 x i16> @concat_trunc_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
183 ; SSE2-LABEL: concat_trunc_packusdw_128:
185 ; SSE2-NEXT: psrld $17, %xmm0
186 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
187 ; SSE2-NEXT: packssdw %xmm0, %xmm0
188 ; SSE2-NEXT: packuswb %xmm1, %xmm1
189 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
192 ; SSE4-LABEL: concat_trunc_packusdw_128:
194 ; SSE4-NEXT: psrld $17, %xmm0
195 ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
196 ; SSE4-NEXT: packusdw %xmm1, %xmm0
199 ; AVX1-LABEL: concat_trunc_packusdw_128:
201 ; AVX1-NEXT: vpsrld $17, %xmm0, %xmm0
202 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
203 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
206 ; AVX2-LABEL: concat_trunc_packusdw_128:
208 ; AVX2-NEXT: vpsrld $17, %xmm0, %xmm0
209 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
210 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
211 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
214 ; AVX512-LABEL: concat_trunc_packusdw_128:
216 ; AVX512-NEXT: vpsrld $17, %xmm0, %xmm0
217 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
218 ; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; Each half is narrowed first; the identity mask 0..7 then concatenates %3 and %4.
220 %1 = lshr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
221 %2 = and <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
222 %3 = trunc <4 x i32> %1 to <4 x i16>
223 %4 = trunc <4 x i32> %2 to <4 x i16>
224 %5 = shufflevector <4 x i16> %3, <4 x i16> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Truncate-then-concatenate, signed word case: %a0 is arithmetically shifted
; right by 15 (each i16 lane becomes 0 or -1) and %a1 is masked with 1; each
; <8 x i16> half is truncated to <8 x i8> and the halves are joined into
; <16 x i8>.
; Both the SSE and the AVX check groups expect a single (v)packsswb.
; The assertions are autogenerated; regenerate them instead of hand-editing.
228 define <16 x i8> @concat_trunc_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
229 ; SSE-LABEL: concat_trunc_packsswb_128:
231 ; SSE-NEXT: psraw $15, %xmm0
232 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
233 ; SSE-NEXT: packsswb %xmm1, %xmm0
236 ; AVX-LABEL: concat_trunc_packsswb_128:
238 ; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
239 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
240 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; Each half is narrowed first; the identity mask 0..15 then concatenates %3 and %4.
242 %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
243 %2 = and <8 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
244 %3 = trunc <8 x i16> %1 to <8 x i8>
245 %4 = trunc <8 x i16> %2 to <8 x i8>
246 %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Truncate-then-concatenate, unsigned word case: %a0 is logically shifted
; right by 15 and %a1 is masked with 1, so every i16 lane is 0 or 1; each
; <8 x i16> half is truncated to <8 x i8> and the halves are joined into
; <16 x i8>.
; Both the SSE and the AVX check groups expect a single (v)packuswb.
; The assertions are autogenerated; regenerate them instead of hand-editing.
250 define <16 x i8> @concat_trunc_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
251 ; SSE-LABEL: concat_trunc_packuswb_128:
253 ; SSE-NEXT: psrlw $15, %xmm0
254 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
255 ; SSE-NEXT: packuswb %xmm1, %xmm0
258 ; AVX-LABEL: concat_trunc_packuswb_128:
260 ; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
261 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
262 ; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; Each half is narrowed first; the identity mask 0..15 then concatenates %3 and %4.
264 %1 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
265 %2 = and <8 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
266 %3 = trunc <8 x i16> %1 to <8 x i8>
267 %4 = trunc <8 x i16> %2 to <8 x i8>
268 %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>