1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
9 ; trunc(concat(x,y)) -> pack
; Arithmetic-shift every i32 lane of %a0 right by 17 and mask every lane of %a1
; with 15, concatenate the two results into <8 x i32> (identity shuffle mask
; 0..7) and truncate to <8 x i16>. After the shift each %a0 lane lies in
; [-16384, 16383] and each %a1 lane in [0, 15], so the truncate loses no bits.
; Every run configuration is expected to emit a single (v)packssdw for the
; concat + truncate.
11 define <8 x i16> @trunc_concat_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
12 ; SSE-LABEL: trunc_concat_packssdw_128:
14 ; SSE-NEXT: psrad $17, %xmm0
15 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
16 ; SSE-NEXT: packssdw %xmm1, %xmm0
19 ; AVX1-LABEL: trunc_concat_packssdw_128:
21 ; AVX1-NEXT: vpsrad $17, %xmm0, %xmm0
22 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
23 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
26 ; AVX2-LABEL: trunc_concat_packssdw_128:
28 ; AVX2-NEXT: vpsrad $17, %xmm0, %xmm0
29 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
30 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
31 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
34 ; AVX512-LABEL: trunc_concat_packssdw_128:
36 ; AVX512-NEXT: vpsrad $17, %xmm0, %xmm0
37 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
38 ; AVX512-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
40 %1 = ashr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
41 %2 = and <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
42 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
43 %4 = trunc <8 x i32> %3 to <8 x i16>
; Logical-shift every i32 lane of %a0 right by 17 and mask every lane of %a1
; with 15, concatenate into <8 x i32> (identity shuffle mask 0..7) and truncate
; to <8 x i16>. Each %a0 lane ends up in [0, 32767] and each %a1 lane in
; [0, 15], so the values fit both the signed and the unsigned i16 range.
; With only +sse2 the checks expect packssdw; every other run configuration
; expects (v)packusdw.
; NOTE(review): presumably packusdw is unavailable with only +sse2 - confirm.
47 define <8 x i16> @trunc_concat_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
48 ; SSE2-LABEL: trunc_concat_packusdw_128:
50 ; SSE2-NEXT: psrld $17, %xmm0
51 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
52 ; SSE2-NEXT: packssdw %xmm1, %xmm0
55 ; SSE4-LABEL: trunc_concat_packusdw_128:
57 ; SSE4-NEXT: psrld $17, %xmm0
58 ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
59 ; SSE4-NEXT: packusdw %xmm1, %xmm0
62 ; AVX1-LABEL: trunc_concat_packusdw_128:
64 ; AVX1-NEXT: vpsrld $17, %xmm0, %xmm0
65 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
66 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
69 ; AVX2-LABEL: trunc_concat_packusdw_128:
71 ; AVX2-NEXT: vpsrld $17, %xmm0, %xmm0
72 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
73 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
74 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
77 ; AVX512-LABEL: trunc_concat_packusdw_128:
79 ; AVX512-NEXT: vpsrld $17, %xmm0, %xmm0
80 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
81 ; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
83 %1 = lshr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
84 %2 = and <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
85 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
86 %4 = trunc <8 x i32> %3 to <8 x i16>
; Arithmetic-shift every i16 lane of %a0 right by 15 (leaving 0 or -1) and mask
; every lane of %a1 with 1 (leaving 0 or 1), concatenate into <16 x i16>
; (identity shuffle mask 0..15) and truncate to <16 x i8>.
; Every run configuration is expected to emit a single (v)packsswb for the
; concat + truncate.
90 define <16 x i8> @trunc_concat_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
91 ; SSE-LABEL: trunc_concat_packsswb_128:
93 ; SSE-NEXT: psraw $15, %xmm0
94 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
95 ; SSE-NEXT: packsswb %xmm1, %xmm0
98 ; AVX1-LABEL: trunc_concat_packsswb_128:
100 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
101 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
102 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
105 ; AVX2-LABEL: trunc_concat_packsswb_128:
107 ; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0
108 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
109 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
112 ; AVX512-LABEL: trunc_concat_packsswb_128:
114 ; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0
115 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
116 ; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
118 %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
119 %2 = and <8 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
120 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
121 %4 = trunc <16 x i16> %3 to <16 x i8>
; Logical-shift every i16 lane of %a0 right by 15 and mask every lane of %a1
; with 1 (both leave 0 or 1 per lane), concatenate into <16 x i16> (identity
; shuffle mask 0..15) and truncate to <16 x i8>.
; Every run configuration is expected to emit a single (v)packuswb for the
; concat + truncate.
125 define <16 x i8> @trunc_concat_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
126 ; SSE-LABEL: trunc_concat_packuswb_128:
128 ; SSE-NEXT: psrlw $15, %xmm0
129 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
130 ; SSE-NEXT: packuswb %xmm1, %xmm0
133 ; AVX1-LABEL: trunc_concat_packuswb_128:
135 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
136 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
137 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
140 ; AVX2-LABEL: trunc_concat_packuswb_128:
142 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
143 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
144 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
147 ; AVX512-LABEL: trunc_concat_packuswb_128:
149 ; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0
150 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
151 ; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
153 %1 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
154 %2 = and <8 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
155 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
156 %4 = trunc <16 x i16> %3 to <16 x i8>
160 ; concat(trunc(x),trunc(y)) -> pack
; Arithmetic-shift every i32 lane of %a0 right by 17 and mask every lane of %a1
; with 15, truncate each result separately to <4 x i16>, then concatenate the
; two halves into <8 x i16> (identity shuffle mask 0..7).
; With only +sse2 the checks expect each half to be packed on its own
; (packssdw on xmm0, packuswb on xmm1) and joined with punpcklqdq; every other
; run configuration expects a single (v)packssdw.
; The packuswb still yields the right i16 lanes because each masked i32 lane is
; at most 15, so its upper word is zero and each packed byte pair reads back as
; that value.
162 define <8 x i16> @concat_trunc_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
163 ; SSE2-LABEL: concat_trunc_packssdw_128:
165 ; SSE2-NEXT: psrad $17, %xmm0
166 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
167 ; SSE2-NEXT: packssdw %xmm0, %xmm0
168 ; SSE2-NEXT: packuswb %xmm1, %xmm1
169 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
172 ; SSE4-LABEL: concat_trunc_packssdw_128:
174 ; SSE4-NEXT: psrad $17, %xmm0
175 ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
176 ; SSE4-NEXT: packssdw %xmm1, %xmm0
179 ; AVX1-LABEL: concat_trunc_packssdw_128:
181 ; AVX1-NEXT: vpsrad $17, %xmm0, %xmm0
182 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
183 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
186 ; AVX2-LABEL: concat_trunc_packssdw_128:
188 ; AVX2-NEXT: vpsrad $17, %xmm0, %xmm0
189 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
190 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
191 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
194 ; AVX512-LABEL: concat_trunc_packssdw_128:
196 ; AVX512-NEXT: vpsrad $17, %xmm0, %xmm0
197 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
198 ; AVX512-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
200 %1 = ashr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
201 %2 = and <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
202 %3 = trunc <4 x i32> %1 to <4 x i16>
203 %4 = trunc <4 x i32> %2 to <4 x i16>
204 %5 = shufflevector <4 x i16> %3, <4 x i16> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Logical-shift every i32 lane of %a0 right by 17 and mask every lane of %a1
; with 15, truncate each result separately to <4 x i16>, then concatenate the
; two halves into <8 x i16> (identity shuffle mask 0..7).
; With only +sse2 the checks expect each half to be packed on its own
; (packssdw on xmm0, packuswb on xmm1) and joined with punpcklqdq; every other
; run configuration expects a single (v)packusdw.
; NOTE(review): presumably packusdw is unavailable with only +sse2 - confirm.
208 define <8 x i16> @concat_trunc_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwind {
209 ; SSE2-LABEL: concat_trunc_packusdw_128:
211 ; SSE2-NEXT: psrld $17, %xmm0
212 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
213 ; SSE2-NEXT: packssdw %xmm0, %xmm0
214 ; SSE2-NEXT: packuswb %xmm1, %xmm1
215 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
218 ; SSE4-LABEL: concat_trunc_packusdw_128:
220 ; SSE4-NEXT: psrld $17, %xmm0
221 ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
222 ; SSE4-NEXT: packusdw %xmm1, %xmm0
225 ; AVX1-LABEL: concat_trunc_packusdw_128:
227 ; AVX1-NEXT: vpsrld $17, %xmm0, %xmm0
228 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
229 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
232 ; AVX2-LABEL: concat_trunc_packusdw_128:
234 ; AVX2-NEXT: vpsrld $17, %xmm0, %xmm0
235 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15]
236 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
237 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
240 ; AVX512-LABEL: concat_trunc_packusdw_128:
242 ; AVX512-NEXT: vpsrld $17, %xmm0, %xmm0
243 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
244 ; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
246 %1 = lshr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
247 %2 = and <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
248 %3 = trunc <4 x i32> %1 to <4 x i16>
249 %4 = trunc <4 x i32> %2 to <4 x i16>
250 %5 = shufflevector <4 x i16> %3, <4 x i16> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Arithmetic-shift every i16 lane of %a0 right by 15 (leaving 0 or -1) and mask
; every lane of %a1 with 1 (leaving 0 or 1), truncate each result separately to
; <8 x i8>, then concatenate the halves into <16 x i8> (identity shuffle mask
; 0..15).
; Every run configuration is expected to emit a single (v)packsswb for the two
; truncates + concat.
254 define <16 x i8> @concat_trunc_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
255 ; SSE-LABEL: concat_trunc_packsswb_128:
257 ; SSE-NEXT: psraw $15, %xmm0
258 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
259 ; SSE-NEXT: packsswb %xmm1, %xmm0
262 ; AVX1-LABEL: concat_trunc_packsswb_128:
264 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
265 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
266 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
269 ; AVX2-LABEL: concat_trunc_packsswb_128:
271 ; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0
272 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
273 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
276 ; AVX512-LABEL: concat_trunc_packsswb_128:
278 ; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0
279 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
280 ; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
282 %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
283 %2 = and <8 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
284 %3 = trunc <8 x i16> %1 to <8 x i8>
285 %4 = trunc <8 x i16> %2 to <8 x i8>
286 %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Logical-shift every i16 lane of %a0 right by 15 and mask every lane of %a1
; with 1 (both leave 0 or 1 per lane), truncate each result separately to
; <8 x i8>, then concatenate the halves into <16 x i8> (identity shuffle mask
; 0..15).
; Every run configuration is expected to emit a single (v)packuswb for the two
; truncates + concat.
290 define <16 x i8> @concat_trunc_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) nounwind {
291 ; SSE-LABEL: concat_trunc_packuswb_128:
293 ; SSE-NEXT: psrlw $15, %xmm0
294 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
295 ; SSE-NEXT: packuswb %xmm1, %xmm0
298 ; AVX1-LABEL: concat_trunc_packuswb_128:
300 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
301 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
302 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
305 ; AVX2-LABEL: concat_trunc_packuswb_128:
307 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
308 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
309 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
312 ; AVX512-LABEL: concat_trunc_packuswb_128:
314 ; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0
315 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
316 ; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
318 %1 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
319 %2 = and <8 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
320 %3 = trunc <8 x i16> %1 to <8 x i8>
321 %4 = trunc <8 x i16> %2 to <8 x i8>
322 %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
326 ; Fuzz test - don't pack a v1i32 comparison result.
; %Cmp53 is an unsigned >= comparison of a <1 x i32> against zero, which holds
; for every input; inside the CF block it is zero-extended to <1 x i16> and
; stored through a null pointer.
; The shared checks expect no pack instruction at all, only a self-looping
; block whose body is `movw $1, 0` (the constant 1 written to address 0).
327 define void @autogen_SD10339(<1 x i32> %I49) {
328 ; CHECK-LABEL: autogen_SD10339:
329 ; CHECK: # %bb.0: # %BB
330 ; CHECK-NEXT: .p2align 4, 0x90
331 ; CHECK-NEXT: .LBB8_1: # %CF
332 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
333 ; CHECK-NEXT: movw $1, 0
334 ; CHECK-NEXT: jmp .LBB8_1
336 %Cmp53 = icmp uge <1 x i32> %I49, zeroinitializer
339 CF: ; preds = %CF, %BB
340 %ZE166 = zext <1 x i1> %Cmp53 to <1 x i16>
341 store <1 x i16> %ZE166, ptr null, align 2
345 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: