1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VL
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BWVL
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BWVL
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VBMIVL
14 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512VBMIVL
17 ; Pairs of shufflevector:trunc functions with functional equivalence.
18 ; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
; Shuffle form of the v16i16 -> v16i8 narrowing.
; Loads a <32 x i8> vector from %L, keeps the even-indexed bytes
; (0, 2, ..., 30) and stores the resulting <16 x i8> to %S.
; Paired with @trunc_v16i16_to_v16i8, which expresses the same narrowing as
; a bitcast followed by a trunc.
20 define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind {
21 ; AVX1-LABEL: shuffle_v32i8_to_v16i8:
23 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
24 ; AVX1-NEXT: vpand 16(%rdi), %xmm0, %xmm1
25 ; AVX1-NEXT: vpand (%rdi), %xmm0, %xmm0
26 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
27 ; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
30 ; AVX2-LABEL: shuffle_v32i8_to_v16i8:
32 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
33 ; AVX2-NEXT: vpand 16(%rdi), %xmm0, %xmm1
34 ; AVX2-NEXT: vpand (%rdi), %xmm0, %xmm0
35 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
36 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
39 ; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
41 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
42 ; AVX512F-NEXT: vpand 16(%rdi), %xmm0, %xmm1
43 ; AVX512F-NEXT: vpand (%rdi), %xmm0, %xmm0
44 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
45 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
48 ; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
50 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
51 ; AVX512VL-NEXT: vpand 16(%rdi), %xmm0, %xmm1
52 ; AVX512VL-NEXT: vpand (%rdi), %xmm0, %xmm0
53 ; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
54 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
57 ; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
59 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
60 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
61 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
62 ; AVX512BW-NEXT: vzeroupper
65 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
66 ; AVX512BWVL: # %bb.0:
67 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
68 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
69 ; AVX512BWVL-NEXT: vzeroupper
70 ; AVX512BWVL-NEXT: retq
72 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
73 ; AVX512VBMIVL: # %bb.0:
74 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
75 ; AVX512VBMIVL-NEXT: vpmovwb %ymm0, (%rsi)
76 ; AVX512VBMIVL-NEXT: vzeroupper
77 ; AVX512VBMIVL-NEXT: retq
78 %vec = load <32 x i8>, ptr %L
79 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
80 store <16 x i8> %strided.vec, ptr %S
; Truncate form of the v16i16 -> v16i8 narrowing.
; Loads a <32 x i8> vector from %L, reinterprets it as <16 x i16>, truncates
; every element to i8 and stores the <16 x i8> result to %S.
; Counterpart of @shuffle_v32i8_to_v16i8.
84 define void @trunc_v16i16_to_v16i8(ptr %L, ptr %S) nounwind {
85 ; AVX1-LABEL: trunc_v16i16_to_v16i8:
87 ; AVX1-NEXT: vmovaps (%rdi), %ymm0
88 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
89 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
90 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
91 ; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
92 ; AVX1-NEXT: vzeroupper
95 ; AVX2-LABEL: trunc_v16i16_to_v16i8:
97 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
98 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
99 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
100 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
101 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
102 ; AVX2-NEXT: vzeroupper
105 ; AVX512F-LABEL: trunc_v16i16_to_v16i8:
107 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
108 ; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
109 ; AVX512F-NEXT: vzeroupper
112 ; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
114 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
115 ; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
116 ; AVX512VL-NEXT: vzeroupper
117 ; AVX512VL-NEXT: retq
119 ; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
121 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
122 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
123 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
124 ; AVX512BW-NEXT: vzeroupper
125 ; AVX512BW-NEXT: retq
127 ; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
128 ; AVX512BWVL: # %bb.0:
129 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
130 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
131 ; AVX512BWVL-NEXT: vzeroupper
132 ; AVX512BWVL-NEXT: retq
134 ; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8:
135 ; AVX512VBMIVL: # %bb.0:
136 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
137 ; AVX512VBMIVL-NEXT: vpmovwb %ymm0, (%rsi)
138 ; AVX512VBMIVL-NEXT: vzeroupper
139 ; AVX512VBMIVL-NEXT: retq
140 %vec = load <32 x i8>, ptr %L
141 %bc = bitcast <32 x i8> %vec to <16 x i16>
142 %strided.vec = trunc <16 x i16> %bc to <16 x i8>
143 store <16 x i8> %strided.vec, ptr %S
; Shuffle form of the v8i32 -> v8i16 narrowing.
; Loads a <16 x i16> vector from %L, keeps the even-indexed elements
; (0, 2, ..., 14) and stores the resulting <8 x i16> to %S.
; Paired with @trunc_v8i32_to_v8i16.
147 define void @shuffle_v16i16_to_v8i16(ptr %L, ptr %S) nounwind {
148 ; AVX-LABEL: shuffle_v16i16_to_v8i16:
150 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
151 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
152 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
153 ; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
154 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
157 ; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
159 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
160 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
161 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
162 ; AVX512F-NEXT: vzeroupper
165 ; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
167 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
168 ; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
169 ; AVX512VL-NEXT: vzeroupper
170 ; AVX512VL-NEXT: retq
172 ; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
174 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
175 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
176 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
177 ; AVX512BW-NEXT: vzeroupper
178 ; AVX512BW-NEXT: retq
180 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
181 ; AVX512BWVL: # %bb.0:
182 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
183 ; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
184 ; AVX512BWVL-NEXT: vzeroupper
185 ; AVX512BWVL-NEXT: retq
187 ; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
188 ; AVX512VBMIVL: # %bb.0:
189 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
190 ; AVX512VBMIVL-NEXT: vpmovdw %ymm0, (%rsi)
191 ; AVX512VBMIVL-NEXT: vzeroupper
192 ; AVX512VBMIVL-NEXT: retq
193 %vec = load <16 x i16>, ptr %L
194 %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
195 store <8 x i16> %strided.vec, ptr %S
; Truncate form of the v8i32 -> v8i16 narrowing.
; Loads a <16 x i16> vector from %L, reinterprets it as <8 x i32>, truncates
; every element to i16 and stores the <8 x i16> result to %S.
; Counterpart of @shuffle_v16i16_to_v8i16.
199 define void @trunc_v8i32_to_v8i16(ptr %L, ptr %S) nounwind {
200 ; AVX1-LABEL: trunc_v8i32_to_v8i16:
202 ; AVX1-NEXT: vmovaps (%rdi), %ymm0
203 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
204 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
205 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
206 ; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
207 ; AVX1-NEXT: vzeroupper
210 ; AVX2-LABEL: trunc_v8i32_to_v8i16:
212 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
213 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
214 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
215 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
216 ; AVX2-NEXT: vzeroupper
219 ; AVX512F-LABEL: trunc_v8i32_to_v8i16:
221 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
222 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
223 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
224 ; AVX512F-NEXT: vzeroupper
227 ; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
229 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
230 ; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
231 ; AVX512VL-NEXT: vzeroupper
232 ; AVX512VL-NEXT: retq
234 ; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
236 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
237 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
238 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
239 ; AVX512BW-NEXT: vzeroupper
240 ; AVX512BW-NEXT: retq
242 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
243 ; AVX512BWVL: # %bb.0:
244 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
245 ; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
246 ; AVX512BWVL-NEXT: vzeroupper
247 ; AVX512BWVL-NEXT: retq
249 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16:
250 ; AVX512VBMIVL: # %bb.0:
251 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
252 ; AVX512VBMIVL-NEXT: vpmovdw %ymm0, (%rsi)
253 ; AVX512VBMIVL-NEXT: vzeroupper
254 ; AVX512VBMIVL-NEXT: retq
255 %vec = load <16 x i16>, ptr %L
256 %bc = bitcast <16 x i16> %vec to <8 x i32>
257 %strided.vec = trunc <8 x i32> %bc to <8 x i16>
258 store <8 x i16> %strided.vec, ptr %S
; Shuffle form of the v4i64 -> v4i32 narrowing.
; Loads an <8 x i32> vector from %L, keeps elements 0, 2, 4 and 6 and stores
; the resulting <4 x i32> to %S.
; Paired with @trunc_v4i64_to_v4i32.
262 define void @shuffle_v8i32_to_v4i32(ptr %L, ptr %S) nounwind {
263 ; AVX-LABEL: shuffle_v8i32_to_v4i32:
265 ; AVX-NEXT: vmovaps (%rdi), %xmm0
266 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
267 ; AVX-NEXT: vmovaps %xmm0, (%rsi)
270 ; AVX512F-LABEL: shuffle_v8i32_to_v4i32:
272 ; AVX512F-NEXT: vmovaps (%rdi), %xmm0
273 ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
274 ; AVX512F-NEXT: vmovaps %xmm0, (%rsi)
277 ; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
279 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
280 ; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
281 ; AVX512VL-NEXT: vzeroupper
282 ; AVX512VL-NEXT: retq
284 ; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
286 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
287 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
288 ; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
289 ; AVX512BW-NEXT: retq
291 ; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
292 ; AVX512BWVL: # %bb.0:
293 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
294 ; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
295 ; AVX512BWVL-NEXT: vzeroupper
296 ; AVX512BWVL-NEXT: retq
298 ; AVX512VBMIVL-LABEL: shuffle_v8i32_to_v4i32:
299 ; AVX512VBMIVL: # %bb.0:
300 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
301 ; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi)
302 ; AVX512VBMIVL-NEXT: vzeroupper
303 ; AVX512VBMIVL-NEXT: retq
304 %vec = load <8 x i32>, ptr %L
305 %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
306 store <4 x i32> %strided.vec, ptr %S
; Truncate form of the v4i64 -> v4i32 narrowing.
; Loads an <8 x i32> vector from %L, reinterprets it as <4 x i64>, truncates
; every element to i32 and stores the <4 x i32> result to %S.
; Counterpart of @shuffle_v8i32_to_v4i32. This function carries separate
; expectations for the three AVX2 tuning configurations in the RUN lines.
310 define void @trunc_v4i64_to_v4i32(ptr %L, ptr %S) nounwind {
311 ; AVX1-LABEL: trunc_v4i64_to_v4i32:
313 ; AVX1-NEXT: vmovaps (%rdi), %xmm0
314 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
315 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
318 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
319 ; AVX2-SLOW: # %bb.0:
320 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
321 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
322 ; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi)
323 ; AVX2-SLOW-NEXT: retq
325 ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i32:
326 ; AVX2-FAST-ALL: # %bb.0:
327 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
328 ; AVX2-FAST-ALL-NEXT: # ymm0 = mem[0,1,0,1]
329 ; AVX2-FAST-ALL-NEXT: vpermps (%rdi), %ymm0, %ymm0
330 ; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, (%rsi)
331 ; AVX2-FAST-ALL-NEXT: vzeroupper
332 ; AVX2-FAST-ALL-NEXT: retq
334 ; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i32:
335 ; AVX2-FAST-PERLANE: # %bb.0:
336 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0
337 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
338 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rsi)
339 ; AVX2-FAST-PERLANE-NEXT: retq
341 ; AVX512F-LABEL: trunc_v4i64_to_v4i32:
343 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
344 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
345 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
346 ; AVX512F-NEXT: vzeroupper
349 ; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
351 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
352 ; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
353 ; AVX512VL-NEXT: vzeroupper
354 ; AVX512VL-NEXT: retq
356 ; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
358 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
359 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
360 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
361 ; AVX512BW-NEXT: vzeroupper
362 ; AVX512BW-NEXT: retq
364 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
365 ; AVX512BWVL: # %bb.0:
366 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
367 ; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
368 ; AVX512BWVL-NEXT: vzeroupper
369 ; AVX512BWVL-NEXT: retq
371 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32:
372 ; AVX512VBMIVL: # %bb.0:
373 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
374 ; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi)
375 ; AVX512VBMIVL-NEXT: vzeroupper
376 ; AVX512VBMIVL-NEXT: retq
377 %vec = load <8 x i32>, ptr %L
378 %bc = bitcast <8 x i32> %vec to <4 x i64>
379 %strided.vec = trunc <4 x i64> %bc to <4 x i32>
380 store <4 x i32> %strided.vec, ptr %S
; Shuffle form of the v8i32 -> v8i8 narrowing.
; Loads a <32 x i8> vector from %L, keeps every fourth byte
; (0, 4, ..., 28) and stores the resulting <8 x i8> to %S.
; Paired with @trunc_v8i32_to_v8i8.
384 define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind {
385 ; AVX1-LABEL: shuffle_v32i8_to_v8i8:
387 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
388 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
389 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
390 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
391 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
392 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
393 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
396 ; AVX2-LABEL: shuffle_v32i8_to_v8i8:
398 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
399 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
400 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
401 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
402 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
403 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
404 ; AVX2-NEXT: vmovq %xmm0, (%rsi)
407 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
409 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
410 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
411 ; AVX512F-NEXT: vmovq %xmm0, (%rsi)
412 ; AVX512F-NEXT: vzeroupper
415 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
417 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
418 ; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
419 ; AVX512VL-NEXT: vzeroupper
420 ; AVX512VL-NEXT: retq
422 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
424 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
425 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
426 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
427 ; AVX512BW-NEXT: vzeroupper
428 ; AVX512BW-NEXT: retq
430 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
431 ; AVX512BWVL: # %bb.0:
432 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
433 ; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
434 ; AVX512BWVL-NEXT: vzeroupper
435 ; AVX512BWVL-NEXT: retq
437 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
438 ; AVX512VBMIVL: # %bb.0:
439 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
440 ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, (%rsi)
441 ; AVX512VBMIVL-NEXT: vzeroupper
442 ; AVX512VBMIVL-NEXT: retq
443 %vec = load <32 x i8>, ptr %L
444 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
445 store <8 x i8> %strided.vec, ptr %S
; Truncate form of the v8i32 -> v8i8 narrowing.
; Loads a <32 x i8> vector from %L, reinterprets it as <8 x i32>, truncates
; every element to i8 and stores the <8 x i8> result to %S.
; Counterpart of @shuffle_v32i8_to_v8i8.
449 define void @trunc_v8i32_to_v8i8(ptr %L, ptr %S) nounwind {
450 ; AVX1-LABEL: trunc_v8i32_to_v8i8:
452 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
453 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
454 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
455 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
456 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
457 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
458 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
461 ; AVX2-LABEL: trunc_v8i32_to_v8i8:
463 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
464 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
465 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
466 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
467 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
468 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
469 ; AVX2-NEXT: vmovq %xmm0, (%rsi)
472 ; AVX512F-LABEL: trunc_v8i32_to_v8i8:
474 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
475 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
476 ; AVX512F-NEXT: vmovq %xmm0, (%rsi)
477 ; AVX512F-NEXT: vzeroupper
480 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
482 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
483 ; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
484 ; AVX512VL-NEXT: vzeroupper
485 ; AVX512VL-NEXT: retq
487 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
489 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
490 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
491 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
492 ; AVX512BW-NEXT: vzeroupper
493 ; AVX512BW-NEXT: retq
495 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
496 ; AVX512BWVL: # %bb.0:
497 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
498 ; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
499 ; AVX512BWVL-NEXT: vzeroupper
500 ; AVX512BWVL-NEXT: retq
502 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8:
503 ; AVX512VBMIVL: # %bb.0:
504 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
505 ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, (%rsi)
506 ; AVX512VBMIVL-NEXT: vzeroupper
507 ; AVX512VBMIVL-NEXT: retq
508 %vec = load <32 x i8>, ptr %L
509 %bc = bitcast <32 x i8> %vec to <8 x i32>
510 %strided.vec = trunc <8 x i32> %bc to <8 x i8>
511 store <8 x i8> %strided.vec, ptr %S
516 ; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0};
; Truncates the <8 x i32> argument to <8 x i8>, reinterprets those eight
; bytes as a single i64 and returns it as element 0 of a <2 x i64> whose
; other element is zero (the base vector is zeroinitializer).
518 define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
519 ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
521 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
522 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
523 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
524 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
525 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
526 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
527 ; AVX1-NEXT: vzeroupper
530 ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
532 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
533 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
534 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
535 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
536 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
537 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
538 ; AVX2-NEXT: vzeroupper
541 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
543 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
544 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
545 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
546 ; AVX512F-NEXT: vzeroupper
549 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
551 ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
552 ; AVX512VL-NEXT: vzeroupper
553 ; AVX512VL-NEXT: retq
555 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
557 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
558 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
559 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
560 ; AVX512BW-NEXT: vzeroupper
561 ; AVX512BW-NEXT: retq
563 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
564 ; AVX512BWVL: # %bb.0:
565 ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
566 ; AVX512BWVL-NEXT: vzeroupper
567 ; AVX512BWVL-NEXT: retq
569 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
570 ; AVX512VBMIVL: # %bb.0:
571 ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
572 ; AVX512VBMIVL-NEXT: vzeroupper
573 ; AVX512VBMIVL-NEXT: retq
574 %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
575 %bc = bitcast <8 x i8> %truncated.vec to i64
576 %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
577 ret <2 x i64> %result
; Truncates the <8 x i32> argument to <8 x i8>, zero-extends each byte to
; i16, views the result as <16 x i8> and then regroups the bytes so that all
; even-indexed bytes come first and all odd-indexed bytes follow.
; NOTE(review): on the little-endian x86_64 targets in the RUN lines the even
; bytes should be the truncated values and the odd bytes the zero-extension
; padding, giving "8 truncated bytes, then 8 zero bytes" - confirm if this
; test is ever reused for another target.
580 define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind {
581 ; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
583 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
584 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
585 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
586 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
587 ; AVX1-NEXT: vzeroupper
590 ; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
592 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
593 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
594 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
595 ; AVX2-NEXT: vzeroupper
598 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
600 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
601 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
602 ; AVX512F-NEXT: vzeroupper
605 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
607 ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
608 ; AVX512VL-NEXT: vzeroupper
609 ; AVX512VL-NEXT: retq
611 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
613 ; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
614 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
615 ; AVX512BW-NEXT: vzeroupper
616 ; AVX512BW-NEXT: retq
618 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
619 ; AVX512BWVL: # %bb.0:
620 ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
621 ; AVX512BWVL-NEXT: vzeroupper
622 ; AVX512BWVL-NEXT: retq
624 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
625 ; AVX512VBMIVL: # %bb.0:
626 ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
627 ; AVX512VBMIVL-NEXT: vzeroupper
628 ; AVX512VBMIVL-NEXT: retq
629 %truncated = trunc <8 x i32> %vec to <8 x i8>
630 %truncated.ext = zext <8 x i8> %truncated to <8 x i16>
631 %bc = bitcast <8 x i16> %truncated.ext to <16 x i8>
; Mask = even byte indices 0..14 followed by odd byte indices 1..15; every
; index is below 16, so only %bc is read and the zero operand is unused.
632 %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
633 ret <16 x i8> %result
; Truncates the <8 x i32> argument only as far as <8 x i16>, views that as
; <16 x i8>, and finishes the narrowing with a byte shuffle: the first eight
; result bytes are the even-indexed bytes of the i16 vector and the last
; eight are taken from the zeroinitializer operand.
636 define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind {
637 ; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
639 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
640 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
641 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
642 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
643 ; AVX1-NEXT: vzeroupper
646 ; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
648 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
649 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
650 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
651 ; AVX2-NEXT: vzeroupper
654 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
656 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
657 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
658 ; AVX512F-NEXT: vzeroupper
661 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
663 ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
664 ; AVX512VL-NEXT: vzeroupper
665 ; AVX512VL-NEXT: retq
667 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
669 ; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
670 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
671 ; AVX512BW-NEXT: vzeroupper
672 ; AVX512BW-NEXT: retq
674 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
675 ; AVX512BWVL: # %bb.0:
676 ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
677 ; AVX512BWVL-NEXT: vzeroupper
678 ; AVX512BWVL-NEXT: retq
680 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
681 ; AVX512VBMIVL: # %bb.0:
682 ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
683 ; AVX512VBMIVL-NEXT: vzeroupper
684 ; AVX512VBMIVL-NEXT: retq
685 %truncated = trunc <8 x i32> %vec to <8 x i16>
686 %bc = bitcast <8 x i16> %truncated to <16 x i8>
; The last eight mask indices (17, 20, 24, 22, 31, 28, 28, 29) are all >= 16,
; so they select elements of the all-zero second operand; their irregular
; order does not affect the result.
687 %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29>
688 ret <16 x i8> %result
; Truncates the <8 x i32> argument to <8 x i8> and widens the result to
; <16 x i8> by concatenating it with an all-zero <8 x i8> vector, so the
; returned vector is the eight truncated bytes followed by eight zero bytes.
691 define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
692 ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
694 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
695 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
696 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
697 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
698 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
699 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
700 ; AVX1-NEXT: vzeroupper
703 ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
705 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
706 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
707 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
708 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
709 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
710 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
711 ; AVX2-NEXT: vzeroupper
714 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
716 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
717 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
718 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
719 ; AVX512F-NEXT: vzeroupper
722 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
724 ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
725 ; AVX512VL-NEXT: vzeroupper
726 ; AVX512VL-NEXT: retq
728 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
730 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
731 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
732 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
733 ; AVX512BW-NEXT: vzeroupper
734 ; AVX512BW-NEXT: retq
736 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
737 ; AVX512BWVL: # %bb.0:
738 ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
739 ; AVX512BWVL-NEXT: vzeroupper
740 ; AVX512BWVL-NEXT: retq
742 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
743 ; AVX512VBMIVL: # %bb.0:
744 ; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
745 ; AVX512VBMIVL-NEXT: vzeroupper
746 ; AVX512VBMIVL-NEXT: retq
747 %truncated = trunc <8 x i32> %vec to <8 x i8>
; Identity mask 0..15 over two 8-element operands: indices 0-7 read
; %truncated and indices 8-15 read the zeroinitializer operand.
748 %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
749 ret <16 x i8> %result
; Truncates the <4 x i64> argument to <4 x i16>, reinterprets those four
; halfwords as a single i64 and returns it as element 0 of a <2 x i64> whose
; other element is zero (the base vector is zeroinitializer).
; The comment just below the define line shows the C expression this IR
; corresponds to.
752 define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
754 ; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0};
755 ; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
757 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
758 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
759 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
760 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
761 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
762 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
763 ; AVX1-NEXT: vzeroupper
766 ; AVX2-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
768 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
769 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
770 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
771 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
772 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
773 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
774 ; AVX2-NEXT: vzeroupper
777 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
779 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
780 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
781 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
782 ; AVX512F-NEXT: vzeroupper
785 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
787 ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
788 ; AVX512VL-NEXT: vzeroupper
789 ; AVX512VL-NEXT: retq
791 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
793 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
794 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
795 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
796 ; AVX512BW-NEXT: vzeroupper
797 ; AVX512BW-NEXT: retq
799 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
800 ; AVX512BWVL: # %bb.0:
801 ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
802 ; AVX512BWVL-NEXT: vzeroupper
803 ; AVX512BWVL-NEXT: retq
805 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
806 ; AVX512VBMIVL: # %bb.0:
807 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
808 ; AVX512VBMIVL-NEXT: vzeroupper
809 ; AVX512VBMIVL-NEXT: retq
810 %truncated = trunc <4 x i64> %vec to <4 x i16>
811 %bc = bitcast <4 x i16> %truncated to i64
812 %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
813 ret <2 x i64> %result
; Truncates the <4 x i64> argument to <4 x i16>, zero-extends each halfword
; to i32, views the result as <8 x i16> and then regroups the halfwords so
; that all even-indexed ones come first and all odd-indexed ones follow.
; NOTE(review): on the little-endian x86_64 targets in the RUN lines the even
; halfwords should be the truncated values and the odd halfwords the
; zero-extension padding, giving "4 truncated halfwords, then 4 zero
; halfwords" - confirm if this test is ever reused for another target.
816 define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind {
817 ; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
819 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
820 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
821 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
822 ; AVX1-NEXT: vzeroupper
825 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
826 ; AVX2-SLOW: # %bb.0:
827 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
828 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
829 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
830 ; AVX2-SLOW-NEXT: vzeroupper
831 ; AVX2-SLOW-NEXT: retq
833 ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
834 ; AVX2-FAST-ALL: # %bb.0:
835 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
836 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
837 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
838 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
839 ; AVX2-FAST-ALL-NEXT: vzeroupper
840 ; AVX2-FAST-ALL-NEXT: retq
842 ; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
843 ; AVX2-FAST-PERLANE: # %bb.0:
844 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
845 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
846 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
847 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
848 ; AVX2-FAST-PERLANE-NEXT: retq
850 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
852 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
853 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
854 ; AVX512F-NEXT: vzeroupper
857 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
859 ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
860 ; AVX512VL-NEXT: vzeroupper
861 ; AVX512VL-NEXT: retq
863 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
865 ; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
866 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
867 ; AVX512BW-NEXT: vzeroupper
868 ; AVX512BW-NEXT: retq
870 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
871 ; AVX512BWVL: # %bb.0:
872 ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
873 ; AVX512BWVL-NEXT: vzeroupper
874 ; AVX512BWVL-NEXT: retq
876 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
877 ; AVX512VBMIVL: # %bb.0:
878 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
879 ; AVX512VBMIVL-NEXT: vzeroupper
880 ; AVX512VBMIVL-NEXT: retq
881 %truncated = trunc <4 x i64> %vec to <4 x i16>
882 %truncated.ext = zext <4 x i16> %truncated to <4 x i32>
883 %bc = bitcast <4 x i32> %truncated.ext to <8 x i16>
; Mask = even halfword indices 0,2,4,6 followed by odd indices 1,3,5,7; every
; index is below 8, so only %bc is read and the zero operand is unused.
884 %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
885 ret <8 x i16> %result
; Truncate <4 x i64> to <4 x i32>, reinterpret the result as <8 x i16>, then
; gather the even i16 lanes (0,2,4,6) into the low half of the result. The
; mask entries for lanes 4-7 are either >= 8 (selecting from the
; zeroinitializer operand) or undef, so the upper half is zero/undef.
888 define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind {
889 ; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
891 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
892 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
893 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
894 ; AVX1-NEXT: vzeroupper
897 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
898 ; AVX2-SLOW: # %bb.0:
899 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
900 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
901 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
902 ; AVX2-SLOW-NEXT: vzeroupper
903 ; AVX2-SLOW-NEXT: retq
905 ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
906 ; AVX2-FAST-ALL: # %bb.0:
907 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
908 ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
909 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
910 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
911 ; AVX2-FAST-ALL-NEXT: vzeroupper
912 ; AVX2-FAST-ALL-NEXT: retq
914 ; AVX2-FAST-PERLANE-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
915 ; AVX2-FAST-PERLANE: # %bb.0:
916 ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
917 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
918 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
919 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
920 ; AVX2-FAST-PERLANE-NEXT: retq
922 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
924 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm0
925 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
926 ; AVX512F-NEXT: vzeroupper
929 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
931 ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
932 ; AVX512VL-NEXT: vzeroupper
933 ; AVX512VL-NEXT: retq
935 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
937 ; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
938 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
939 ; AVX512BW-NEXT: vzeroupper
940 ; AVX512BW-NEXT: retq
942 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
943 ; AVX512BWVL: # %bb.0:
944 ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
945 ; AVX512BWVL-NEXT: vzeroupper
946 ; AVX512BWVL-NEXT: retq
948 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
949 ; AVX512VBMIVL: # %bb.0:
950 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
951 ; AVX512VBMIVL-NEXT: vzeroupper
952 ; AVX512VBMIVL-NEXT: retq
953 %truncated = trunc <4 x i64> %vec to <4 x i32>
954 %bc = bitcast <4 x i32> %truncated to <8 x i16>
955 %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13>
956 ret <8 x i16> %result
; Truncate <4 x i64> directly to <4 x i16> and widen to <8 x i16> by
; concatenating with a zero vector: mask indices 0-3 select the truncated
; elements, indices 4-7 select from the zeroinitializer operand.
959 define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
960 ; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
962 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
963 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
964 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
965 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
966 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
967 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
968 ; AVX1-NEXT: vzeroupper
971 ; AVX2-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
973 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
974 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
975 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
976 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
977 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
978 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
979 ; AVX2-NEXT: vzeroupper
982 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
984 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
985 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
986 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
987 ; AVX512F-NEXT: vzeroupper
990 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
992 ; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
993 ; AVX512VL-NEXT: vzeroupper
994 ; AVX512VL-NEXT: retq
996 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
998 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
999 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
1000 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1001 ; AVX512BW-NEXT: vzeroupper
1002 ; AVX512BW-NEXT: retq
1004 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
1005 ; AVX512BWVL: # %bb.0:
1006 ; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
1007 ; AVX512BWVL-NEXT: vzeroupper
1008 ; AVX512BWVL-NEXT: retq
1010 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
1011 ; AVX512VBMIVL: # %bb.0:
1012 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
1013 ; AVX512VBMIVL-NEXT: vzeroupper
1014 ; AVX512VBMIVL-NEXT: retq
1015 %truncated = trunc <4 x i64> %vec to <4 x i16>
1016 %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1017 ret <8 x i16> %result
; Truncate <4 x i64> to <4 x i8> and widen to <16 x i8>. Mask indices 0-3
; select the truncated bytes; every remaining defined index is in the range
; 4-7 and therefore selects from the <4 x i8> zeroinitializer operand, so
; bytes 4-15 of the result are zero (one lane is undef).
1020 define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
1021 ; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1023 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1024 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
1025 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1026 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1027 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1028 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1029 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1030 ; AVX1-NEXT: vzeroupper
1033 ; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1035 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1036 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
1037 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1038 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1039 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1040 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1041 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1042 ; AVX2-NEXT: vzeroupper
1045 ; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1047 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1048 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
1049 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
1050 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1051 ; AVX512F-NEXT: vzeroupper
1052 ; AVX512F-NEXT: retq
1054 ; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1055 ; AVX512VL: # %bb.0:
1056 ; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
1057 ; AVX512VL-NEXT: vzeroupper
1058 ; AVX512VL-NEXT: retq
1060 ; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1061 ; AVX512BW: # %bb.0:
1062 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1063 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
1064 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
1065 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1066 ; AVX512BW-NEXT: vzeroupper
1067 ; AVX512BW-NEXT: retq
1069 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1070 ; AVX512BWVL: # %bb.0:
1071 ; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
1072 ; AVX512BWVL-NEXT: vzeroupper
1073 ; AVX512BWVL-NEXT: retq
1075 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
1076 ; AVX512VBMIVL: # %bb.0:
1077 ; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
1078 ; AVX512VBMIVL-NEXT: vzeroupper
1079 ; AVX512VBMIVL-NEXT: retq
1080 %truncated = trunc <4 x i64> %vec to <4 x i8>
1081 %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7>
1082 ret <16 x i8> %result
; Load <16 x i16> from %L, keep every fourth element (indices 0,4,8,12) and
; store the resulting <4 x i16> to %S. This is the shufflevector spelling of
; the i64 -> i16 truncation exercised by @trunc_v4i64_to_v4i16.
1085 define void @shuffle_v16i16_to_v4i16(ptr %L, ptr %S) nounwind {
1086 ; AVX-LABEL: shuffle_v16i16_to_v4i16:
1088 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
1089 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
1090 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
1091 ; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1092 ; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
1093 ; AVX-NEXT: vmovq %xmm0, (%rsi)
1096 ; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
1098 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1099 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
1100 ; AVX512F-NEXT: vmovq %xmm0, (%rsi)
1101 ; AVX512F-NEXT: vzeroupper
1102 ; AVX512F-NEXT: retq
1104 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
1105 ; AVX512VL: # %bb.0:
1106 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
1107 ; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
1108 ; AVX512VL-NEXT: vzeroupper
1109 ; AVX512VL-NEXT: retq
1111 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
1112 ; AVX512BW: # %bb.0:
1113 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
1114 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
1115 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
1116 ; AVX512BW-NEXT: vzeroupper
1117 ; AVX512BW-NEXT: retq
1119 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
1120 ; AVX512BWVL: # %bb.0:
1121 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
1122 ; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
1123 ; AVX512BWVL-NEXT: vzeroupper
1124 ; AVX512BWVL-NEXT: retq
1126 ; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
1127 ; AVX512VBMIVL: # %bb.0:
1128 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
1129 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, (%rsi)
1130 ; AVX512VBMIVL-NEXT: vzeroupper
1131 ; AVX512VBMIVL-NEXT: retq
1132 %vec = load <16 x i16>, ptr %L
1133 %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
1134 store <4 x i16> %strided.vec, ptr %S
; Load <16 x i16> from %L, reinterpret it as <4 x i64>, truncate each element
; to i16 and store the <4 x i16> result to %S. Explicit-trunc counterpart of
; @shuffle_v16i16_to_v4i16; the expected code for the two is the same.
1138 define void @trunc_v4i64_to_v4i16(ptr %L, ptr %S) nounwind {
1139 ; AVX-LABEL: trunc_v4i64_to_v4i16:
1141 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
1142 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
1143 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
1144 ; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1145 ; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
1146 ; AVX-NEXT: vmovq %xmm0, (%rsi)
1149 ; AVX512F-LABEL: trunc_v4i64_to_v4i16:
1151 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1152 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
1153 ; AVX512F-NEXT: vmovq %xmm0, (%rsi)
1154 ; AVX512F-NEXT: vzeroupper
1155 ; AVX512F-NEXT: retq
1157 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
1158 ; AVX512VL: # %bb.0:
1159 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
1160 ; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
1161 ; AVX512VL-NEXT: vzeroupper
1162 ; AVX512VL-NEXT: retq
1164 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
1165 ; AVX512BW: # %bb.0:
1166 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
1167 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
1168 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
1169 ; AVX512BW-NEXT: vzeroupper
1170 ; AVX512BW-NEXT: retq
1172 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
1173 ; AVX512BWVL: # %bb.0:
1174 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
1175 ; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
1176 ; AVX512BWVL-NEXT: vzeroupper
1177 ; AVX512BWVL-NEXT: retq
1179 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16:
1180 ; AVX512VBMIVL: # %bb.0:
1181 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
1182 ; AVX512VBMIVL-NEXT: vpmovqw %ymm0, (%rsi)
1183 ; AVX512VBMIVL-NEXT: vzeroupper
1184 ; AVX512VBMIVL-NEXT: retq
1185 %vec = load <16 x i16>, ptr %L
1186 %bc = bitcast <16 x i16> %vec to <4 x i64>
1187 %strided.vec = trunc <4 x i64> %bc to <4 x i16>
1188 store <4 x i16> %strided.vec, ptr %S
; Load <32 x i8> from %L, keep every eighth byte (indices 0,8,16,24) and store
; the resulting <4 x i8> to %S. This is the shufflevector spelling of the
; i64 -> i8 truncation exercised by @trunc_v4i64_to_v4i8.
1192 define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind {
1193 ; AVX1-LABEL: shuffle_v32i8_to_v4i8:
1195 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
1196 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
1197 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
1198 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1199 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1200 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1201 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
1204 ; AVX2-LABEL: shuffle_v32i8_to_v4i8:
1206 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1207 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
1208 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
1209 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1210 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1211 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1212 ; AVX2-NEXT: vmovd %xmm0, (%rsi)
1215 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
1217 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1218 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
1219 ; AVX512F-NEXT: vmovd %xmm0, (%rsi)
1220 ; AVX512F-NEXT: vzeroupper
1221 ; AVX512F-NEXT: retq
1223 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
1224 ; AVX512VL: # %bb.0:
1225 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
1226 ; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
1227 ; AVX512VL-NEXT: vzeroupper
1228 ; AVX512VL-NEXT: retq
1230 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
1231 ; AVX512BW: # %bb.0:
1232 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
1233 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
1234 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
1235 ; AVX512BW-NEXT: vzeroupper
1236 ; AVX512BW-NEXT: retq
1238 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
1239 ; AVX512BWVL: # %bb.0:
1240 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
1241 ; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
1242 ; AVX512BWVL-NEXT: vzeroupper
1243 ; AVX512BWVL-NEXT: retq
1245 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
1246 ; AVX512VBMIVL: # %bb.0:
1247 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
1248 ; AVX512VBMIVL-NEXT: vpmovqb %ymm0, (%rsi)
1249 ; AVX512VBMIVL-NEXT: vzeroupper
1250 ; AVX512VBMIVL-NEXT: retq
1251 %vec = load <32 x i8>, ptr %L
1252 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
1253 store <4 x i8> %strided.vec, ptr %S
; Load <32 x i8> from %L, reinterpret it as <4 x i64>, truncate each element
; to i8 and store the <4 x i8> result to %S. Explicit-trunc counterpart of
; @shuffle_v32i8_to_v4i8; the expected code for the two is the same.
1257 define void @trunc_v4i64_to_v4i8(ptr %L, ptr %S) nounwind {
1258 ; AVX1-LABEL: trunc_v4i64_to_v4i8:
1260 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
1261 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
1262 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
1263 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1264 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1265 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1266 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
1269 ; AVX2-LABEL: trunc_v4i64_to_v4i8:
1271 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1272 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
1273 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
1274 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1275 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1276 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1277 ; AVX2-NEXT: vmovd %xmm0, (%rsi)
1280 ; AVX512F-LABEL: trunc_v4i64_to_v4i8:
1282 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1283 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
1284 ; AVX512F-NEXT: vmovd %xmm0, (%rsi)
1285 ; AVX512F-NEXT: vzeroupper
1286 ; AVX512F-NEXT: retq
1288 ; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
1289 ; AVX512VL: # %bb.0:
1290 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
1291 ; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
1292 ; AVX512VL-NEXT: vzeroupper
1293 ; AVX512VL-NEXT: retq
1295 ; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
1296 ; AVX512BW: # %bb.0:
1297 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
1298 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
1299 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
1300 ; AVX512BW-NEXT: vzeroupper
1301 ; AVX512BW-NEXT: retq
1303 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
1304 ; AVX512BWVL: # %bb.0:
1305 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
1306 ; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
1307 ; AVX512BWVL-NEXT: vzeroupper
1308 ; AVX512BWVL-NEXT: retq
1310 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8:
1311 ; AVX512VBMIVL: # %bb.0:
1312 ; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
1313 ; AVX512VBMIVL-NEXT: vpmovqb %ymm0, (%rsi)
1314 ; AVX512VBMIVL-NEXT: vzeroupper
1315 ; AVX512VBMIVL-NEXT: retq
1316 %vec = load <32 x i8>, ptr %L
1317 %bc = bitcast <32 x i8> %vec to <4 x i64>
1318 %strided.vec = trunc <4 x i64> %bc to <4 x i8>
1319 store <4 x i8> %strided.vec, ptr %S
1323 ; In this case not all elements are collected from the same source vector, so
1324 ; the resulting BUILD_VECTOR should not be combined to a truncate.
; Concretely: the even-indexed bytes of %v (mask 0,2,...,30) form the result,
; and element 0 is then overwritten with byte 0 of %w, so two different source
; vectors feed the final <16 x i8>.
1325 define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
1326 ; AVX1-LABEL: negative:
1328 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
1329 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1330 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
1331 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1332 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1333 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
1334 ; AVX1-NEXT: vzeroupper
1337 ; AVX2-LABEL: negative:
1339 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
1340 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1341 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
1342 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1343 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1344 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1345 ; AVX2-NEXT: vzeroupper
1348 ; AVX512F-LABEL: negative:
1350 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
1351 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1352 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1353 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1354 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1355 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1356 ; AVX512F-NEXT: vzeroupper
1357 ; AVX512F-NEXT: retq
1359 ; AVX512VL-LABEL: negative:
1360 ; AVX512VL: # %bb.0:
1361 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
1362 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1363 ; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
1364 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1365 ; AVX512VL-NEXT: vzeroupper
1366 ; AVX512VL-NEXT: retq
1368 ; AVX512BW-LABEL: negative:
1369 ; AVX512BW: # %bb.0:
1370 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
1371 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1372 ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
1373 ; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
1374 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1375 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1376 ; AVX512BW-NEXT: vzeroupper
1377 ; AVX512BW-NEXT: retq
1379 ; AVX512BWVL-LABEL: negative:
1380 ; AVX512BWVL: # %bb.0:
1381 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
1382 ; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001
1383 ; AVX512BWVL-NEXT: kmovd %eax, %k1
1384 ; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
1385 ; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1386 ; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1387 ; AVX512BWVL-NEXT: vzeroupper
1388 ; AVX512BWVL-NEXT: retq
1390 ; AVX512VBMIVL-LABEL: negative:
1391 ; AVX512VBMIVL: # %bb.0:
1392 ; AVX512VBMIVL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1393 ; AVX512VBMIVL-NEXT: # ymm2 = mem[0,1,0,1]
1394 ; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
1395 ; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1396 ; AVX512VBMIVL-NEXT: vzeroupper
1397 ; AVX512VBMIVL-NEXT: retq
1398 %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
1399 %w0 = extractelement <32 x i8> %w, i32 0
1400 %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
1401 ret <16 x i8> %merged