1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
5 ; A single 32-bit (i686) run to make sure codegen is reasonable on that target too.
6 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32-AVX512F
; Loads <2 x double> elements 1, 2 and 4 of %ptr and concatenates them as
; [1, 2, undef, 4]. The expected code fetches the adjacent pair with one
; 256-bit vmovups from byte offset 16 and inserts element 4 from offset 64.
8 define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable noinline ssp {
9 ; ALL-LABEL: merge_8f64_2f64_12u4:
11 ; ALL-NEXT: vmovups 16(%rdi), %ymm0
12 ; ALL-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1
13 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
16 ; X32-AVX512F-LABEL: merge_8f64_2f64_12u4:
17 ; X32-AVX512F: # %bb.0:
18 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
19 ; X32-AVX512F-NEXT: vmovups 16(%eax), %ymm0
20 ; X32-AVX512F-NEXT: vinsertf128 $1, 64(%eax), %ymm0, %ymm1
21 ; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
22 ; X32-AVX512F-NEXT: retl
23 %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 1
24 %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
25 %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 4
26 %val0 = load <2 x double>, <2 x double>* %ptr0
27 %val1 = load <2 x double>, <2 x double>* %ptr1
28 %val3 = load <2 x double>, <2 x double>* %ptr3
29 %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
30 %res23 = shufflevector <2 x double> undef, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
31 %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Loads <2 x double> elements 2, 3 and 5 of %ptr and concatenates them as
; [2, 3, zero, 5]. The expected code uses one 256-bit vmovups from byte
; offset 32 and inserts element 5 (offset 80) into a zeroed register.
35 define <8 x double> @merge_8f64_2f64_23z5(<2 x double>* %ptr) nounwind uwtable noinline ssp {
36 ; ALL-LABEL: merge_8f64_2f64_23z5:
38 ; ALL-NEXT: vmovups 32(%rdi), %ymm0
39 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
40 ; ALL-NEXT: vinsertf128 $1, 80(%rdi), %ymm1, %ymm1
41 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
44 ; X32-AVX512F-LABEL: merge_8f64_2f64_23z5:
45 ; X32-AVX512F: # %bb.0:
46 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
47 ; X32-AVX512F-NEXT: vmovups 32(%eax), %ymm0
48 ; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
49 ; X32-AVX512F-NEXT: vinsertf128 $1, 80(%eax), %ymm1, %ymm1
50 ; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
51 ; X32-AVX512F-NEXT: retl
52 %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
53 %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3
54 %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 5
55 %val0 = load <2 x double>, <2 x double>* %ptr0
56 %val1 = load <2 x double>, <2 x double>* %ptr1
57 %val3 = load <2 x double>, <2 x double>* %ptr3
58 %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
59 %res23 = shufflevector <2 x double> zeroinitializer, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
60 %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Builds an <8 x double> whose low half is zero and whose high half is
; <4 x double> element 2 of %ptr. The expected code zeroes a register and
; inserts the 256-bit load from byte offset 64 with vinsertf64x4.
64 define <8 x double> @merge_8f64_4f64_z2(<4 x double>* %ptr) nounwind uwtable noinline ssp {
65 ; ALL-LABEL: merge_8f64_4f64_z2:
67 ; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
68 ; ALL-NEXT: vinsertf64x4 $1, 64(%rdi), %zmm0, %zmm0
71 ; X32-AVX512F-LABEL: merge_8f64_4f64_z2:
72 ; X32-AVX512F: # %bb.0:
73 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
74 ; X32-AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
75 ; X32-AVX512F-NEXT: vinsertf64x4 $1, 64(%eax), %zmm0, %zmm0
76 ; X32-AVX512F-NEXT: retl
77 %ptr1 = getelementptr inbounds <4 x double>, <4 x double>* %ptr, i64 2
78 %val1 = load <4 x double>, <4 x double>* %ptr1
79 %res = shufflevector <4 x double> zeroinitializer, <4 x double> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Inserts scalar doubles 2, 3 and 9 of %ptr into lanes 0, 1 and 7; the other
; lanes stay undef. The expected code is a single 512-bit vmovups from byte
; offset 16.
83 define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noinline ssp {
84 ; ALL-LABEL: merge_8f64_f64_23uuuuu9:
86 ; ALL-NEXT: vmovups 16(%rdi), %zmm0
89 ; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9:
90 ; X32-AVX512F: # %bb.0:
91 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
92 ; X32-AVX512F-NEXT: vmovups 16(%eax), %zmm0
93 ; X32-AVX512F-NEXT: retl
94 %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
95 %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
96 %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
97 %val0 = load double, double* %ptr0
98 %val1 = load double, double* %ptr1
99 %val7 = load double, double* %ptr7
100 %res0 = insertelement <8 x double> undef, double %val0, i32 0
101 %res1 = insertelement <8 x double> %res0, double %val1, i32 1
102 %res7 = insertelement <8 x double> %res1, double %val7, i32 7
103 ret <8 x double> %res7
; Inserts scalar doubles 1 and 2 of %ptr into lanes 0 and 1 and zero into
; lanes 2, 3, 6 and 7; lanes 4 and 5 stay undef. The expected code is a
; single 128-bit vmovups from byte offset 8.
106 define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp {
107 ; ALL-LABEL: merge_8f64_f64_12zzuuzz:
109 ; ALL-NEXT: vmovups 8(%rdi), %xmm0
112 ; X32-AVX512F-LABEL: merge_8f64_f64_12zzuuzz:
113 ; X32-AVX512F: # %bb.0:
114 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
115 ; X32-AVX512F-NEXT: vmovups 8(%eax), %xmm0
116 ; X32-AVX512F-NEXT: retl
117 %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
118 %ptr1 = getelementptr inbounds double, double* %ptr, i64 2
119 %val0 = load double, double* %ptr0
120 %val1 = load double, double* %ptr1
121 %res0 = insertelement <8 x double> undef, double %val0, i32 0
122 %res1 = insertelement <8 x double> %res0, double %val1, i32 1
123 %res2 = insertelement <8 x double> %res1, double 0.0, i32 2
124 %res3 = insertelement <8 x double> %res2, double 0.0, i32 3
125 %res6 = insertelement <8 x double> %res3, double 0.0, i32 6
126 %res7 = insertelement <8 x double> %res6, double 0.0, i32 7
127 ret <8 x double> %res7
; Inserts scalar doubles 1, 3, 5 and 8 of %ptr into lanes 0, 2, 4 and 7 and
; zero into lane 5; lanes 1, 3 and 6 stay undef. The expected code is one
; 512-bit vmovdqu64 from byte offset 8 followed by a vpandq with a memory
; operand (presumably a constant mask that clears the zeroed lane).
130 define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
131 ; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
133 ; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0
134 ; ALL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
137 ; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
138 ; X32-AVX512F: # %bb.0:
139 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
140 ; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0
141 ; X32-AVX512F-NEXT: vpandq {{\.LCPI.*}}, %zmm0, %zmm0
142 ; X32-AVX512F-NEXT: retl
143 %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
144 %ptr2 = getelementptr inbounds double, double* %ptr, i64 3
145 %ptr4 = getelementptr inbounds double, double* %ptr, i64 5
146 %ptr7 = getelementptr inbounds double, double* %ptr, i64 8
147 %val0 = load double, double* %ptr0
148 %val2 = load double, double* %ptr2
149 %val4 = load double, double* %ptr4
150 %val7 = load double, double* %ptr7
151 %res0 = insertelement <8 x double> undef, double %val0, i32 0
152 %res2 = insertelement <8 x double> %res0, double %val2, i32 2
153 %res4 = insertelement <8 x double> %res2, double %val4, i32 4
154 %res5 = insertelement <8 x double> %res4, double 0.0, i32 5
155 %res7 = insertelement <8 x double> %res5, double %val7, i32 7
156 ret <8 x double> %res7
; Builds an <8 x i64> whose low half is zero and whose high half is
; <4 x i64> element 3 of %ptr. The expected code zeroes a register and
; inserts the 256-bit load from byte offset 96 with vinsertf64x4.
159 define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline ssp {
160 ; ALL-LABEL: merge_8i64_4i64_z3:
162 ; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
163 ; ALL-NEXT: vinsertf64x4 $1, 96(%rdi), %zmm0, %zmm0
166 ; X32-AVX512F-LABEL: merge_8i64_4i64_z3:
167 ; X32-AVX512F: # %bb.0:
168 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
169 ; X32-AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
170 ; X32-AVX512F-NEXT: vinsertf64x4 $1, 96(%eax), %zmm0, %zmm0
171 ; X32-AVX512F-NEXT: retl
172 %ptr1 = getelementptr inbounds <4 x i64>, <4 x i64>* %ptr, i64 3
173 %val1 = load <4 x i64>, <4 x i64>* %ptr1
174 %res = shufflevector <4 x i64> zeroinitializer, <4 x i64> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Inserts scalar i64 elements 5, 6 and 9 of %ptr into lanes 0, 1 and 4 and
; zero into lanes 2, 3, 6 and 7; lane 5 stays undef. The expected code loads
; the low pair with a 128-bit vmovups from byte offset 40, loads the third
; element with vmovsd, and joins the halves with vinsertf64x4.
178 define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp {
179 ; ALL-LABEL: merge_8i64_i64_56zz9uzz:
181 ; ALL-NEXT: vmovups 40(%rdi), %xmm0
182 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
183 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
186 ; X32-AVX512F-LABEL: merge_8i64_i64_56zz9uzz:
187 ; X32-AVX512F: # %bb.0:
188 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
189 ; X32-AVX512F-NEXT: vmovups 40(%eax), %xmm0
190 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
191 ; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
192 ; X32-AVX512F-NEXT: retl
193 %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 5
194 %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 6
195 %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 9
196 %val0 = load i64, i64* %ptr0
197 %val1 = load i64, i64* %ptr1
198 %val4 = load i64, i64* %ptr4
199 %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
200 %res1 = insertelement <8 x i64> %res0, i64 %val1, i32 1
201 %res2 = insertelement <8 x i64> %res1, i64 0, i32 2
202 %res3 = insertelement <8 x i64> %res2, i64 0, i32 3
203 %res4 = insertelement <8 x i64> %res3, i64 %val4, i32 4
204 %res6 = insertelement <8 x i64> %res4, i64 0, i32 6
205 %res7 = insertelement <8 x i64> %res6, i64 0, i32 7
; Inserts scalar i64 elements 1, 3, 5 and 8 of %ptr into lanes 0, 2, 4 and 7
; and zero into lane 5; lanes 1, 3 and 6 stay undef. The expected code is one
; 512-bit vmovdqu64 from byte offset 8 followed by a vpandq with a memory
; operand (presumably a constant mask that clears the zeroed lane).
209 define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
210 ; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
212 ; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0
213 ; ALL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
216 ; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
217 ; X32-AVX512F: # %bb.0:
218 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
219 ; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0
220 ; X32-AVX512F-NEXT: vpandq {{\.LCPI.*}}, %zmm0, %zmm0
221 ; X32-AVX512F-NEXT: retl
222 %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
223 %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
224 %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 5
225 %ptr7 = getelementptr inbounds i64, i64* %ptr, i64 8
226 %val0 = load i64, i64* %ptr0
227 %val2 = load i64, i64* %ptr2
228 %val4 = load i64, i64* %ptr4
229 %val7 = load i64, i64* %ptr7
230 %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
231 %res2 = insertelement <8 x i64> %res0, i64 %val2, i32 2
232 %res4 = insertelement <8 x i64> %res2, i64 %val4, i32 4
233 %res5 = insertelement <8 x i64> %res4, i64 0, i32 5
234 %res7 = insertelement <8 x i64> %res5, i64 %val7, i32 7
; Inserts scalar floats 8 and 9 of %ptr into lanes 0 and 1 and zero into
; lanes 2, 3, 4 and 15; the other lanes stay undef. The expected code is a
; single 64-bit vmovsd load.
238 define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwtable noinline ssp {
239 ; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
241 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
244 ; X32-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
245 ; X32-AVX512F: # %bb.0:
246 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
247 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
248 ; X32-AVX512F-NEXT: retl
249 %ptr0 = getelementptr inbounds float, float* %ptr, i64 8
250 %ptr1 = getelementptr inbounds float, float* %ptr, i64 9
251 %val0 = load float, float* %ptr0
252 %val1 = load float, float* %ptr1
253 %res0 = insertelement <16 x float> undef, float %val0, i32 0
254 %res1 = insertelement <16 x float> %res0, float %val1, i32 1
255 %res2 = insertelement <16 x float> %res1, float 0.0, i32 2
256 %res3 = insertelement <16 x float> %res2, float 0.0, i32 3
257 %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
258 %resF = insertelement <16 x float> %res4, float 0.0, i32 15
259 ret <16 x float> %resF
; Inserts scalar floats 4, 5 and 7 of %ptr into lanes 0, 1 and 3; all other
; lanes stay undef. The expected code is a single 128-bit vmovups from byte
; offset 16.
262 define <16 x float> @merge_16f32_f32_45u7uuuuuuuuuuuu(float* %ptr) nounwind uwtable noinline ssp {
263 ; ALL-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
265 ; ALL-NEXT: vmovups 16(%rdi), %xmm0
268 ; X32-AVX512F-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
269 ; X32-AVX512F: # %bb.0:
270 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
271 ; X32-AVX512F-NEXT: vmovups 16(%eax), %xmm0
272 ; X32-AVX512F-NEXT: retl
273 %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
274 %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
275 %ptr3 = getelementptr inbounds float, float* %ptr, i64 7
276 %val0 = load float, float* %ptr0
277 %val1 = load float, float* %ptr1
278 %val3 = load float, float* %ptr3
279 %res0 = insertelement <16 x float> undef, float %val0, i32 0
280 %res1 = insertelement <16 x float> %res0, float %val1, i32 1
281 %res3 = insertelement <16 x float> %res1, float %val3, i32 3
282 ret <16 x float> %res3
; Inserts scalar floats 0, 3, 12, 14 and 15 of %ptr into the lanes with the
; same indices; all other lanes stay undef. The expected code is a single
; 512-bit vmovups from the base pointer.
285 define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwtable noinline ssp {
286 ; ALL-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
288 ; ALL-NEXT: vmovups (%rdi), %zmm0
291 ; X32-AVX512F-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
292 ; X32-AVX512F: # %bb.0:
293 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
294 ; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
295 ; X32-AVX512F-NEXT: retl
296 %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
297 %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
298 %ptrC = getelementptr inbounds float, float* %ptr, i64 12
299 %ptrE = getelementptr inbounds float, float* %ptr, i64 14
300 %ptrF = getelementptr inbounds float, float* %ptr, i64 15
301 %val0 = load float, float* %ptr0
302 %val3 = load float, float* %ptr3
303 %valC = load float, float* %ptrC
304 %valE = load float, float* %ptrE
305 %valF = load float, float* %ptrF
306 %res0 = insertelement <16 x float> undef, float %val0, i32 0
307 %res3 = insertelement <16 x float> %res0, float %val3, i32 3
308 %resC = insertelement <16 x float> %res3, float %valC, i32 12
309 %resE = insertelement <16 x float> %resC, float %valE, i32 14
310 %resF = insertelement <16 x float> %resE, float %valF, i32 15
311 ret <16 x float> %resF
; Inserts scalar floats 0, 3, 12, 14 and 15 of %ptr into the lanes with the
; same indices and zero into lanes 4, 5 and 13. The expected code is a
; 512-bit vmovups, a zeroed register, and a vpermi2ps whose index vector
; takes lanes 4, 5 and 13 from the zero operand (indices 20, 21 and 29).
; The function name has 'z' at position 11 and 'u' at position 13, whereas
; the IR below zeroes lane 13 and leaves lane 11 undef.
314 define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
315 ; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
317 ; ALL-NEXT: vmovups (%rdi), %zmm1
318 ; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
319 ; ALL-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
320 ; ALL-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
323 ; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
324 ; X32-AVX512F: # %bb.0:
325 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
326 ; X32-AVX512F-NEXT: vmovups (%eax), %zmm1
327 ; X32-AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
328 ; X32-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
329 ; X32-AVX512F-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
330 ; X32-AVX512F-NEXT: retl
331 %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
332 %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
333 %ptrC = getelementptr inbounds float, float* %ptr, i64 12
334 %ptrE = getelementptr inbounds float, float* %ptr, i64 14
335 %ptrF = getelementptr inbounds float, float* %ptr, i64 15
336 %val0 = load float, float* %ptr0
337 %val3 = load float, float* %ptr3
338 %valC = load float, float* %ptrC
339 %valE = load float, float* %ptrE
340 %valF = load float, float* %ptrF
341 %res0 = insertelement <16 x float> undef, float %val0, i32 0
342 %res3 = insertelement <16 x float> %res0, float %val3, i32 3
343 %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
344 %res5 = insertelement <16 x float> %res4, float 0.0, i32 5
345 %resC = insertelement <16 x float> %res5, float %valC, i32 12
346 %resD = insertelement <16 x float> %resC, float 0.0, i32 13
347 %resE = insertelement <16 x float> %resD, float %valE, i32 14
348 %resF = insertelement <16 x float> %resE, float %valF, i32 15
349 ret <16 x float> %resF
; Inserts scalar i32 elements 1 and 2 of %ptr into lanes 0 and 1 and zero
; into lanes 2, 3, 4 and 15; the other lanes stay undef. The expected code
; is a single 64-bit vmovsd load.
352 define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(i32* %ptr) nounwind uwtable noinline ssp {
353 ; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
355 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
358 ; X32-AVX512F-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
359 ; X32-AVX512F: # %bb.0:
360 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
361 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
362 ; X32-AVX512F-NEXT: retl
363 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
364 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 2
365 %val0 = load i32, i32* %ptr0
366 %val1 = load i32, i32* %ptr1
367 %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
368 %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
369 %res2 = insertelement <16 x i32> %res1, i32 0, i32 2
370 %res3 = insertelement <16 x i32> %res2, i32 0, i32 3
371 %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
372 %resF = insertelement <16 x i32> %res4, i32 0, i32 15
; Inserts scalar i32 elements 2, 3 and 5 of %ptr into lanes 0, 1 and 3; all
; other lanes stay undef. The expected code is a single 128-bit vmovups from
; byte offset 8.
376 define <16 x i32> @merge_16i32_i32_23u5uuuuuuuuuuuu(i32* %ptr) nounwind uwtable noinline ssp {
377 ; ALL-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
379 ; ALL-NEXT: vmovups 8(%rdi), %xmm0
382 ; X32-AVX512F-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
383 ; X32-AVX512F: # %bb.0:
384 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
385 ; X32-AVX512F-NEXT: vmovups 8(%eax), %xmm0
386 ; X32-AVX512F-NEXT: retl
387 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
388 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
389 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
390 %val0 = load i32, i32* %ptr0
391 %val1 = load i32, i32* %ptr1
392 %val3 = load i32, i32* %ptr3
393 %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
394 %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
395 %res3 = insertelement <16 x i32> %res1, i32 %val3, i32 3
; Inserts scalar i32 elements 0, 3, 12, 14 and 15 of %ptr into the lanes
; with the same indices; all other lanes stay undef. The expected code is a
; single 512-bit vmovups from the base pointer.
399 define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable noinline ssp {
400 ; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
402 ; ALL-NEXT: vmovups (%rdi), %zmm0
405 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
406 ; X32-AVX512F: # %bb.0:
407 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
408 ; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
409 ; X32-AVX512F-NEXT: retl
410 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
411 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
412 %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
413 %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
414 %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
415 %val0 = load i32, i32* %ptr0
416 %val3 = load i32, i32* %ptr3
417 %valC = load i32, i32* %ptrC
418 %valE = load i32, i32* %ptrE
419 %valF = load i32, i32* %ptrF
420 %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
421 %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
422 %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
423 %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
424 %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
; Inserts scalar i32 elements 0, 3, 12, 14 and 15 of %ptr into the lanes
; with the same indices and zero into lanes 4, 5 and 13. The expected code
; is one 512-bit vmovdqu64 followed by a vpandd with a memory operand
; (presumably a constant mask that clears the zeroed lanes).
; The function name has 'z' at position 11 and 'u' at position 13, whereas
; the IR below zeroes lane 13 and leaves lane 11 undef.
428 define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
429 ; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
431 ; ALL-NEXT: vmovdqu64 (%rdi), %zmm0
432 ; ALL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
435 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
436 ; X32-AVX512F: # %bb.0:
437 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
438 ; X32-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0
439 ; X32-AVX512F-NEXT: vpandd {{\.LCPI.*}}, %zmm0, %zmm0
440 ; X32-AVX512F-NEXT: retl
441 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
442 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
443 %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
444 %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
445 %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
446 %val0 = load i32, i32* %ptr0
447 %val3 = load i32, i32* %ptr3
448 %valC = load i32, i32* %ptrC
449 %valE = load i32, i32* %ptrE
450 %valF = load i32, i32* %ptrF
451 %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
452 %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
453 %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
454 %res5 = insertelement <16 x i32> %res4, i32 0, i32 5
455 %resC = insertelement <16 x i32> %res5, i32 %valC, i32 12
456 %resD = insertelement <16 x i32> %resC, i32 0, i32 13
457 %resE = insertelement <16 x i32> %resD, i32 %valE, i32 14
458 %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
; Inserts scalar i16 elements 1, 2 and 4 of %ptr into lanes 0, 1 and 3 and
; zero into lanes 30 and 31; the other lanes stay undef. Every run expects a
; single 64-bit vmovsd load; the runs without AVX-512BW additionally expect
; a vmovaps %ymm0, %ymm0 after it.
462 define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) nounwind uwtable noinline ssp {
463 ; AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
465 ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
466 ; AVX512F-NEXT: vmovaps %ymm0, %ymm0
469 ; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
471 ; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
472 ; AVX512BW-NEXT: retq
474 ; X32-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
475 ; X32-AVX512F: # %bb.0:
476 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
477 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
478 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
479 ; X32-AVX512F-NEXT: retl
480 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1
481 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2
482 %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 4
483 %val0 = load i16, i16* %ptr0
484 %val1 = load i16, i16* %ptr1
485 %val3 = load i16, i16* %ptr3
486 %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
487 %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
488 %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
489 %res30 = insertelement <32 x i16> %res3, i16 0, i16 30
490 %res31 = insertelement <32 x i16> %res30, i16 0, i16 31
491 ret <32 x i16> %res31
; Inserts scalar i16 elements 4, 5 and 7 of %ptr into lanes 0, 1 and 3; all
; other lanes stay undef. The expected code is a single 64-bit vmovsd load.
494 define <32 x i16> @merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
495 ; ALL-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
497 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
500 ; X32-AVX512F-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
501 ; X32-AVX512F: # %bb.0:
502 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
503 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
504 ; X32-AVX512F-NEXT: retl
505 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
506 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
507 %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
508 %val0 = load i16, i16* %ptr0
509 %val1 = load i16, i16* %ptr1
510 %val3 = load i16, i16* %ptr3
511 %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
512 %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
513 %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
; Inserts scalar i16 elements 2 and 3 of %ptr into lanes 0 and 1 and zero
; into lanes 3, 14, 15, 16 and 17; the other lanes stay undef. Every run
; expects a single 32-bit vmovss load; the runs without AVX-512BW
; additionally expect a vmovaps %ymm0, %ymm0 after it.
517 define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
518 ; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
520 ; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
521 ; AVX512F-NEXT: vmovaps %ymm0, %ymm0
524 ; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
526 ; AVX512BW-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
527 ; AVX512BW-NEXT: retq
529 ; X32-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
530 ; X32-AVX512F: # %bb.0:
531 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
532 ; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
533 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
534 ; X32-AVX512F-NEXT: retl
535 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
536 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
537 %val0 = load i16, i16* %ptr0
538 %val1 = load i16, i16* %ptr1
539 %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
540 %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
541 %res3 = insertelement <32 x i16> %res1, i16 0, i16 3
542 %resE = insertelement <32 x i16> %res3, i16 0, i16 14
543 %resF = insertelement <32 x i16> %resE, i16 0, i16 15
544 %resG = insertelement <32 x i16> %resF, i16 0, i16 16
545 %resH = insertelement <32 x i16> %resG, i16 0, i16 17
; Inserts scalar i8 elements 1, 2, 4 and 8 of %ptr into lanes 0, 1, 3 and 7
; and zero into lanes 14, 15, 16, 17 and 63; the other lanes stay undef.
; Every run expects a single 64-bit vmovsd load; the runs without AVX-512BW
; additionally expect a vmovaps %ymm0, %ymm0 after it.
549 define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
550 ; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
552 ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
553 ; AVX512F-NEXT: vmovaps %ymm0, %ymm0
556 ; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
558 ; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
559 ; AVX512BW-NEXT: retq
561 ; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
562 ; X32-AVX512F: # %bb.0:
563 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
564 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
565 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
566 ; X32-AVX512F-NEXT: retl
567 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
568 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
569 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
570 %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 8
571 %val0 = load i8, i8* %ptr0
572 %val1 = load i8, i8* %ptr1
573 %val3 = load i8, i8* %ptr3
574 %val7 = load i8, i8* %ptr7
575 %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
576 %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
577 %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
578 %res7 = insertelement <64 x i8> %res3, i8 %val7, i8 7
579 %res14 = insertelement <64 x i8> %res7, i8 0, i8 14
580 %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
581 %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
582 %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
583 %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
; Inserts scalar i8 elements 1, 2 and 4 of %ptr into lanes 0, 1 and 3 and
; zero into lanes 14, 15, 16, 17 and 63; the other lanes stay undef.
; Every run expects a single 32-bit vmovss load; the runs without AVX-512BW
; additionally expect a vmovaps %ymm0, %ymm0 after it.
587 define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
588 ; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
590 ; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
591 ; AVX512F-NEXT: vmovaps %ymm0, %ymm0
594 ; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
596 ; AVX512BW-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
597 ; AVX512BW-NEXT: retq
599 ; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
600 ; X32-AVX512F: # %bb.0:
601 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
602 ; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
603 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
604 ; X32-AVX512F-NEXT: retl
605 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
606 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
607 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
608 %val0 = load i8, i8* %ptr0
609 %val1 = load i8, i8* %ptr1
610 %val3 = load i8, i8* %ptr3
611 %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
612 %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
613 %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
614 %res14 = insertelement <64 x i8> %res3, i8 0, i8 14
615 %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
616 %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
617 %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
618 %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
623 ; Consecutive loads must not be combined when any (or all) of them are volatile.
; Inserts scalar doubles 2, 3 and 9 of %ptr into lanes 0, 1 and 7, with the
; load of element 2 marked volatile. The expected code keeps separate loads
; (vmovsd, vmovhps, and a vbroadcastsd from byte offset 72) rather than one
; 512-bit load.
626 define <8 x double> @merge_8f64_f64_23uuuuu9_volatile(double* %ptr) nounwind uwtable noinline ssp {
627 ; ALL-LABEL: merge_8f64_f64_23uuuuu9_volatile:
629 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
630 ; ALL-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
631 ; ALL-NEXT: vbroadcastsd 72(%rdi), %ymm1
632 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
635 ; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9_volatile:
636 ; X32-AVX512F: # %bb.0:
637 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
638 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
639 ; X32-AVX512F-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
640 ; X32-AVX512F-NEXT: vbroadcastsd 72(%eax), %ymm1
641 ; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
642 ; X32-AVX512F-NEXT: retl
643 %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
644 %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
645 %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
646 %val0 = load volatile double, double* %ptr0
647 %val1 = load double, double* %ptr1
648 %val7 = load double, double* %ptr7
649 %res0 = insertelement <8 x double> undef, double %val0, i32 0
650 %res1 = insertelement <8 x double> %res0, double %val1, i32 1
651 %res7 = insertelement <8 x double> %res1, double %val7, i32 7
652 ret <8 x double> %res7
655 define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile(i32* %ptr) nounwind uwtable noinline ssp {
656 ; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
658 ; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
659 ; ALL-NEXT: vpinsrd $3, 12(%rdi), %xmm0, %xmm0
660 ; ALL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
661 ; ALL-NEXT: vpinsrd $2, 56(%rdi), %xmm1, %xmm1
662 ; ALL-NEXT: vpinsrd $3, 60(%rdi), %xmm1, %xmm1
663 ; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
664 ; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
667 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
668 ; X32-AVX512F: # %bb.0:
669 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
670 ; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
671 ; X32-AVX512F-NEXT: vpinsrd $3, 12(%eax), %xmm0, %xmm0
672 ; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
673 ; X32-AVX512F-NEXT: vpinsrd $2, 56(%eax), %xmm1, %xmm1
674 ; X32-AVX512F-NEXT: vpinsrd $3, 60(%eax), %xmm1, %xmm1
675 ; X32-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
676 ; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
677 ; X32-AVX512F-NEXT: retl
678 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
679 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
680 %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
681 %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
682 %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
683 %val0 = load volatile i32, i32* %ptr0
684 %val3 = load volatile i32, i32* %ptr3
685 %valC = load volatile i32, i32* %ptrC
686 %valE = load volatile i32, i32* %ptrE
687 %valF = load volatile i32, i32* %ptrF
688 %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
689 %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
690 %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
691 %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
692 %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15