1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
5 ; Just one 32-bit run to make sure we do reasonable things.
6 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32-AVX512F
; Concatenates <2 x double> loads from vector indices 1, 2 and 4 of %ptr into
; an <8 x double>; the third 128-bit quarter is undef (first operand of %res23).
; The assertions expect the two adjacent loads to be merged into one 256-bit
; vmovups at byte offset 16, with the load at byte offset 64 inserted separately.
8 define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable noinline ssp {
9 ; ALL-LABEL: merge_8f64_2f64_12u4:
11 ; ALL-NEXT: vmovups 16(%rdi), %ymm0
12 ; ALL-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1
13 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
16 ; X32-AVX512F-LABEL: merge_8f64_2f64_12u4:
17 ; X32-AVX512F: # %bb.0:
18 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
19 ; X32-AVX512F-NEXT: vmovups 16(%eax), %ymm0
20 ; X32-AVX512F-NEXT: vinsertf128 $1, 64(%eax), %ymm0, %ymm1
21 ; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
22 ; X32-AVX512F-NEXT: retl
23 %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 1
24 %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
25 %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 4
26 %val0 = load <2 x double>, <2 x double>* %ptr0
27 %val1 = load <2 x double>, <2 x double>* %ptr1
28 %val3 = load <2 x double>, <2 x double>* %ptr3
; Low half: %val0 followed by %val1.
29 %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; High half: two undef lanes followed by %val3.
30 %res23 = shufflevector <2 x double> undef, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
31 %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Concatenates <2 x double> loads from vector indices 2, 3 and 5 of %ptr into
; an <8 x double>; the third 128-bit quarter is zero (zeroinitializer operand
; of %res23). The assertions expect one 256-bit vmovups at byte offset 32 for
; the adjacent pair, plus a zeroed register with the load at byte offset 80
; inserted into its upper 128 bits.
35 define <8 x double> @merge_8f64_2f64_23z5(<2 x double>* %ptr) nounwind uwtable noinline ssp {
36 ; ALL-LABEL: merge_8f64_2f64_23z5:
38 ; ALL-NEXT: vmovups 32(%rdi), %ymm0
39 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
40 ; ALL-NEXT: vinsertf128 $1, 80(%rdi), %ymm1, %ymm1
41 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
44 ; X32-AVX512F-LABEL: merge_8f64_2f64_23z5:
45 ; X32-AVX512F: # %bb.0:
46 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
47 ; X32-AVX512F-NEXT: vmovups 32(%eax), %ymm0
48 ; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
49 ; X32-AVX512F-NEXT: vinsertf128 $1, 80(%eax), %ymm1, %ymm1
50 ; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
51 ; X32-AVX512F-NEXT: retl
52 %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
53 %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3
54 %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 5
55 %val0 = load <2 x double>, <2 x double>* %ptr0
56 %val1 = load <2 x double>, <2 x double>* %ptr1
57 %val3 = load <2 x double>, <2 x double>* %ptr3
; Low half: %val0 followed by %val1.
58 %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; High half: two zero lanes followed by %val3.
59 %res23 = shufflevector <2 x double> zeroinitializer, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
60 %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Builds an <8 x double> whose low half is zero and whose high half is the
; <4 x double> loaded from vector index 2 of %ptr. The assertions expect a
; zeroed register and a vinsertf64x4 that reads byte offset 64 directly from
; memory.
64 define <8 x double> @merge_8f64_4f64_z2(<4 x double>* %ptr) nounwind uwtable noinline ssp {
65 ; ALL-LABEL: merge_8f64_4f64_z2:
67 ; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
68 ; ALL-NEXT: vinsertf64x4 $1, 64(%rdi), %zmm0, %zmm0
71 ; X32-AVX512F-LABEL: merge_8f64_4f64_z2:
72 ; X32-AVX512F: # %bb.0:
73 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
74 ; X32-AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
75 ; X32-AVX512F-NEXT: vinsertf64x4 $1, 64(%eax), %zmm0, %zmm0
76 ; X32-AVX512F-NEXT: retl
77 %ptr1 = getelementptr inbounds <4 x double>, <4 x double>* %ptr, i64 2
78 %val1 = load <4 x double>, <4 x double>* %ptr1
; Identity concatenation mask: lanes 0-3 from the zero vector, 4-7 from %val1.
79 %res = shufflevector <4 x double> zeroinitializer, <4 x double> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Inserts scalar doubles loaded from element indices 2, 3 and 9 of %ptr into
; lanes 0, 1 and 7 of an otherwise undef <8 x double>. The assertions expect
; the three loads to collapse into a single 512-bit vmovups at byte offset 16.
83 define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noinline ssp {
84 ; ALL-LABEL: merge_8f64_f64_23uuuuu9:
86 ; ALL-NEXT: vmovups 16(%rdi), %zmm0
89 ; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9:
90 ; X32-AVX512F: # %bb.0:
91 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
92 ; X32-AVX512F-NEXT: vmovups 16(%eax), %zmm0
93 ; X32-AVX512F-NEXT: retl
94 %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
95 %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
96 %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
97 %val0 = load double, double* %ptr0
98 %val1 = load double, double* %ptr1
99 %val7 = load double, double* %ptr7
100 %res0 = insertelement <8 x double> undef, double %val0, i32 0
101 %res1 = insertelement <8 x double> %res0, double %val1, i32 1
102 %res7 = insertelement <8 x double> %res1, double %val7, i32 7
103 ret <8 x double> %res7
; Inserts scalar doubles loaded from element indices 1 and 2 of %ptr into
; lanes 0 and 1 of an <8 x double>, writes 0.0 to lanes 2, 3, 6 and 7, and
; leaves lanes 4 and 5 undef. The assertions expect a single 128-bit vmovups
; at byte offset 8 and no further instructions before the return.
106 define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp {
107 ; ALL-LABEL: merge_8f64_f64_12zzuuzz:
109 ; ALL-NEXT: vmovups 8(%rdi), %xmm0
112 ; X32-AVX512F-LABEL: merge_8f64_f64_12zzuuzz:
113 ; X32-AVX512F: # %bb.0:
114 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
115 ; X32-AVX512F-NEXT: vmovups 8(%eax), %xmm0
116 ; X32-AVX512F-NEXT: retl
117 %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
118 %ptr1 = getelementptr inbounds double, double* %ptr, i64 2
119 %val0 = load double, double* %ptr0
120 %val1 = load double, double* %ptr1
121 %res0 = insertelement <8 x double> undef, double %val0, i32 0
122 %res1 = insertelement <8 x double> %res0, double %val1, i32 1
123 %res2 = insertelement <8 x double> %res1, double 0.0, i32 2
124 %res3 = insertelement <8 x double> %res2, double 0.0, i32 3
125 %res6 = insertelement <8 x double> %res3, double 0.0, i32 6
126 %res7 = insertelement <8 x double> %res6, double 0.0, i32 7
127 ret <8 x double> %res7
; Inserts scalar doubles loaded from element indices 1, 3, 5 and 8 of %ptr
; into lanes 0, 2, 4 and 7 of an <8 x double>, writes 0.0 to lane 5, and
; leaves lanes 1, 3 and 6 undef. The assertions expect one zero-masked 512-bit
; vmovupd at byte offset 8 whose mask is the complement of 32 (bit 5), i.e.
; only lane 5 is zeroed. The two 64-bit runs differ only in the mask move
; (kmovw without avx512bw, kmovd with it).
130 define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
131 ; AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
133 ; AVX512F-NEXT: movb $32, %al
134 ; AVX512F-NEXT: kmovw %eax, %k0
135 ; AVX512F-NEXT: knotw %k0, %k1
136 ; AVX512F-NEXT: vmovupd 8(%rdi), %zmm0 {%k1} {z}
139 ; AVX512BW-LABEL: merge_8f64_f64_1u3u5zu8:
141 ; AVX512BW-NEXT: movb $32, %al
142 ; AVX512BW-NEXT: kmovd %eax, %k0
143 ; AVX512BW-NEXT: knotw %k0, %k1
144 ; AVX512BW-NEXT: vmovupd 8(%rdi), %zmm0 {%k1} {z}
145 ; AVX512BW-NEXT: retq
147 ; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
148 ; X32-AVX512F: # %bb.0:
149 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
150 ; X32-AVX512F-NEXT: movb $32, %cl
151 ; X32-AVX512F-NEXT: kmovw %ecx, %k0
152 ; X32-AVX512F-NEXT: knotw %k0, %k1
153 ; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm0 {%k1} {z}
154 ; X32-AVX512F-NEXT: retl
155 %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
156 %ptr2 = getelementptr inbounds double, double* %ptr, i64 3
157 %ptr4 = getelementptr inbounds double, double* %ptr, i64 5
158 %ptr7 = getelementptr inbounds double, double* %ptr, i64 8
159 %val0 = load double, double* %ptr0
160 %val2 = load double, double* %ptr2
161 %val4 = load double, double* %ptr4
162 %val7 = load double, double* %ptr7
163 %res0 = insertelement <8 x double> undef, double %val0, i32 0
164 %res2 = insertelement <8 x double> %res0, double %val2, i32 2
165 %res4 = insertelement <8 x double> %res2, double %val4, i32 4
; Lane 5 is the only explicitly zeroed lane.
166 %res5 = insertelement <8 x double> %res4, double 0.0, i32 5
167 %res7 = insertelement <8 x double> %res5, double %val7, i32 7
168 ret <8 x double> %res7
; Builds an <8 x i64> whose low half is zero and whose high half is the
; <4 x i64> loaded from vector index 3 of %ptr. The assertions expect a zeroed
; register and a vinsertf64x4 that reads byte offset 96 directly from memory.
171 define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline ssp {
172 ; ALL-LABEL: merge_8i64_4i64_z3:
174 ; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
175 ; ALL-NEXT: vinsertf64x4 $1, 96(%rdi), %zmm0, %zmm0
178 ; X32-AVX512F-LABEL: merge_8i64_4i64_z3:
179 ; X32-AVX512F: # %bb.0:
180 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
181 ; X32-AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
182 ; X32-AVX512F-NEXT: vinsertf64x4 $1, 96(%eax), %zmm0, %zmm0
183 ; X32-AVX512F-NEXT: retl
184 %ptr1 = getelementptr inbounds <4 x i64>, <4 x i64>* %ptr, i64 3
185 %val1 = load <4 x i64>, <4 x i64>* %ptr1
; Identity concatenation mask: lanes 0-3 from the zero vector, 4-7 from %val1.
186 %res = shufflevector <4 x i64> zeroinitializer, <4 x i64> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Inserts i64 values loaded from element indices 5, 6 and 9 of %ptr into
; lanes 0, 1 and 4 of an <8 x i64>, writes 0 to lanes 2, 3, 6 and 7, and
; leaves lane 5 undef. The assertions expect a 128-bit vmovups at byte offset
; 40 for the adjacent pair, a vmovsd for the single remaining element, and a
; vinsertf64x4 to combine the two halves.
190 define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp {
191 ; ALL-LABEL: merge_8i64_i64_56zz9uzz:
193 ; ALL-NEXT: vmovups 40(%rdi), %xmm0
194 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
195 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
198 ; X32-AVX512F-LABEL: merge_8i64_i64_56zz9uzz:
199 ; X32-AVX512F: # %bb.0:
200 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
201 ; X32-AVX512F-NEXT: vmovups 40(%eax), %xmm0
202 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
203 ; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
204 ; X32-AVX512F-NEXT: retl
205 %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 5
206 %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 6
207 %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 9
208 %val0 = load i64, i64* %ptr0
209 %val1 = load i64, i64* %ptr1
210 %val4 = load i64, i64* %ptr4
211 %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
212 %res1 = insertelement <8 x i64> %res0, i64 %val1, i32 1
213 %res2 = insertelement <8 x i64> %res1, i64 0, i32 2
214 %res3 = insertelement <8 x i64> %res2, i64 0, i32 3
215 %res4 = insertelement <8 x i64> %res3, i64 %val4, i32 4
216 %res6 = insertelement <8 x i64> %res4, i64 0, i32 6
217 %res7 = insertelement <8 x i64> %res6, i64 0, i32 7
; Integer counterpart of the 1u3u5zu8 pattern: inserts i64 values loaded from
; element indices 1, 3, 5 and 8 of %ptr into lanes 0, 2, 4 and 7 of an
; <8 x i64>, writes 0 to lane 5, and leaves lanes 1, 3 and 6 undef. The
; assertions expect one zero-masked 512-bit vmovdqu64 at byte offset 8 whose
; mask is the complement of 32 (bit 5), i.e. only lane 5 is zeroed.
221 define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
222 ; AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
224 ; AVX512F-NEXT: movb $32, %al
225 ; AVX512F-NEXT: kmovw %eax, %k0
226 ; AVX512F-NEXT: knotw %k0, %k1
227 ; AVX512F-NEXT: vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
230 ; AVX512BW-LABEL: merge_8i64_i64_1u3u5zu8:
232 ; AVX512BW-NEXT: movb $32, %al
233 ; AVX512BW-NEXT: kmovd %eax, %k0
234 ; AVX512BW-NEXT: knotw %k0, %k1
235 ; AVX512BW-NEXT: vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
236 ; AVX512BW-NEXT: retq
238 ; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
239 ; X32-AVX512F: # %bb.0:
240 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
241 ; X32-AVX512F-NEXT: movb $32, %cl
242 ; X32-AVX512F-NEXT: kmovw %ecx, %k0
243 ; X32-AVX512F-NEXT: knotw %k0, %k1
244 ; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0 {%k1} {z}
245 ; X32-AVX512F-NEXT: retl
246 %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
247 %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
248 %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 5
249 %ptr7 = getelementptr inbounds i64, i64* %ptr, i64 8
250 %val0 = load i64, i64* %ptr0
251 %val2 = load i64, i64* %ptr2
252 %val4 = load i64, i64* %ptr4
253 %val7 = load i64, i64* %ptr7
254 %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
255 %res2 = insertelement <8 x i64> %res0, i64 %val2, i32 2
256 %res4 = insertelement <8 x i64> %res2, i64 %val4, i32 4
; Lane 5 is the only explicitly zeroed lane.
257 %res5 = insertelement <8 x i64> %res4, i64 0, i32 5
258 %res7 = insertelement <8 x i64> %res5, i64 %val7, i32 7
; Inserts floats loaded from element indices 8 and 9 of %ptr into lanes 0 and
; 1 of a <16 x float>, writes 0.0 to lanes 2, 3, 4 and 15, and leaves the
; remaining lanes undef. The assertions expect a single vmovsd (annotated
; "mem[0],zero") and no further instructions before the return.
262 define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwtable noinline ssp {
263 ; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
265 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
268 ; X32-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
269 ; X32-AVX512F: # %bb.0:
270 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
271 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
272 ; X32-AVX512F-NEXT: retl
273 %ptr0 = getelementptr inbounds float, float* %ptr, i64 8
274 %ptr1 = getelementptr inbounds float, float* %ptr, i64 9
275 %val0 = load float, float* %ptr0
276 %val1 = load float, float* %ptr1
277 %res0 = insertelement <16 x float> undef, float %val0, i32 0
278 %res1 = insertelement <16 x float> %res0, float %val1, i32 1
279 %res2 = insertelement <16 x float> %res1, float 0.0, i32 2
280 %res3 = insertelement <16 x float> %res2, float 0.0, i32 3
281 %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
282 %resF = insertelement <16 x float> %res4, float 0.0, i32 15
283 ret <16 x float> %resF
; Inserts floats loaded from element indices 4, 5 and 7 of %ptr into lanes 0,
; 1 and 3 of an otherwise undef <16 x float>. The assertions expect a single
; 128-bit vmovups at byte offset 16.
286 define <16 x float> @merge_16f32_f32_45u7uuuuuuuuuuuu(float* %ptr) nounwind uwtable noinline ssp {
287 ; ALL-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
289 ; ALL-NEXT: vmovups 16(%rdi), %xmm0
292 ; X32-AVX512F-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
293 ; X32-AVX512F: # %bb.0:
294 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
295 ; X32-AVX512F-NEXT: vmovups 16(%eax), %xmm0
296 ; X32-AVX512F-NEXT: retl
297 %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
298 %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
299 %ptr3 = getelementptr inbounds float, float* %ptr, i64 7
300 %val0 = load float, float* %ptr0
301 %val1 = load float, float* %ptr1
302 %val3 = load float, float* %ptr3
303 %res0 = insertelement <16 x float> undef, float %val0, i32 0
304 %res1 = insertelement <16 x float> %res0, float %val1, i32 1
305 %res3 = insertelement <16 x float> %res1, float %val3, i32 3
306 ret <16 x float> %res3
; Inserts floats loaded from element indices 0, 3, 12, 14 and 15 of %ptr into
; the same-numbered lanes of an otherwise undef <16 x float>. The assertions
; expect the five loads to collapse into one 512-bit vmovups from %ptr.
309 define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwtable noinline ssp {
310 ; ALL-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
312 ; ALL-NEXT: vmovups (%rdi), %zmm0
315 ; X32-AVX512F-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
316 ; X32-AVX512F: # %bb.0:
317 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
318 ; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
319 ; X32-AVX512F-NEXT: retl
320 %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
321 %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
322 %ptrC = getelementptr inbounds float, float* %ptr, i64 12
323 %ptrE = getelementptr inbounds float, float* %ptr, i64 14
324 %ptrF = getelementptr inbounds float, float* %ptr, i64 15
325 %val0 = load float, float* %ptr0
326 %val3 = load float, float* %ptr3
327 %valC = load float, float* %ptrC
328 %valE = load float, float* %ptrE
329 %valF = load float, float* %ptrF
330 %res0 = insertelement <16 x float> undef, float %val0, i32 0
331 %res3 = insertelement <16 x float> %res0, float %val3, i32 3
332 %resC = insertelement <16 x float> %res3, float %valC, i32 12
333 %resE = insertelement <16 x float> %resC, float %valE, i32 14
334 %resF = insertelement <16 x float> %resE, float %valF, i32 15
335 ret <16 x float> %resF
; Inserts floats loaded from element indices 0, 3, 12, 14 and 15 of %ptr into
; the same-numbered lanes of a <16 x float>, writes 0.0 to lanes 4, 5 and 13,
; and leaves the remaining lanes undef. The assertions expect a full 512-bit
; vmovups followed by a vpermi2ps that takes lanes 4, 5 and 13 from a zeroed
; register (shuffle indices 20, 21 and 29).
; NOTE(review): if the function name is meant to encode the lane pattern, its
; 'z' at position 11 does not match the zero inserted at lane 13 (%resD).
338 define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
339 ; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
341 ; ALL-NEXT: vmovups (%rdi), %zmm1
342 ; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
343 ; ALL-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
344 ; ALL-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
347 ; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
348 ; X32-AVX512F: # %bb.0:
349 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
350 ; X32-AVX512F-NEXT: vmovups (%eax), %zmm1
351 ; X32-AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
352 ; X32-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
353 ; X32-AVX512F-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
354 ; X32-AVX512F-NEXT: retl
355 %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
356 %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
357 %ptrC = getelementptr inbounds float, float* %ptr, i64 12
358 %ptrE = getelementptr inbounds float, float* %ptr, i64 14
359 %ptrF = getelementptr inbounds float, float* %ptr, i64 15
360 %val0 = load float, float* %ptr0
361 %val3 = load float, float* %ptr3
362 %valC = load float, float* %ptrC
363 %valE = load float, float* %ptrE
364 %valF = load float, float* %ptrF
365 %res0 = insertelement <16 x float> undef, float %val0, i32 0
366 %res3 = insertelement <16 x float> %res0, float %val3, i32 3
367 %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
368 %res5 = insertelement <16 x float> %res4, float 0.0, i32 5
369 %resC = insertelement <16 x float> %res5, float %valC, i32 12
370 %resD = insertelement <16 x float> %resC, float 0.0, i32 13
371 %resE = insertelement <16 x float> %resD, float %valE, i32 14
372 %resF = insertelement <16 x float> %resE, float %valF, i32 15
373 ret <16 x float> %resF
; Inserts i32 values loaded from element indices 1 and 2 of %ptr into lanes 0
; and 1 of a <16 x i32>, writes 0 to lanes 2, 3, 4 and 15, and leaves the
; remaining lanes undef. The assertions expect a single vmovsd (annotated
; "mem[0],zero").
376 define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(i32* %ptr) nounwind uwtable noinline ssp {
377 ; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
379 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
382 ; X32-AVX512F-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
383 ; X32-AVX512F: # %bb.0:
384 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
385 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
386 ; X32-AVX512F-NEXT: retl
387 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
388 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 2
389 %val0 = load i32, i32* %ptr0
390 %val1 = load i32, i32* %ptr1
391 %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
392 %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
393 %res2 = insertelement <16 x i32> %res1, i32 0, i32 2
394 %res3 = insertelement <16 x i32> %res2, i32 0, i32 3
395 %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
396 %resF = insertelement <16 x i32> %res4, i32 0, i32 15
; Inserts i32 values loaded from element indices 2, 3 and 5 of %ptr into
; lanes 0, 1 and 3 of an otherwise undef <16 x i32>. The assertions expect a
; single 128-bit vmovups at byte offset 8.
400 define <16 x i32> @merge_16i32_i32_23u5uuuuuuuuuuuu(i32* %ptr) nounwind uwtable noinline ssp {
401 ; ALL-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
403 ; ALL-NEXT: vmovups 8(%rdi), %xmm0
406 ; X32-AVX512F-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
407 ; X32-AVX512F: # %bb.0:
408 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
409 ; X32-AVX512F-NEXT: vmovups 8(%eax), %xmm0
410 ; X32-AVX512F-NEXT: retl
411 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
412 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
413 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
414 %val0 = load i32, i32* %ptr0
415 %val1 = load i32, i32* %ptr1
416 %val3 = load i32, i32* %ptr3
417 %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
418 %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
419 %res3 = insertelement <16 x i32> %res1, i32 %val3, i32 3
; Inserts i32 values loaded from element indices 0, 3, 12, 14 and 15 of %ptr
; into the same-numbered lanes of an otherwise undef <16 x i32>. The
; assertions expect the five loads to collapse into one 512-bit vmovups from
; %ptr.
423 define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable noinline ssp {
424 ; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
426 ; ALL-NEXT: vmovups (%rdi), %zmm0
429 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
430 ; X32-AVX512F: # %bb.0:
431 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
432 ; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
433 ; X32-AVX512F-NEXT: retl
434 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
435 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
436 %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
437 %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
438 %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
439 %val0 = load i32, i32* %ptr0
440 %val3 = load i32, i32* %ptr3
441 %valC = load i32, i32* %ptrC
442 %valE = load i32, i32* %ptrE
443 %valF = load i32, i32* %ptrF
444 %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
445 %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
446 %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
447 %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
448 %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
; Inserts i32 values loaded from element indices 0, 3, 12, 14 and 15 of %ptr
; into the same-numbered lanes of a <16 x i32>, writes 0 to lanes 4, 5 and 13,
; and leaves the remaining lanes undef. The assertions expect one zero-masked
; 512-bit vmovdqu32 from %ptr whose mask is the complement of 0x2030 (bits 4,
; 5 and 13), i.e. exactly the explicitly zeroed lanes are masked off.
; NOTE(review): if the function name is meant to encode the lane pattern, its
; 'z' at position 11 does not match the zero inserted at lane 13 (%resD).
452 define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
453 ; AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
455 ; AVX512F-NEXT: movw $8240, %ax # imm = 0x2030
456 ; AVX512F-NEXT: kmovw %eax, %k0
457 ; AVX512F-NEXT: knotw %k0, %k1
458 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
461 ; AVX512BW-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
463 ; AVX512BW-NEXT: movw $8240, %ax # imm = 0x2030
464 ; AVX512BW-NEXT: kmovd %eax, %k0
465 ; AVX512BW-NEXT: knotw %k0, %k1
466 ; AVX512BW-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
467 ; AVX512BW-NEXT: retq
469 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
470 ; X32-AVX512F: # %bb.0:
471 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
472 ; X32-AVX512F-NEXT: movw $8240, %cx # imm = 0x2030
473 ; X32-AVX512F-NEXT: kmovw %ecx, %k0
474 ; X32-AVX512F-NEXT: knotw %k0, %k1
475 ; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0 {%k1} {z}
476 ; X32-AVX512F-NEXT: retl
477 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
478 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
479 %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
480 %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
481 %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
482 %val0 = load i32, i32* %ptr0
483 %val3 = load i32, i32* %ptr3
484 %valC = load i32, i32* %ptrC
485 %valE = load i32, i32* %ptrE
486 %valF = load i32, i32* %ptrF
487 %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
488 %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
489 %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
490 %res5 = insertelement <16 x i32> %res4, i32 0, i32 5
491 %resC = insertelement <16 x i32> %res5, i32 %valC, i32 12
492 %resD = insertelement <16 x i32> %resC, i32 0, i32 13
493 %resE = insertelement <16 x i32> %resD, i32 %valE, i32 14
494 %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
; Inserts i16 values loaded from element indices 1, 2 and 4 of %ptr into
; lanes 0, 1 and 3 of a <32 x i16>, writes 0 to lanes 30 and 31, and leaves
; the remaining lanes undef. The assertions expect a single vmovsd; the run
; without avx512bw and the 32-bit run additionally zero %xmm1 with vxorps,
; while the avx512bw run returns right after the load.
498 define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) nounwind uwtable noinline ssp {
499 ; AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
501 ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
502 ; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
505 ; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
507 ; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
508 ; AVX512BW-NEXT: retq
510 ; X32-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
511 ; X32-AVX512F: # %bb.0:
512 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
513 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
514 ; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
515 ; X32-AVX512F-NEXT: retl
516 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1
517 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2
518 %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 4
519 %val0 = load i16, i16* %ptr0
520 %val1 = load i16, i16* %ptr1
521 %val3 = load i16, i16* %ptr3
522 %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
523 %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
524 %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
525 %res30 = insertelement <32 x i16> %res3, i16 0, i16 30
526 %res31 = insertelement <32 x i16> %res30, i16 0, i16 31
527 ret <32 x i16> %res31
; Inserts i16 values loaded from element indices 4, 5 and 7 of %ptr into
; lanes 0, 1 and 3 of an otherwise undef <32 x i16>. The assertions expect a
; single vmovsd (annotated "mem[0],zero") in every run.
530 define <32 x i16> @merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
531 ; ALL-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
533 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
536 ; X32-AVX512F-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
537 ; X32-AVX512F: # %bb.0:
538 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
539 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
540 ; X32-AVX512F-NEXT: retl
541 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
542 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
543 %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
544 %val0 = load i16, i16* %ptr0
545 %val1 = load i16, i16* %ptr1
546 %val3 = load i16, i16* %ptr3
547 %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
548 %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
549 %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
; Inserts i16 values loaded from element indices 2 and 3 of %ptr into lanes 0
; and 1 of a <32 x i16>, writes 0 to lanes 3, 14, 15, 16 and 17, and leaves
; the remaining lanes undef. The assertions expect a single vmovss (annotated
; "mem[0],zero,zero,zero"); the run without avx512bw and the 32-bit run
; additionally zero %xmm1 with vxorps.
553 define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
554 ; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
556 ; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
557 ; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
560 ; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
562 ; AVX512BW-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
563 ; AVX512BW-NEXT: retq
565 ; X32-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
566 ; X32-AVX512F: # %bb.0:
567 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
568 ; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
569 ; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
570 ; X32-AVX512F-NEXT: retl
571 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
572 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
573 %val0 = load i16, i16* %ptr0
574 %val1 = load i16, i16* %ptr1
575 %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
576 %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
577 %res3 = insertelement <32 x i16> %res1, i16 0, i16 3
578 %resE = insertelement <32 x i16> %res3, i16 0, i16 14
579 %resF = insertelement <32 x i16> %resE, i16 0, i16 15
580 %resG = insertelement <32 x i16> %resF, i16 0, i16 16
581 %resH = insertelement <32 x i16> %resG, i16 0, i16 17
; Inserts i8 values loaded from element indices 1, 2, 4 and 8 of %ptr into
; lanes 0, 1, 3 and 7 of a <64 x i8>, writes 0 to lanes 14, 15, 16, 17 and 63,
; and leaves the remaining lanes undef. The assertions expect a single vmovsd
; (annotated "mem[0],zero"); the run without avx512bw and the 32-bit run
; additionally zero %xmm1 with vxorps.
585 define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
586 ; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
588 ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
589 ; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
592 ; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
594 ; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
595 ; AVX512BW-NEXT: retq
597 ; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
598 ; X32-AVX512F: # %bb.0:
599 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
600 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
601 ; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
602 ; X32-AVX512F-NEXT: retl
603 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
604 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
605 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
606 %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 8
607 %val0 = load i8, i8* %ptr0
608 %val1 = load i8, i8* %ptr1
609 %val3 = load i8, i8* %ptr3
610 %val7 = load i8, i8* %ptr7
611 %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
612 %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
613 %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
614 %res7 = insertelement <64 x i8> %res3, i8 %val7, i8 7
615 %res14 = insertelement <64 x i8> %res7, i8 0, i8 14
616 %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
617 %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
618 %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
619 %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
; Inserts i8 values loaded from element indices 1, 2 and 4 of %ptr into lanes
; 0, 1 and 3 of a <64 x i8>, writes 0 to lanes 14, 15, 16, 17 and 63, and
; leaves the remaining lanes undef. The assertions expect a single vmovss
; (annotated "mem[0],zero,zero,zero"); the run without avx512bw and the
; 32-bit run additionally zero %xmm1 with vxorps.
623 define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
624 ; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
626 ; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
627 ; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
630 ; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
632 ; AVX512BW-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
633 ; AVX512BW-NEXT: retq
635 ; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
636 ; X32-AVX512F: # %bb.0:
637 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
638 ; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
639 ; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
640 ; X32-AVX512F-NEXT: retl
641 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
642 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
643 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
644 %val0 = load i8, i8* %ptr0
645 %val1 = load i8, i8* %ptr1
646 %val3 = load i8, i8* %ptr3
647 %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
648 %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
649 %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
650 %res14 = insertelement <64 x i8> %res3, i8 0, i8 14
651 %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
652 %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
653 %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
654 %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
659 ; Consecutive loads that include any volatile load must not be combined.
; Same lane pattern as merge_8f64_f64_23uuuuu9 (element indices 2, 3 and 9
; into lanes 0, 1 and 7), except that the first load (%val0) is volatile. The
; assertions expect the loads to stay separate (vmovsd, vmovhpd and a
; vbroadcastsd at byte offset 72) instead of one wide vector load.
662 define <8 x double> @merge_8f64_f64_23uuuuu9_volatile(double* %ptr) nounwind uwtable noinline ssp {
663 ; ALL-LABEL: merge_8f64_f64_23uuuuu9_volatile:
665 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
666 ; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
667 ; ALL-NEXT: vbroadcastsd 72(%rdi), %ymm1
668 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
671 ; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9_volatile:
672 ; X32-AVX512F: # %bb.0:
673 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
674 ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
675 ; X32-AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
676 ; X32-AVX512F-NEXT: vbroadcastsd 72(%eax), %ymm1
677 ; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
678 ; X32-AVX512F-NEXT: retl
679 %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
680 %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
681 %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
; Only this first load is volatile; the other two are ordinary loads.
682 %val0 = load volatile double, double* %ptr0
683 %val1 = load double, double* %ptr1
684 %val7 = load double, double* %ptr7
685 %res0 = insertelement <8 x double> undef, double %val0, i32 0
686 %res1 = insertelement <8 x double> %res0, double %val1, i32 1
687 %res7 = insertelement <8 x double> %res1, double %val7, i32 7
688 ret <8 x double> %res7
; Same lane pattern as merge_16i32_i32_0uu3uuuuuuuuCuEF (element indices 0, 3,
; 12, 14 and 15 into the same-numbered lanes), except that all five loads are
; volatile. The assertions expect each element to be loaded individually
; (vmovd / vpinsrd) and then assembled with vinserti128 / vinserti64x4, rather
; than one 512-bit load.
691 define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile(i32* %ptr) nounwind uwtable noinline ssp {
692 ; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
694 ; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
695 ; ALL-NEXT: vpinsrd $3, 12(%rdi), %xmm0, %xmm0
696 ; ALL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
697 ; ALL-NEXT: vpinsrd $2, 56(%rdi), %xmm1, %xmm1
698 ; ALL-NEXT: vpinsrd $3, 60(%rdi), %xmm1, %xmm1
699 ; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
700 ; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
703 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
704 ; X32-AVX512F: # %bb.0:
705 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
706 ; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
707 ; X32-AVX512F-NEXT: vpinsrd $3, 12(%eax), %xmm0, %xmm0
708 ; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
709 ; X32-AVX512F-NEXT: vpinsrd $2, 56(%eax), %xmm1, %xmm1
710 ; X32-AVX512F-NEXT: vpinsrd $3, 60(%eax), %xmm1, %xmm1
711 ; X32-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
712 ; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
713 ; X32-AVX512F-NEXT: retl
714 %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
715 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
716 %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
717 %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
718 %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
; Every load in this function is volatile.
719 %val0 = load volatile i32, i32* %ptr0
720 %val3 = load volatile i32, i32* %ptr3
721 %valC = load volatile i32, i32* %ptrC
722 %valE = load volatile i32, i32* %ptrE
723 %valF = load volatile i32, i32* %ptrF
724 %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
725 %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
726 %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
727 %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
728 %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15