1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX2
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2
4 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512VL
5 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512VL
7 define <16 x i8> @BB16(i8* %ptr) nounwind uwtable readnone ssp {
9 ; X32: ## %bb.0: ## %entry
10 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
11 ; X32-NEXT: vpbroadcastb (%eax), %xmm0
15 ; X64: ## %bb.0: ## %entry
16 ; X64-NEXT: vpbroadcastb (%rdi), %xmm0
19 %q = load i8, i8* %ptr, align 4
20 %q0 = insertelement <16 x i8> undef, i8 %q, i32 0
21 %q1 = insertelement <16 x i8> %q0, i8 %q, i32 1
22 %q2 = insertelement <16 x i8> %q1, i8 %q, i32 2
23 %q3 = insertelement <16 x i8> %q2, i8 %q, i32 3
24 %q4 = insertelement <16 x i8> %q3, i8 %q, i32 4
25 %q5 = insertelement <16 x i8> %q4, i8 %q, i32 5
26 %q6 = insertelement <16 x i8> %q5, i8 %q, i32 6
27 %q7 = insertelement <16 x i8> %q6, i8 %q, i32 7
28 %q8 = insertelement <16 x i8> %q7, i8 %q, i32 8
29 %q9 = insertelement <16 x i8> %q8, i8 %q, i32 9
30 %qa = insertelement <16 x i8> %q9, i8 %q, i32 10
31 %qb = insertelement <16 x i8> %qa, i8 %q, i32 11
32 %qc = insertelement <16 x i8> %qb, i8 %q, i32 12
33 %qd = insertelement <16 x i8> %qc, i8 %q, i32 13
34 %qe = insertelement <16 x i8> %qd, i8 %q, i32 14
35 %qf = insertelement <16 x i8> %qe, i8 %q, i32 15
39 define <32 x i8> @BB32(i8* %ptr) nounwind uwtable readnone ssp {
41 ; X32: ## %bb.0: ## %entry
42 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
43 ; X32-NEXT: vpbroadcastb (%eax), %ymm0
47 ; X64: ## %bb.0: ## %entry
48 ; X64-NEXT: vpbroadcastb (%rdi), %ymm0
51 %q = load i8, i8* %ptr, align 4
52 %q0 = insertelement <32 x i8> undef, i8 %q, i32 0
53 %q1 = insertelement <32 x i8> %q0, i8 %q, i32 1
54 %q2 = insertelement <32 x i8> %q1, i8 %q, i32 2
55 %q3 = insertelement <32 x i8> %q2, i8 %q, i32 3
56 %q4 = insertelement <32 x i8> %q3, i8 %q, i32 4
57 %q5 = insertelement <32 x i8> %q4, i8 %q, i32 5
58 %q6 = insertelement <32 x i8> %q5, i8 %q, i32 6
59 %q7 = insertelement <32 x i8> %q6, i8 %q, i32 7
60 %q8 = insertelement <32 x i8> %q7, i8 %q, i32 8
61 %q9 = insertelement <32 x i8> %q8, i8 %q, i32 9
62 %qa = insertelement <32 x i8> %q9, i8 %q, i32 10
63 %qb = insertelement <32 x i8> %qa, i8 %q, i32 11
64 %qc = insertelement <32 x i8> %qb, i8 %q, i32 12
65 %qd = insertelement <32 x i8> %qc, i8 %q, i32 13
66 %qe = insertelement <32 x i8> %qd, i8 %q, i32 14
67 %qf = insertelement <32 x i8> %qe, i8 %q, i32 15
69 %q20 = insertelement <32 x i8> %qf, i8 %q, i32 16
70 %q21 = insertelement <32 x i8> %q20, i8 %q, i32 17
71 %q22 = insertelement <32 x i8> %q21, i8 %q, i32 18
72 %q23 = insertelement <32 x i8> %q22, i8 %q, i32 19
73 %q24 = insertelement <32 x i8> %q23, i8 %q, i32 20
74 %q25 = insertelement <32 x i8> %q24, i8 %q, i32 21
75 %q26 = insertelement <32 x i8> %q25, i8 %q, i32 22
76 %q27 = insertelement <32 x i8> %q26, i8 %q, i32 23
77 %q28 = insertelement <32 x i8> %q27, i8 %q, i32 24
78 %q29 = insertelement <32 x i8> %q28, i8 %q, i32 25
79 %q2a = insertelement <32 x i8> %q29, i8 %q, i32 26
80 %q2b = insertelement <32 x i8> %q2a, i8 %q, i32 27
81 %q2c = insertelement <32 x i8> %q2b, i8 %q, i32 28
82 %q2d = insertelement <32 x i8> %q2c, i8 %q, i32 29
83 %q2e = insertelement <32 x i8> %q2d, i8 %q, i32 30
84 %q2f = insertelement <32 x i8> %q2e, i8 %q, i32 31
88 define <8 x i16> @W16(i16* %ptr) nounwind uwtable readnone ssp {
90 ; X32: ## %bb.0: ## %entry
91 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
92 ; X32-NEXT: vpbroadcastw (%eax), %xmm0
96 ; X64: ## %bb.0: ## %entry
97 ; X64-NEXT: vpbroadcastw (%rdi), %xmm0
100 %q = load i16, i16* %ptr, align 4
101 %q0 = insertelement <8 x i16> undef, i16 %q, i32 0
102 %q1 = insertelement <8 x i16> %q0, i16 %q, i32 1
103 %q2 = insertelement <8 x i16> %q1, i16 %q, i32 2
104 %q3 = insertelement <8 x i16> %q2, i16 %q, i32 3
105 %q4 = insertelement <8 x i16> %q3, i16 %q, i32 4
106 %q5 = insertelement <8 x i16> %q4, i16 %q, i32 5
107 %q6 = insertelement <8 x i16> %q5, i16 %q, i32 6
108 %q7 = insertelement <8 x i16> %q6, i16 %q, i32 7
112 define <16 x i16> @WW16(i16* %ptr) nounwind uwtable readnone ssp {
114 ; X32: ## %bb.0: ## %entry
115 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
116 ; X32-NEXT: vpbroadcastw (%eax), %ymm0
120 ; X64: ## %bb.0: ## %entry
121 ; X64-NEXT: vpbroadcastw (%rdi), %ymm0
124 %q = load i16, i16* %ptr, align 4
125 %q0 = insertelement <16 x i16> undef, i16 %q, i32 0
126 %q1 = insertelement <16 x i16> %q0, i16 %q, i32 1
127 %q2 = insertelement <16 x i16> %q1, i16 %q, i32 2
128 %q3 = insertelement <16 x i16> %q2, i16 %q, i32 3
129 %q4 = insertelement <16 x i16> %q3, i16 %q, i32 4
130 %q5 = insertelement <16 x i16> %q4, i16 %q, i32 5
131 %q6 = insertelement <16 x i16> %q5, i16 %q, i32 6
132 %q7 = insertelement <16 x i16> %q6, i16 %q, i32 7
133 %q8 = insertelement <16 x i16> %q7, i16 %q, i32 8
134 %q9 = insertelement <16 x i16> %q8, i16 %q, i32 9
135 %qa = insertelement <16 x i16> %q9, i16 %q, i32 10
136 %qb = insertelement <16 x i16> %qa, i16 %q, i32 11
137 %qc = insertelement <16 x i16> %qb, i16 %q, i32 12
138 %qd = insertelement <16 x i16> %qc, i16 %q, i32 13
139 %qe = insertelement <16 x i16> %qd, i16 %q, i32 14
140 %qf = insertelement <16 x i16> %qe, i16 %q, i32 15
144 define <4 x i32> @D32(i32* %ptr) nounwind uwtable readnone ssp {
146 ; X32: ## %bb.0: ## %entry
147 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
148 ; X32-NEXT: vbroadcastss (%eax), %xmm0
152 ; X64: ## %bb.0: ## %entry
153 ; X64-NEXT: vbroadcastss (%rdi), %xmm0
156 %q = load i32, i32* %ptr, align 4
157 %q0 = insertelement <4 x i32> undef, i32 %q, i32 0
158 %q1 = insertelement <4 x i32> %q0, i32 %q, i32 1
159 %q2 = insertelement <4 x i32> %q1, i32 %q, i32 2
160 %q3 = insertelement <4 x i32> %q2, i32 %q, i32 3
164 define <8 x i32> @DD32(i32* %ptr) nounwind uwtable readnone ssp {
166 ; X32: ## %bb.0: ## %entry
167 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
168 ; X32-NEXT: vbroadcastss (%eax), %ymm0
172 ; X64: ## %bb.0: ## %entry
173 ; X64-NEXT: vbroadcastss (%rdi), %ymm0
176 %q = load i32, i32* %ptr, align 4
177 %q0 = insertelement <8 x i32> undef, i32 %q, i32 0
178 %q1 = insertelement <8 x i32> %q0, i32 %q, i32 1
179 %q2 = insertelement <8 x i32> %q1, i32 %q, i32 2
180 %q3 = insertelement <8 x i32> %q2, i32 %q, i32 3
181 %q4 = insertelement <8 x i32> %q3, i32 %q, i32 4
182 %q5 = insertelement <8 x i32> %q4, i32 %q, i32 5
183 %q6 = insertelement <8 x i32> %q5, i32 %q, i32 6
184 %q7 = insertelement <8 x i32> %q6, i32 %q, i32 7
188 define <2 x i64> @Q64(i64* %ptr) nounwind uwtable readnone ssp {
190 ; X32: ## %bb.0: ## %entry
191 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
192 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
196 ; X64: ## %bb.0: ## %entry
197 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
200 %q = load i64, i64* %ptr, align 4
201 %q0 = insertelement <2 x i64> undef, i64 %q, i32 0
202 %q1 = insertelement <2 x i64> %q0, i64 %q, i32 1
206 define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
208 ; X32: ## %bb.0: ## %entry
209 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
210 ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
211 ; X32-NEXT: vbroadcastsd %xmm0, %ymm0
215 ; X64: ## %bb.0: ## %entry
216 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
219 %q = load i64, i64* %ptr, align 4
220 %q0 = insertelement <4 x i64> undef, i64 %q, i32 0
221 %q1 = insertelement <4 x i64> %q0, i64 %q, i32 1
222 %q2 = insertelement <4 x i64> %q1, i64 %q, i32 2
223 %q3 = insertelement <4 x i64> %q2, i64 %q, i32 3
227 define <8 x i16> @broadcast_mem_v4i16_v8i16(<4 x i16>* %ptr) {
228 ; X32-LABEL: broadcast_mem_v4i16_v8i16:
230 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
231 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
234 ; X64-LABEL: broadcast_mem_v4i16_v8i16:
236 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
238 %load = load <4 x i16>, <4 x i16>* %ptr
239 %shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
243 define <16 x i16> @broadcast_mem_v4i16_v16i16(<4 x i16>* %ptr) {
244 ; X32-LABEL: broadcast_mem_v4i16_v16i16:
246 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
247 ; X32-NEXT: vbroadcastsd (%eax), %ymm0
250 ; X64-LABEL: broadcast_mem_v4i16_v16i16:
252 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
254 %load = load <4 x i16>, <4 x i16>* %ptr
255 %shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
259 ; FIXME: Pointer adjusted broadcasts
261 define <16 x i8> @load_splat_16i8_16i8_1111111111111111(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
262 ; X32-LABEL: load_splat_16i8_16i8_1111111111111111:
263 ; X32: ## %bb.0: ## %entry
264 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
265 ; X32-NEXT: vpbroadcastb 1(%eax), %xmm0
268 ; X64-LABEL: load_splat_16i8_16i8_1111111111111111:
269 ; X64: ## %bb.0: ## %entry
270 ; X64-NEXT: vpbroadcastb 1(%rdi), %xmm0
273 %ld = load <16 x i8>, <16 x i8>* %ptr
274 %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
278 define <32 x i8> @load_splat_32i8_16i8_11111111111111111111111111111111(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
279 ; X32-LABEL: load_splat_32i8_16i8_11111111111111111111111111111111:
280 ; X32: ## %bb.0: ## %entry
281 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
282 ; X32-NEXT: vpbroadcastb 1(%eax), %ymm0
285 ; X64-LABEL: load_splat_32i8_16i8_11111111111111111111111111111111:
286 ; X64: ## %bb.0: ## %entry
287 ; X64-NEXT: vpbroadcastb 1(%rdi), %ymm0
290 %ld = load <16 x i8>, <16 x i8>* %ptr
291 %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
295 define <32 x i8> @load_splat_32i8_32i8_11111111111111111111111111111111(<32 x i8>* %ptr) nounwind uwtable readnone ssp {
296 ; X32-LABEL: load_splat_32i8_32i8_11111111111111111111111111111111:
297 ; X32: ## %bb.0: ## %entry
298 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
299 ; X32-NEXT: vpbroadcastb 1(%eax), %ymm0
302 ; X64-LABEL: load_splat_32i8_32i8_11111111111111111111111111111111:
303 ; X64: ## %bb.0: ## %entry
304 ; X64-NEXT: vpbroadcastb 1(%rdi), %ymm0
307 %ld = load <32 x i8>, <32 x i8>* %ptr
308 %ret = shufflevector <32 x i8> %ld, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
312 define <8 x i16> @load_splat_8i16_8i16_11111111(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
313 ; X32-LABEL: load_splat_8i16_8i16_11111111:
314 ; X32: ## %bb.0: ## %entry
315 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
316 ; X32-NEXT: vpbroadcastw 2(%eax), %xmm0
319 ; X64-LABEL: load_splat_8i16_8i16_11111111:
320 ; X64: ## %bb.0: ## %entry
321 ; X64-NEXT: vpbroadcastw 2(%rdi), %xmm0
324 %ld = load <8 x i16>, <8 x i16>* %ptr
325 %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
329 define <16 x i16> @load_splat_16i16_8i16_1111111111111111(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
330 ; X32-LABEL: load_splat_16i16_8i16_1111111111111111:
331 ; X32: ## %bb.0: ## %entry
332 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
333 ; X32-NEXT: vpbroadcastw 2(%eax), %ymm0
336 ; X64-LABEL: load_splat_16i16_8i16_1111111111111111:
337 ; X64: ## %bb.0: ## %entry
338 ; X64-NEXT: vpbroadcastw 2(%rdi), %ymm0
341 %ld = load <8 x i16>, <8 x i16>* %ptr
342 %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
346 define <16 x i16> @load_splat_16i16_16i16_1111111111111111(<16 x i16>* %ptr) nounwind uwtable readnone ssp {
347 ; X32-LABEL: load_splat_16i16_16i16_1111111111111111:
348 ; X32: ## %bb.0: ## %entry
349 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
350 ; X32-NEXT: vpbroadcastw 2(%eax), %ymm0
353 ; X64-LABEL: load_splat_16i16_16i16_1111111111111111:
354 ; X64: ## %bb.0: ## %entry
355 ; X64-NEXT: vpbroadcastw 2(%rdi), %ymm0
358 %ld = load <16 x i16>, <16 x i16>* %ptr
359 %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
363 define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
364 ; X32-LABEL: load_splat_4i32_4i32_1111:
365 ; X32: ## %bb.0: ## %entry
366 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
367 ; X32-NEXT: vbroadcastss 4(%eax), %xmm0
370 ; X64-LABEL: load_splat_4i32_4i32_1111:
371 ; X64: ## %bb.0: ## %entry
372 ; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
375 %ld = load <4 x i32>, <4 x i32>* %ptr
376 %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
380 define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
381 ; X32-LABEL: load_splat_8i32_4i32_33333333:
382 ; X32: ## %bb.0: ## %entry
383 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
384 ; X32-NEXT: vbroadcastss 12(%eax), %ymm0
387 ; X64-LABEL: load_splat_8i32_4i32_33333333:
388 ; X64: ## %bb.0: ## %entry
389 ; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
392 %ld = load <4 x i32>, <4 x i32>* %ptr
393 %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
397 define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
398 ; X32-LABEL: load_splat_8i32_8i32_55555555:
399 ; X32: ## %bb.0: ## %entry
400 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
401 ; X32-NEXT: vbroadcastss 20(%eax), %ymm0
404 ; X64-LABEL: load_splat_8i32_8i32_55555555:
405 ; X64: ## %bb.0: ## %entry
406 ; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
409 %ld = load <8 x i32>, <8 x i32>* %ptr
410 %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
414 define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
415 ; X32-LABEL: load_splat_4f32_4f32_1111:
416 ; X32: ## %bb.0: ## %entry
417 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
418 ; X32-NEXT: vbroadcastss 4(%eax), %xmm0
421 ; X64-LABEL: load_splat_4f32_4f32_1111:
422 ; X64: ## %bb.0: ## %entry
423 ; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
426 %ld = load <4 x float>, <4 x float>* %ptr
427 %ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
431 define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
432 ; X32-LABEL: load_splat_8f32_4f32_33333333:
433 ; X32: ## %bb.0: ## %entry
434 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
435 ; X32-NEXT: vbroadcastss 12(%eax), %ymm0
438 ; X64-LABEL: load_splat_8f32_4f32_33333333:
439 ; X64: ## %bb.0: ## %entry
440 ; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
443 %ld = load <4 x float>, <4 x float>* %ptr
444 %ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
448 define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
449 ; X32-LABEL: load_splat_8f32_8f32_55555555:
450 ; X32: ## %bb.0: ## %entry
451 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
452 ; X32-NEXT: vbroadcastss 20(%eax), %ymm0
455 ; X64-LABEL: load_splat_8f32_8f32_55555555:
456 ; X64: ## %bb.0: ## %entry
457 ; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
460 %ld = load <8 x float>, <8 x float>* %ptr
461 %ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
465 define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
466 ; X32-LABEL: load_splat_2i64_2i64_1111:
467 ; X32: ## %bb.0: ## %entry
468 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
469 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
472 ; X64-LABEL: load_splat_2i64_2i64_1111:
473 ; X64: ## %bb.0: ## %entry
474 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
477 %ld = load <2 x i64>, <2 x i64>* %ptr
478 %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
482 define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
483 ; X32-LABEL: load_splat_4i64_2i64_1111:
484 ; X32: ## %bb.0: ## %entry
485 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
486 ; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
489 ; X64-LABEL: load_splat_4i64_2i64_1111:
490 ; X64: ## %bb.0: ## %entry
491 ; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
494 %ld = load <2 x i64>, <2 x i64>* %ptr
495 %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
499 define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
500 ; X32-LABEL: load_splat_4i64_4i64_2222:
501 ; X32: ## %bb.0: ## %entry
502 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
503 ; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
506 ; X64-LABEL: load_splat_4i64_4i64_2222:
507 ; X64: ## %bb.0: ## %entry
508 ; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
511 %ld = load <4 x i64>, <4 x i64>* %ptr
512 %ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
516 define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
517 ; X32-LABEL: load_splat_2f64_2f64_1111:
518 ; X32: ## %bb.0: ## %entry
519 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
520 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
523 ; X64-LABEL: load_splat_2f64_2f64_1111:
524 ; X64: ## %bb.0: ## %entry
525 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
528 %ld = load <2 x double>, <2 x double>* %ptr
529 %ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
530 ret <2 x double> %ret
533 define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
534 ; X32-LABEL: load_splat_4f64_2f64_1111:
535 ; X32: ## %bb.0: ## %entry
536 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
537 ; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
540 ; X64-LABEL: load_splat_4f64_2f64_1111:
541 ; X64: ## %bb.0: ## %entry
542 ; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
545 %ld = load <2 x double>, <2 x double>* %ptr
546 %ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
547 ret <4 x double> %ret
550 define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
551 ; X32-LABEL: load_splat_4f64_4f64_2222:
552 ; X32: ## %bb.0: ## %entry
553 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
554 ; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
557 ; X64-LABEL: load_splat_4f64_4f64_2222:
558 ; X64: ## %bb.0: ## %entry
559 ; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
562 %ld = load <4 x double>, <4 x double>* %ptr
563 %ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
564 ret <4 x double> %ret
567 ; make sure that we still don't support broadcast double into 128-bit vector
569 define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp {
571 ; X32: ## %bb.0: ## %entry
572 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
573 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
577 ; X64: ## %bb.0: ## %entry
578 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
581 %q = load double, double* %ptr, align 4
582 %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
583 %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
584 ret <2 x double> %vecinit2.i
587 define <8 x i32> @V111(<8 x i32> %in) nounwind uwtable readnone ssp {
588 ; X32-AVX2-LABEL: V111:
589 ; X32-AVX2: ## %bb.0: ## %entry
590 ; X32-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
591 ; X32-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
592 ; X32-AVX2-NEXT: retl
594 ; X64-AVX2-LABEL: V111:
595 ; X64-AVX2: ## %bb.0: ## %entry
596 ; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
597 ; X64-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
598 ; X64-AVX2-NEXT: retq
600 ; X32-AVX512VL-LABEL: V111:
601 ; X32-AVX512VL: ## %bb.0: ## %entry
602 ; X32-AVX512VL-NEXT: vpaddd LCPI29_0{1to8}, %ymm0, %ymm0
603 ; X32-AVX512VL-NEXT: retl
605 ; X64-AVX512VL-LABEL: V111:
606 ; X64-AVX512VL: ## %bb.0: ## %entry
607 ; X64-AVX512VL-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
608 ; X64-AVX512VL-NEXT: retq
610 %g = add <8 x i32> %in, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
614 define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
615 ; X32-AVX2-LABEL: V113:
616 ; X32-AVX2: ## %bb.0: ## %entry
617 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
618 ; X32-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
619 ; X32-AVX2-NEXT: retl
621 ; X64-AVX2-LABEL: V113:
622 ; X64-AVX2: ## %bb.0: ## %entry
623 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
624 ; X64-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
625 ; X64-AVX2-NEXT: retq
627 ; X32-AVX512VL-LABEL: V113:
628 ; X32-AVX512VL: ## %bb.0: ## %entry
629 ; X32-AVX512VL-NEXT: vaddps LCPI30_0{1to8}, %ymm0, %ymm0
630 ; X32-AVX512VL-NEXT: retl
632 ; X64-AVX512VL-LABEL: V113:
633 ; X64-AVX512VL: ## %bb.0: ## %entry
634 ; X64-AVX512VL-NEXT: vaddps {{.*}}(%rip){1to8}, %ymm0, %ymm0
635 ; X64-AVX512VL-NEXT: retq
637 %g = fadd <8 x float> %in, <float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000>
641 define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
644 ; X32-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
649 ; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
651 %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
652 %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
653 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2
654 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3
655 ret <4 x float> %vecinit6.i
658 define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
661 ; X32-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52]
666 ; X64-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52]
668 %vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0
669 %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
670 %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 52, i32 2
671 %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 52, i32 3
672 %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 4
673 %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 5
674 %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 6
675 %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 7
676 ret <8 x i8> %vecinit7.i
679 define void @crash() nounwind alwaysinline {
681 ; X32: ## %bb.0: ## %WGLoopsEntry
682 ; X32-NEXT: xorl %eax, %eax
683 ; X32-NEXT: testb %al, %al
684 ; X32-NEXT: je LBB33_1
685 ; X32-NEXT: ## %bb.2: ## %ret
687 ; X32-NEXT: .p2align 4, 0x90
688 ; X32-NEXT: LBB33_1: ## %footer329VF
689 ; X32-NEXT: ## =>This Inner Loop Header: Depth=1
690 ; X32-NEXT: jmp LBB33_1
693 ; X64: ## %bb.0: ## %WGLoopsEntry
694 ; X64-NEXT: xorl %eax, %eax
695 ; X64-NEXT: testb %al, %al
696 ; X64-NEXT: je LBB33_1
697 ; X64-NEXT: ## %bb.2: ## %ret
699 ; X64-NEXT: .p2align 4, 0x90
700 ; X64-NEXT: LBB33_1: ## %footer329VF
701 ; X64-NEXT: ## =>This Inner Loop Header: Depth=1
702 ; X64-NEXT: jmp LBB33_1
704 br i1 undef, label %ret, label %footer329VF
707 %A.0.inVF = fmul float undef, 6.553600e+04
708 %B.0.in407VF = fmul <8 x float> undef, <float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04>
709 %A.0VF = fptosi float %A.0.inVF to i32
710 %B.0408VF = fptosi <8 x float> %B.0.in407VF to <8 x i32>
711 %0 = and <8 x i32> %B.0408VF, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
712 %1 = and i32 %A.0VF, 65535
713 %temp1098VF = insertelement <8 x i32> undef, i32 %1, i32 0
714 %vector1099VF = shufflevector <8 x i32> %temp1098VF, <8 x i32> undef, <8 x i32> zeroinitializer
715 br i1 undef, label %preload1201VF, label %footer349VF
718 br label %footer349VF
721 %2 = mul nsw <8 x i32> undef, %0
722 %3 = mul nsw <8 x i32> undef, %vector1099VF
723 br label %footer329VF
729 define <8 x i32> @_inreg0(i32 %scalar) nounwind uwtable readnone ssp {
730 ; X32-LABEL: _inreg0:
732 ; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
735 ; X64-AVX2-LABEL: _inreg0:
736 ; X64-AVX2: ## %bb.0:
737 ; X64-AVX2-NEXT: vmovd %edi, %xmm0
738 ; X64-AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
739 ; X64-AVX2-NEXT: retq
741 ; X64-AVX512VL-LABEL: _inreg0:
742 ; X64-AVX512VL: ## %bb.0:
743 ; X64-AVX512VL-NEXT: vpbroadcastd %edi, %ymm0
744 ; X64-AVX512VL-NEXT: retq
745 %in = insertelement <8 x i32> undef, i32 %scalar, i32 0
746 %wide = shufflevector <8 x i32> %in, <8 x i32> undef, <8 x i32> zeroinitializer
750 define <8 x float> @_inreg1(float %scalar) nounwind uwtable readnone ssp {
751 ; X32-LABEL: _inreg1:
753 ; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
756 ; X64-LABEL: _inreg1:
758 ; X64-NEXT: vbroadcastss %xmm0, %ymm0
760 %in = insertelement <8 x float> undef, float %scalar, i32 0
761 %wide = shufflevector <8 x float> %in, <8 x float> undef, <8 x i32> zeroinitializer
762 ret <8 x float> %wide
765 define <4 x float> @_inreg2(float %scalar) nounwind uwtable readnone ssp {
766 ; X32-LABEL: _inreg2:
768 ; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0
771 ; X64-LABEL: _inreg2:
773 ; X64-NEXT: vbroadcastss %xmm0, %xmm0
775 %in = insertelement <4 x float> undef, float %scalar, i32 0
776 %wide = shufflevector <4 x float> %in, <4 x float> undef, <4 x i32> zeroinitializer
777 ret <4 x float> %wide
780 define <4 x double> @_inreg3(double %scalar) nounwind uwtable readnone ssp {
781 ; X32-LABEL: _inreg3:
783 ; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
786 ; X64-LABEL: _inreg3:
788 ; X64-NEXT: vbroadcastsd %xmm0, %ymm0
790 %in = insertelement <4 x double> undef, double %scalar, i32 0
791 %wide = shufflevector <4 x double> %in, <4 x double> undef, <4 x i32> zeroinitializer
792 ret <4 x double> %wide
795 define <8 x float> @_inreg8xfloat(<8 x float> %a) {
796 ; X32-LABEL: _inreg8xfloat:
798 ; X32-NEXT: vbroadcastss %xmm0, %ymm0
801 ; X64-LABEL: _inreg8xfloat:
803 ; X64-NEXT: vbroadcastss %xmm0, %ymm0
805 %b = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> zeroinitializer
809 define <4 x float> @_inreg4xfloat(<4 x float> %a) {
810 ; X32-LABEL: _inreg4xfloat:
812 ; X32-NEXT: vbroadcastss %xmm0, %xmm0
815 ; X64-LABEL: _inreg4xfloat:
817 ; X64-NEXT: vbroadcastss %xmm0, %xmm0
819 %b = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
823 define <16 x i16> @_inreg16xi16(<16 x i16> %a) {
824 ; X32-LABEL: _inreg16xi16:
826 ; X32-NEXT: vpbroadcastw %xmm0, %ymm0
829 ; X64-LABEL: _inreg16xi16:
831 ; X64-NEXT: vpbroadcastw %xmm0, %ymm0
833 %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> zeroinitializer
837 define <8 x i16> @_inreg8xi16(<8 x i16> %a) {
838 ; X32-LABEL: _inreg8xi16:
840 ; X32-NEXT: vpbroadcastw %xmm0, %xmm0
843 ; X64-LABEL: _inreg8xi16:
845 ; X64-NEXT: vpbroadcastw %xmm0, %xmm0
847 %b = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> zeroinitializer
851 define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
852 ; X32-LABEL: _inreg4xi64:
854 ; X32-NEXT: vbroadcastsd %xmm0, %ymm0
857 ; X64-LABEL: _inreg4xi64:
859 ; X64-NEXT: vbroadcastsd %xmm0, %ymm0
861 %b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
865 define <2 x i64> @_inreg2xi64(<2 x i64> %a) {
866 ; X32-LABEL: _inreg2xi64:
868 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
871 ; X64-LABEL: _inreg2xi64:
873 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
875 %b = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> zeroinitializer
879 define <4 x double> @_inreg4xdouble(<4 x double> %a) {
880 ; X32-LABEL: _inreg4xdouble:
882 ; X32-NEXT: vbroadcastsd %xmm0, %ymm0
885 ; X64-LABEL: _inreg4xdouble:
887 ; X64-NEXT: vbroadcastsd %xmm0, %ymm0
889 %b = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> zeroinitializer
893 define <2 x double> @_inreg2xdouble(<2 x double> %a) {
894 ; X32-LABEL: _inreg2xdouble:
896 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
899 ; X64-LABEL: _inreg2xdouble:
901 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
903 %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
907 define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
908 ; X32-LABEL: _inreg8xi32:
910 ; X32-NEXT: vbroadcastss %xmm0, %ymm0
913 ; X64-LABEL: _inreg8xi32:
915 ; X64-NEXT: vbroadcastss %xmm0, %ymm0
917 %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
921 define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
922 ; X32-LABEL: _inreg4xi32:
924 ; X32-NEXT: vbroadcastss %xmm0, %xmm0
927 ; X64-LABEL: _inreg4xi32:
929 ; X64-NEXT: vbroadcastss %xmm0, %xmm0
931 %b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
935 define <32 x i8> @_inreg32xi8(<32 x i8> %a) {
936 ; X32-LABEL: _inreg32xi8:
938 ; X32-NEXT: vpbroadcastb %xmm0, %ymm0
941 ; X64-LABEL: _inreg32xi8:
943 ; X64-NEXT: vpbroadcastb %xmm0, %ymm0
945 %b = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
949 define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
950 ; X32-LABEL: _inreg16xi8:
952 ; X32-NEXT: vpbroadcastb %xmm0, %xmm0
955 ; X64-LABEL: _inreg16xi8:
957 ; X64-NEXT: vpbroadcastb %xmm0, %xmm0
959 %b = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
963 ; These tests check that a vbroadcast instruction is used when we have a splat
964 ; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
965 ; (via the insertelements).
967 define <8 x float> @splat_concat1(float %f) {
968 ; X32-LABEL: splat_concat1:
970 ; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
973 ; X64-LABEL: splat_concat1:
975 ; X64-NEXT: vbroadcastss %xmm0, %ymm0
977 %1 = insertelement <4 x float> undef, float %f, i32 0
978 %2 = insertelement <4 x float> %1, float %f, i32 1
979 %3 = insertelement <4 x float> %2, float %f, i32 2
980 %4 = insertelement <4 x float> %3, float %f, i32 3
981 %5 = shufflevector <4 x float> %4, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
985 define <8 x float> @splat_concat2(float %f) {
986 ; X32-LABEL: splat_concat2:
988 ; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
991 ; X64-LABEL: splat_concat2:
993 ; X64-NEXT: vbroadcastss %xmm0, %ymm0
995 %1 = insertelement <4 x float> undef, float %f, i32 0
996 %2 = insertelement <4 x float> %1, float %f, i32 1
997 %3 = insertelement <4 x float> %2, float %f, i32 2
998 %4 = insertelement <4 x float> %3, float %f, i32 3
999 %5 = insertelement <4 x float> undef, float %f, i32 0
1000 %6 = insertelement <4 x float> %5, float %f, i32 1
1001 %7 = insertelement <4 x float> %6, float %f, i32 2
1002 %8 = insertelement <4 x float> %7, float %f, i32 3
1003 %9 = shufflevector <4 x float> %4, <4 x float> %8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1007 define <4 x double> @splat_concat3(double %d) {
1008 ; X32-LABEL: splat_concat3:
1010 ; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
1013 ; X64-LABEL: splat_concat3:
1015 ; X64-NEXT: vbroadcastsd %xmm0, %ymm0
1017 %1 = insertelement <2 x double> undef, double %d, i32 0
1018 %2 = insertelement <2 x double> %1, double %d, i32 1
1019 %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1023 define <4 x double> @splat_concat4(double %d) {
1024 ; X32-LABEL: splat_concat4:
1026 ; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
1029 ; X64-LABEL: splat_concat4:
1031 ; X64-NEXT: vbroadcastsd %xmm0, %ymm0
1033 %1 = insertelement <2 x double> undef, double %d, i32 0
1034 %2 = insertelement <2 x double> %1, double %d, i32 1
1035 %3 = insertelement <2 x double> undef, double %d, i32 0
1036 %4 = insertelement <2 x double> %3, double %d, i32 1
1037 %5 = shufflevector <2 x double> %2, <2 x double> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1041 ; Test cases for <rdar://problem/16074331>.
1042 ; Instruction selection for broadcast instruction fails if
1043 ; the load cannot be folded into the broadcast.
1044 ; This happens if the load initially has one use but other uses are
1045 ; created later, or if selection DAG cannot prove that folding the
1046 ; load will not create a cycle in the DAG.
1047 ; These test cases exercise the latter.
1049 define void @isel_crash_16b(i8* %cV_R.addr) {
1050 ; X32-LABEL: isel_crash_16b:
1051 ; X32: ## %bb.0: ## %eintry
1052 ; X32-NEXT: subl $60, %esp
1053 ; X32-NEXT: .cfi_def_cfa_offset 64
1054 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1055 ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
1056 ; X32-NEXT: vmovaps %xmm0, (%esp)
1057 ; X32-NEXT: vpbroadcastb (%eax), %xmm1
1058 ; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
1059 ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
1060 ; X32-NEXT: addl $60, %esp
1063 ; X64-LABEL: isel_crash_16b:
1064 ; X64: ## %bb.0: ## %eintry
1065 ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
1066 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1067 ; X64-NEXT: vpbroadcastb (%rdi), %xmm1
1068 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1069 ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
1072 %__a.addr.i = alloca <2 x i64>, align 16
1073 %__b.addr.i = alloca <2 x i64>, align 16
1074 %vCr = alloca <2 x i64>, align 16
1075 store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
1076 %tmp = load <2 x i64>, <2 x i64>* %vCr, align 16
1077 %tmp2 = load i8, i8* %cV_R.addr, align 4
1078 %splat.splatinsert = insertelement <16 x i8> undef, i8 %tmp2, i32 0
1079 %splat.splat = shufflevector <16 x i8> %splat.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
1080 %tmp3 = bitcast <16 x i8> %splat.splat to <2 x i64>
1081 store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
1082 store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
1086 define void @isel_crash_32b(i8* %cV_R.addr) {
1087 ; X32-LABEL: isel_crash_32b:
1088 ; X32: ## %bb.0: ## %eintry
1089 ; X32-NEXT: pushl %ebp
1090 ; X32-NEXT: .cfi_def_cfa_offset 8
1091 ; X32-NEXT: .cfi_offset %ebp, -8
1092 ; X32-NEXT: movl %esp, %ebp
1093 ; X32-NEXT: .cfi_def_cfa_register %ebp
1094 ; X32-NEXT: andl $-32, %esp
1095 ; X32-NEXT: subl $128, %esp
1096 ; X32-NEXT: movl 8(%ebp), %eax
1097 ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
1098 ; X32-NEXT: vmovaps %ymm0, (%esp)
1099 ; X32-NEXT: vpbroadcastb (%eax), %ymm1
1100 ; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
1101 ; X32-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
1102 ; X32-NEXT: movl %ebp, %esp
1103 ; X32-NEXT: popl %ebp
1104 ; X32-NEXT: vzeroupper
1107 ; X64-LABEL: isel_crash_32b:
1108 ; X64: ## %bb.0: ## %eintry
1109 ; X64-NEXT: pushq %rbp
1110 ; X64-NEXT: .cfi_def_cfa_offset 16
1111 ; X64-NEXT: .cfi_offset %rbp, -16
1112 ; X64-NEXT: movq %rsp, %rbp
1113 ; X64-NEXT: .cfi_def_cfa_register %rbp
1114 ; X64-NEXT: andq $-32, %rsp
1115 ; X64-NEXT: subq $128, %rsp
1116 ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
1117 ; X64-NEXT: vmovaps %ymm0, (%rsp)
1118 ; X64-NEXT: vpbroadcastb (%rdi), %ymm1
1119 ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
1120 ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
1121 ; X64-NEXT: movq %rbp, %rsp
1122 ; X64-NEXT: popq %rbp
1123 ; X64-NEXT: vzeroupper
1126 %__a.addr.i = alloca <4 x i64>, align 16
1127 %__b.addr.i = alloca <4 x i64>, align 16
1128 %vCr = alloca <4 x i64>, align 16
1129 store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
1130 %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
1131 %tmp2 = load i8, i8* %cV_R.addr, align 4
1132 %splat.splatinsert = insertelement <32 x i8> undef, i8 %tmp2, i32 0
1133 %splat.splat = shufflevector <32 x i8> %splat.splatinsert, <32 x i8> undef, <32 x i32> zeroinitializer
1134 %tmp3 = bitcast <32 x i8> %splat.splat to <4 x i64>
1135 store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
1136 store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
1140 define void @isel_crash_8w(i16* %cV_R.addr) {
1141 ; X32-LABEL: isel_crash_8w:
1142 ; X32: ## %bb.0: ## %entry
1143 ; X32-NEXT: subl $60, %esp
1144 ; X32-NEXT: .cfi_def_cfa_offset 64
1145 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1146 ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
1147 ; X32-NEXT: vmovaps %xmm0, (%esp)
1148 ; X32-NEXT: vpbroadcastw (%eax), %xmm1
1149 ; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
1150 ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
1151 ; X32-NEXT: addl $60, %esp
1154 ; X64-LABEL: isel_crash_8w:
1155 ; X64: ## %bb.0: ## %entry
1156 ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
1157 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1158 ; X64-NEXT: vpbroadcastw (%rdi), %xmm1
1159 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1160 ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
1163 %__a.addr.i = alloca <2 x i64>, align 16
1164 %__b.addr.i = alloca <2 x i64>, align 16
1165 %vCr = alloca <2 x i64>, align 16
1166 store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
1167 %tmp = load <2 x i64>, <2 x i64>* %vCr, align 16
1168 %tmp2 = load i16, i16* %cV_R.addr, align 4
1169 %splat.splatinsert = insertelement <8 x i16> undef, i16 %tmp2, i32 0
1170 %splat.splat = shufflevector <8 x i16> %splat.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
1171 %tmp3 = bitcast <8 x i16> %splat.splat to <2 x i64>
1172 store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
1173 store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
1177 define void @isel_crash_16w(i16* %cV_R.addr) {
1178 ; X32-LABEL: isel_crash_16w:
1179 ; X32: ## %bb.0: ## %eintry
1180 ; X32-NEXT: pushl %ebp
1181 ; X32-NEXT: .cfi_def_cfa_offset 8
1182 ; X32-NEXT: .cfi_offset %ebp, -8
1183 ; X32-NEXT: movl %esp, %ebp
1184 ; X32-NEXT: .cfi_def_cfa_register %ebp
1185 ; X32-NEXT: andl $-32, %esp
1186 ; X32-NEXT: subl $128, %esp
1187 ; X32-NEXT: movl 8(%ebp), %eax
1188 ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
1189 ; X32-NEXT: vmovaps %ymm0, (%esp)
1190 ; X32-NEXT: vpbroadcastw (%eax), %ymm1
1191 ; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
1192 ; X32-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
1193 ; X32-NEXT: movl %ebp, %esp
1194 ; X32-NEXT: popl %ebp
1195 ; X32-NEXT: vzeroupper
1198 ; X64-LABEL: isel_crash_16w:
1199 ; X64: ## %bb.0: ## %eintry
1200 ; X64-NEXT: pushq %rbp
1201 ; X64-NEXT: .cfi_def_cfa_offset 16
1202 ; X64-NEXT: .cfi_offset %rbp, -16
1203 ; X64-NEXT: movq %rsp, %rbp
1204 ; X64-NEXT: .cfi_def_cfa_register %rbp
1205 ; X64-NEXT: andq $-32, %rsp
1206 ; X64-NEXT: subq $128, %rsp
1207 ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
1208 ; X64-NEXT: vmovaps %ymm0, (%rsp)
1209 ; X64-NEXT: vpbroadcastw (%rdi), %ymm1
1210 ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
1211 ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
1212 ; X64-NEXT: movq %rbp, %rsp
1213 ; X64-NEXT: popq %rbp
1214 ; X64-NEXT: vzeroupper
1217 %__a.addr.i = alloca <4 x i64>, align 16
1218 %__b.addr.i = alloca <4 x i64>, align 16
1219 %vCr = alloca <4 x i64>, align 16
1220 store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
1221 %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
1222 %tmp2 = load i16, i16* %cV_R.addr, align 4
1223 %splat.splatinsert = insertelement <16 x i16> undef, i16 %tmp2, i32 0
1224 %splat.splat = shufflevector <16 x i16> %splat.splatinsert, <16 x i16> undef, <16 x i32> zeroinitializer
1225 %tmp3 = bitcast <16 x i16> %splat.splat to <4 x i64>
1226 store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
1227 store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
1231 define void @isel_crash_4d(i32* %cV_R.addr) {
1232 ; X32-LABEL: isel_crash_4d:
1233 ; X32: ## %bb.0: ## %entry
1234 ; X32-NEXT: subl $60, %esp
1235 ; X32-NEXT: .cfi_def_cfa_offset 64
1236 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1237 ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
1238 ; X32-NEXT: vmovaps %xmm0, (%esp)
1239 ; X32-NEXT: vbroadcastss (%eax), %xmm1
1240 ; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
1241 ; X32-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
1242 ; X32-NEXT: addl $60, %esp
1245 ; X64-LABEL: isel_crash_4d:
1246 ; X64: ## %bb.0: ## %entry
1247 ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
1248 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1249 ; X64-NEXT: vbroadcastss (%rdi), %xmm1
1250 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1251 ; X64-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
1254 %__a.addr.i = alloca <2 x i64>, align 16
1255 %__b.addr.i = alloca <2 x i64>, align 16
1256 %vCr = alloca <2 x i64>, align 16
1257 store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
1258 %tmp = load <2 x i64>, <2 x i64>* %vCr, align 16
1259 %tmp2 = load i32, i32* %cV_R.addr, align 4
1260 %splat.splatinsert = insertelement <4 x i32> undef, i32 %tmp2, i32 0
1261 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1262 %tmp3 = bitcast <4 x i32> %splat.splat to <2 x i64>
1263 store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
1264 store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
1268 define void @isel_crash_8d(i32* %cV_R.addr) {
1269 ; X32-LABEL: isel_crash_8d:
1270 ; X32: ## %bb.0: ## %eintry
1271 ; X32-NEXT: pushl %ebp
1272 ; X32-NEXT: .cfi_def_cfa_offset 8
1273 ; X32-NEXT: .cfi_offset %ebp, -8
1274 ; X32-NEXT: movl %esp, %ebp
1275 ; X32-NEXT: .cfi_def_cfa_register %ebp
1276 ; X32-NEXT: andl $-32, %esp
1277 ; X32-NEXT: subl $128, %esp
1278 ; X32-NEXT: movl 8(%ebp), %eax
1279 ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
1280 ; X32-NEXT: vmovaps %ymm0, (%esp)
1281 ; X32-NEXT: vbroadcastss (%eax), %ymm1
1282 ; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
1283 ; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
1284 ; X32-NEXT: movl %ebp, %esp
1285 ; X32-NEXT: popl %ebp
1286 ; X32-NEXT: vzeroupper
1289 ; X64-LABEL: isel_crash_8d:
1290 ; X64: ## %bb.0: ## %eintry
1291 ; X64-NEXT: pushq %rbp
1292 ; X64-NEXT: .cfi_def_cfa_offset 16
1293 ; X64-NEXT: .cfi_offset %rbp, -16
1294 ; X64-NEXT: movq %rsp, %rbp
1295 ; X64-NEXT: .cfi_def_cfa_register %rbp
1296 ; X64-NEXT: andq $-32, %rsp
1297 ; X64-NEXT: subq $128, %rsp
1298 ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
1299 ; X64-NEXT: vmovaps %ymm0, (%rsp)
1300 ; X64-NEXT: vbroadcastss (%rdi), %ymm1
1301 ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
1302 ; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
1303 ; X64-NEXT: movq %rbp, %rsp
1304 ; X64-NEXT: popq %rbp
1305 ; X64-NEXT: vzeroupper
1308 %__a.addr.i = alloca <4 x i64>, align 16
1309 %__b.addr.i = alloca <4 x i64>, align 16
1310 %vCr = alloca <4 x i64>, align 16
1311 store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
1312 %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
1313 %tmp2 = load i32, i32* %cV_R.addr, align 4
1314 %splat.splatinsert = insertelement <8 x i32> undef, i32 %tmp2, i32 0
1315 %splat.splat = shufflevector <8 x i32> %splat.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
1316 %tmp3 = bitcast <8 x i32> %splat.splat to <4 x i64>
1317 store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
1318 store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
1322 define void @isel_crash_2q(i64* %cV_R.addr) {
1323 ; X32-LABEL: isel_crash_2q:
1324 ; X32: ## %bb.0: ## %entry
1325 ; X32-NEXT: subl $60, %esp
1326 ; X32-NEXT: .cfi_def_cfa_offset 64
1327 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1328 ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
1329 ; X32-NEXT: vmovaps %xmm0, (%esp)
1330 ; X32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
1331 ; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
1332 ; X32-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
1333 ; X32-NEXT: addl $60, %esp
1336 ; X64-LABEL: isel_crash_2q:
1337 ; X64: ## %bb.0: ## %entry
1338 ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
1339 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1340 ; X64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
1341 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1342 ; X64-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
1345 %__a.addr.i = alloca <2 x i64>, align 16
1346 %__b.addr.i = alloca <2 x i64>, align 16
1347 %vCr = alloca <2 x i64>, align 16
1348 store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
1349 %tmp = load <2 x i64>, <2 x i64>* %vCr, align 16
1350 %tmp2 = load i64, i64* %cV_R.addr, align 4
1351 %splat.splatinsert = insertelement <2 x i64> undef, i64 %tmp2, i32 0
1352 %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
1353 store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
1354 store <2 x i64> %splat.splat, <2 x i64>* %__b.addr.i, align 16
1358 define void @isel_crash_4q(i64* %cV_R.addr) {
1359 ; X32-LABEL: isel_crash_4q:
1360 ; X32: ## %bb.0: ## %eintry
1361 ; X32-NEXT: pushl %ebp
1362 ; X32-NEXT: .cfi_def_cfa_offset 8
1363 ; X32-NEXT: .cfi_offset %ebp, -8
1364 ; X32-NEXT: movl %esp, %ebp
1365 ; X32-NEXT: .cfi_def_cfa_register %ebp
1366 ; X32-NEXT: andl $-32, %esp
1367 ; X32-NEXT: subl $128, %esp
1368 ; X32-NEXT: movl 8(%ebp), %eax
1369 ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
1370 ; X32-NEXT: vmovaps %ymm0, (%esp)
1371 ; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1372 ; X32-NEXT: vbroadcastsd %xmm1, %ymm1
1373 ; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
1374 ; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
1375 ; X32-NEXT: movl %ebp, %esp
1376 ; X32-NEXT: popl %ebp
1377 ; X32-NEXT: vzeroupper
1380 ; X64-LABEL: isel_crash_4q:
1381 ; X64: ## %bb.0: ## %eintry
1382 ; X64-NEXT: pushq %rbp
1383 ; X64-NEXT: .cfi_def_cfa_offset 16
1384 ; X64-NEXT: .cfi_offset %rbp, -16
1385 ; X64-NEXT: movq %rsp, %rbp
1386 ; X64-NEXT: .cfi_def_cfa_register %rbp
1387 ; X64-NEXT: andq $-32, %rsp
1388 ; X64-NEXT: subq $128, %rsp
1389 ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
1390 ; X64-NEXT: vmovaps %ymm0, (%rsp)
1391 ; X64-NEXT: vbroadcastsd (%rdi), %ymm1
1392 ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
1393 ; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
1394 ; X64-NEXT: movq %rbp, %rsp
1395 ; X64-NEXT: popq %rbp
1396 ; X64-NEXT: vzeroupper
1399 %__a.addr.i = alloca <4 x i64>, align 16
1400 %__b.addr.i = alloca <4 x i64>, align 16
1401 %vCr = alloca <4 x i64>, align 16
1402 store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
1403 %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
1404 %tmp2 = load i64, i64* %cV_R.addr, align 4
1405 %splat.splatinsert = insertelement <4 x i64> undef, i64 %tmp2, i32 0
1406 %splat.splat = shufflevector <4 x i64> %splat.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
1407 store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
1408 store <4 x i64> %splat.splat, <4 x i64>* %__b.addr.i, align 16