; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2,+mmx | FileCheck %s --check-prefixes=X86,X86-AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2,+mmx | FileCheck %s --check-prefixes=X64,X64-AVX2
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512vl,+avx512dq,+mmx | FileCheck %s --check-prefixes=X86,X86-AVX512VL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl,+avx512dq,+mmx | FileCheck %s --check-prefixes=X64,X64-AVX512VL
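
; The tests below build splats in various ways (chains of insertelement, splat
; shufflevector masks, and broadcast loads) and check that each one selects a
; single AVX2/AVX512VL broadcast instruction.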
define <16 x i8> @BB16(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: BB16:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastb (%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: BB16:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpbroadcastb (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load i8, ptr %ptr, align 4
  %q0 = insertelement <16 x i8> undef, i8 %q, i32 0
  %q1 = insertelement <16 x i8> %q0, i8 %q, i32 1
  %q2 = insertelement <16 x i8> %q1, i8 %q, i32 2
  %q3 = insertelement <16 x i8> %q2, i8 %q, i32 3
  %q4 = insertelement <16 x i8> %q3, i8 %q, i32 4
  %q5 = insertelement <16 x i8> %q4, i8 %q, i32 5
  %q6 = insertelement <16 x i8> %q5, i8 %q, i32 6
  %q7 = insertelement <16 x i8> %q6, i8 %q, i32 7
  %q8 = insertelement <16 x i8> %q7, i8 %q, i32 8
  %q9 = insertelement <16 x i8> %q8, i8 %q, i32 9
  %qa = insertelement <16 x i8> %q9, i8 %q, i32 10
  %qb = insertelement <16 x i8> %qa, i8 %q, i32 11
  %qc = insertelement <16 x i8> %qb, i8 %q, i32 12
  %qd = insertelement <16 x i8> %qc, i8 %q, i32 13
  %qe = insertelement <16 x i8> %qd, i8 %q, i32 14
  %qf = insertelement <16 x i8> %qe, i8 %q, i32 15
  ret <16 x i8> %qf
}

define <32 x i8> @BB32(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: BB32:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastb (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: BB32:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpbroadcastb (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i8, ptr %ptr, align 4
  %q0 = insertelement <32 x i8> undef, i8 %q, i32 0
  %q1 = insertelement <32 x i8> %q0, i8 %q, i32 1
  %q2 = insertelement <32 x i8> %q1, i8 %q, i32 2
  %q3 = insertelement <32 x i8> %q2, i8 %q, i32 3
  %q4 = insertelement <32 x i8> %q3, i8 %q, i32 4
  %q5 = insertelement <32 x i8> %q4, i8 %q, i32 5
  %q6 = insertelement <32 x i8> %q5, i8 %q, i32 6
  %q7 = insertelement <32 x i8> %q6, i8 %q, i32 7
  %q8 = insertelement <32 x i8> %q7, i8 %q, i32 8
  %q9 = insertelement <32 x i8> %q8, i8 %q, i32 9
  %qa = insertelement <32 x i8> %q9, i8 %q, i32 10
  %qb = insertelement <32 x i8> %qa, i8 %q, i32 11
  %qc = insertelement <32 x i8> %qb, i8 %q, i32 12
  %qd = insertelement <32 x i8> %qc, i8 %q, i32 13
  %qe = insertelement <32 x i8> %qd, i8 %q, i32 14
  %qf = insertelement <32 x i8> %qe, i8 %q, i32 15

  %q20 = insertelement <32 x i8> %qf, i8 %q, i32 16
  %q21 = insertelement <32 x i8> %q20, i8 %q, i32 17
  %q22 = insertelement <32 x i8> %q21, i8 %q, i32 18
  %q23 = insertelement <32 x i8> %q22, i8 %q, i32 19
  %q24 = insertelement <32 x i8> %q23, i8 %q, i32 20
  %q25 = insertelement <32 x i8> %q24, i8 %q, i32 21
  %q26 = insertelement <32 x i8> %q25, i8 %q, i32 22
  %q27 = insertelement <32 x i8> %q26, i8 %q, i32 23
  %q28 = insertelement <32 x i8> %q27, i8 %q, i32 24
  %q29 = insertelement <32 x i8> %q28, i8 %q, i32 25
  %q2a = insertelement <32 x i8> %q29, i8 %q, i32 26
  %q2b = insertelement <32 x i8> %q2a, i8 %q, i32 27
  %q2c = insertelement <32 x i8> %q2b, i8 %q, i32 28
  %q2d = insertelement <32 x i8> %q2c, i8 %q, i32 29
  %q2e = insertelement <32 x i8> %q2d, i8 %q, i32 30
  %q2f = insertelement <32 x i8> %q2e, i8 %q, i32 31
  ret <32 x i8> %q2f
}

define <8 x i16> @W16(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: W16:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw (%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: W16:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpbroadcastw (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load i16, ptr %ptr, align 4
  %q0 = insertelement <8 x i16> undef, i16 %q, i32 0
  %q1 = insertelement <8 x i16> %q0, i16 %q, i32 1
  %q2 = insertelement <8 x i16> %q1, i16 %q, i32 2
  %q3 = insertelement <8 x i16> %q2, i16 %q, i32 3
  %q4 = insertelement <8 x i16> %q3, i16 %q, i32 4
  %q5 = insertelement <8 x i16> %q4, i16 %q, i32 5
  %q6 = insertelement <8 x i16> %q5, i16 %q, i32 6
  %q7 = insertelement <8 x i16> %q6, i16 %q, i32 7
  ret <8 x i16> %q7
}

define <16 x i16> @WW16(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: WW16:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: WW16:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpbroadcastw (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i16, ptr %ptr, align 4
  %q0 = insertelement <16 x i16> undef, i16 %q, i32 0
  %q1 = insertelement <16 x i16> %q0, i16 %q, i32 1
  %q2 = insertelement <16 x i16> %q1, i16 %q, i32 2
  %q3 = insertelement <16 x i16> %q2, i16 %q, i32 3
  %q4 = insertelement <16 x i16> %q3, i16 %q, i32 4
  %q5 = insertelement <16 x i16> %q4, i16 %q, i32 5
  %q6 = insertelement <16 x i16> %q5, i16 %q, i32 6
  %q7 = insertelement <16 x i16> %q6, i16 %q, i32 7
  %q8 = insertelement <16 x i16> %q7, i16 %q, i32 8
  %q9 = insertelement <16 x i16> %q8, i16 %q, i32 9
  %qa = insertelement <16 x i16> %q9, i16 %q, i32 10
  %qb = insertelement <16 x i16> %qa, i16 %q, i32 11
  %qc = insertelement <16 x i16> %qb, i16 %q, i32 12
  %qd = insertelement <16 x i16> %qc, i16 %q, i32 13
  %qe = insertelement <16 x i16> %qd, i16 %q, i32 14
  %qf = insertelement <16 x i16> %qe, i16 %q, i32 15
  ret <16 x i16> %qf
}

define <4 x i32> @D32(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: D32:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: D32:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load i32, ptr %ptr, align 4
  %q0 = insertelement <4 x i32> undef, i32 %q, i32 0
  %q1 = insertelement <4 x i32> %q0, i32 %q, i32 1
  %q2 = insertelement <4 x i32> %q1, i32 %q, i32 2
  %q3 = insertelement <4 x i32> %q2, i32 %q, i32 3
  ret <4 x i32> %q3
}

define <8 x i32> @DD32(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: DD32:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: DD32:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, ptr %ptr, align 4
  %q0 = insertelement <8 x i32> undef, i32 %q, i32 0
  %q1 = insertelement <8 x i32> %q0, i32 %q, i32 1
  %q2 = insertelement <8 x i32> %q1, i32 %q, i32 2
  %q3 = insertelement <8 x i32> %q2, i32 %q, i32 3
  %q4 = insertelement <8 x i32> %q3, i32 %q, i32 4
  %q5 = insertelement <8 x i32> %q4, i32 %q, i32 5
  %q6 = insertelement <8 x i32> %q5, i32 %q, i32 6
  %q7 = insertelement <8 x i32> %q6, i32 %q, i32 7
  ret <8 x i32> %q7
}

define <2 x i64> @Q64(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: Q64:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: Q64:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %q = load i64, ptr %ptr, align 4
  %q0 = insertelement <2 x i64> undef, i64 %q, i32 0
  %q1 = insertelement <2 x i64> %q0, i64 %q, i32 1
  ret <2 x i64> %q1
}

define <4 x i64> @QQ64(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: QQ64:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: QQ64:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i64, ptr %ptr, align 4
  %q0 = insertelement <4 x i64> undef, i64 %q, i32 0
  %q1 = insertelement <4 x i64> %q0, i64 %q, i32 1
  %q2 = insertelement <4 x i64> %q1, i64 %q, i32 2
  %q3 = insertelement <4 x i64> %q2, i64 %q, i32 3
  ret <4 x i64> %q3
}

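; A splat of a whole loaded 64-bit subvector can likewise be selected as a
; 64-bit broadcast (vmovddup / vbroadcastsd).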
define <8 x i16> @broadcast_mem_v4i16_v8i16(ptr %ptr) {
; X86-LABEL: broadcast_mem_v4i16_v8i16:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_mem_v4i16_v8i16:
; X64:       ## %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
  %load = load <4 x i16>, ptr %ptr
  %shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %shuf
}

define <16 x i16> @broadcast_mem_v4i16_v16i16(ptr %ptr) {
; X86-LABEL: broadcast_mem_v4i16_v16i16:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_mem_v4i16_v16i16:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %load = load <4 x i16>, ptr %ptr
  %shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i16> %shuf
}

; FIXME: Pointer adjusted broadcasts

define <16 x i8> @load_splat_16i8_16i8_1111111111111111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_16i8_16i8_1111111111111111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastb 1(%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_16i8_16i8_1111111111111111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpbroadcastb 1(%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %ld = load <16 x i8>, ptr %ptr
  %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i8> %ret
}

define <32 x i8> @load_splat_32i8_16i8_11111111111111111111111111111111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_32i8_16i8_11111111111111111111111111111111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastb 1(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_32i8_16i8_11111111111111111111111111111111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpbroadcastb 1(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <16 x i8>, ptr %ptr
  %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <32 x i8> %ret
}

define <32 x i8> @load_splat_32i8_32i8_11111111111111111111111111111111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_32i8_32i8_11111111111111111111111111111111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastb 1(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_32i8_32i8_11111111111111111111111111111111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpbroadcastb 1(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <32 x i8>, ptr %ptr
  %ret = shufflevector <32 x i8> %ld, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <32 x i8> %ret
}

define <8 x i16> @load_splat_8i16_8i16_11111111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8i16_8i16_11111111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw 2(%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_8i16_8i16_11111111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpbroadcastw 2(%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x i16>, ptr %ptr
  %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i16> %ret
}

define <16 x i16> @load_splat_16i16_8i16_1111111111111111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_16i16_8i16_1111111111111111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw 2(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_16i16_8i16_1111111111111111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpbroadcastw 2(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x i16>, ptr %ptr
  %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i16> %ret
}

define <16 x i16> @load_splat_16i16_16i16_1111111111111111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_16i16_16i16_1111111111111111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw 2(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_16i16_16i16_1111111111111111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpbroadcastw 2(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <16 x i16>, ptr %ptr
  %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i16> %ret
}

define <4 x i32> @load_splat_4i32_4i32_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4i32_4i32_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss 4(%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4i32_4i32_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 4(%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x i32>, ptr %ptr
  %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %ret
}

define <8 x i32> @load_splat_8i32_4i32_33333333(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8i32_4i32_33333333:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss 12(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 12(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x i32>, ptr %ptr
  %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x i32> %ret
}

define <8 x i32> @load_splat_8i32_8i32_55555555(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8i32_8i32_55555555:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss 20(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_8i32_8i32_55555555:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x i32>, ptr %ptr
  %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %ret
}

define <4 x float> @load_splat_4f32_4f32_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4f32_4f32_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss 4(%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4f32_4f32_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 4(%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x float>, ptr %ptr
  %ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x float> %ret
}

define <8 x float> @load_splat_8f32_4f32_33333333(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8f32_4f32_33333333:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss 12(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_8f32_4f32_33333333:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 12(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x float>, ptr %ptr
  %ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x float> %ret
}

define <8 x float> @load_splat_8f32_8f32_55555555(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8f32_8f32_55555555:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss 20(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_8f32_8f32_55555555:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x float>, ptr %ptr
  %ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x float> %ret
}

define <2 x i64> @load_splat_2i64_2i64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_2i64_2i64_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %ld = load <2 x i64>, ptr %ptr
  %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %ret
}

define <4 x i64> @load_splat_4i64_2i64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4i64_2i64_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd 8(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <2 x i64>, ptr %ptr
  %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i64> %ret
}

define <4 x i64> @load_splat_4i64_4i64_2222(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4i64_4i64_2222:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd 16(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4i64_4i64_2222:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x i64>, ptr %ptr
  %ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i64> %ret
}

define <2 x double> @load_splat_2f64_2f64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_2f64_2f64_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %ld = load <2 x double>, ptr %ptr
  %ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %ret
}

define <4 x double> @load_splat_4f64_2f64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4f64_2f64_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd 8(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4f64_2f64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <2 x double>, ptr %ptr
  %ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x double> %ret
}

define <4 x double> @load_splat_4f64_4f64_2222(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4f64_4f64_2222:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd 16(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4f64_4f64_2222:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x double>, ptr %ptr
  %ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x double> %ret
}

; Make sure that we still don't support a broadcast of a double into a 128-bit vector.

define <2 x double> @I(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: I:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: I:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %q = load double, ptr %ptr, align 4
  %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
  ret <2 x double> %vecinit2.i
}

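; For splatted constants, AVX2 materializes the splat with a broadcast from the
; constant pool, while AVX512VL folds it directly as an embedded broadcast
; ({1to8}) operand.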
define <8 x i32> @V111(<8 x i32> %in) nounwind uwtable readnone ssp {
; X86-AVX2-LABEL: V111:
; X86-AVX2:       ## %bb.0: ## %entry
; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; X86-AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
;
; X64-AVX2-LABEL: V111:
; X64-AVX2:       ## %bb.0: ## %entry
; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; X64-AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X86-AVX512VL-LABEL: V111:
; X86-AVX512VL:       ## %bb.0: ## %entry
; X86-AVX512VL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-AVX512VL-NEXT:    retl
;
; X64-AVX512VL-LABEL: V111:
; X64-AVX512VL:       ## %bb.0: ## %entry
; X64-AVX512VL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512VL-NEXT:    retq
entry:
  %g = add <8 x i32> %in, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i32> %g
}

define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
; X86-AVX2-LABEL: V113:
; X86-AVX2:       ## %bb.0: ## %entry
; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X86-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
;
; X64-AVX2-LABEL: V113:
; X64-AVX2:       ## %bb.0: ## %entry
; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X64-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X86-AVX512VL-LABEL: V113:
; X86-AVX512VL:       ## %bb.0: ## %entry
; X86-AVX512VL-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-AVX512VL-NEXT:    retl
;
; X64-AVX512VL-LABEL: V113:
; X64-AVX512VL:       ## %bb.0: ## %entry
; X64-AVX512VL-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512VL-NEXT:    retq
entry:
  %g = fadd <8 x float> %in, <float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000>
  ret <8 x float> %g
}

define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: _e2:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X86-NEXT:    retl
;
; X64-LABEL: _e2:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X64-NEXT:    retq
  %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3
  ret <4 x float> %vecinit6.i
}

define <8 x i8> @_e4(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: _e4:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52]
; X86-NEXT:    retl
;
; X64-LABEL: _e4:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52]
; X64-NEXT:    retq
  %vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0
  %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
  %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 52, i32 2
  %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 52, i32 3
  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 4
  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 5
  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 6
  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 7
  ret <8 x i8> %vecinit7.i
}

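; The function below is a regression test for an instruction-selection crash
; (hence the name).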
define void @crash() nounwind alwaysinline {
; X86-LABEL: crash:
; X86:       ## %bb.0: ## %WGLoopsEntry
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    testb %al, %al
; X86-NEXT:    je LBB33_1
; X86-NEXT:  ## %bb.2: ## %ret
; X86-NEXT:    retl
; X86-NEXT:    .p2align 4
; X86-NEXT:  LBB33_1: ## %footer329VF
; X86-NEXT:    ## =>This Inner Loop Header: Depth=1
; X86-NEXT:    jmp LBB33_1
;
; X64-LABEL: crash:
; X64:       ## %bb.0: ## %WGLoopsEntry
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    testb %al, %al
; X64-NEXT:    je LBB33_1
; X64-NEXT:  ## %bb.2: ## %ret
; X64-NEXT:    retq
; X64-NEXT:    .p2align 4
; X64-NEXT:  LBB33_1: ## %footer329VF
; X64-NEXT:    ## =>This Inner Loop Header: Depth=1
; X64-NEXT:    jmp LBB33_1
WGLoopsEntry:
  br i1 undef, label %ret, label %footer329VF

footer329VF:
  %A.0.inVF = fmul float undef, 6.553600e+04
  %B.0.in407VF = fmul <8 x float> undef, <float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04>
  %A.0VF = fptosi float %A.0.inVF to i32
  %B.0408VF = fptosi <8 x float> %B.0.in407VF to <8 x i32>
  %0 = and <8 x i32> %B.0408VF, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %1 = and i32 %A.0VF, 65535
  %temp1098VF = insertelement <8 x i32> undef, i32 %1, i32 0
  %vector1099VF = shufflevector <8 x i32> %temp1098VF, <8 x i32> undef, <8 x i32> zeroinitializer
  br i1 undef, label %preload1201VF, label %footer349VF

preload1201VF:
  br label %footer349VF

footer349VF:
  %2 = mul nsw <8 x i32> undef, %0
  %3 = mul nsw <8 x i32> undef, %vector1099VF
  br label %footer329VF

ret:
  ret void
}

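; Check that splatting a scalar argument or an existing vector lane selects the
; register form of the broadcast. On i686, scalar arguments live on the stack,
; so a broadcast load from the argument slot is used instead.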
define <8 x i32> @_inreg0(i32 %scalar) nounwind uwtable readnone ssp {
; X86-LABEL: _inreg0:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
;
; X64-AVX2-LABEL: _inreg0:
; X64-AVX2:       ## %bb.0:
; X64-AVX2-NEXT:    vmovd %edi, %xmm0
; X64-AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512VL-LABEL: _inreg0:
; X64-AVX512VL:       ## %bb.0:
; X64-AVX512VL-NEXT:    vpbroadcastd %edi, %ymm0
; X64-AVX512VL-NEXT:    retq
  %in = insertelement <8 x i32> undef, i32 %scalar, i32 0
  %wide = shufflevector <8 x i32> %in, <8 x i32> undef, <8 x i32> zeroinitializer
  ret <8 x i32> %wide
}

define <8 x float> @_inreg1(float %scalar) nounwind uwtable readnone ssp {
; X86-LABEL: _inreg1:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg1:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %in = insertelement <8 x float> undef, float %scalar, i32 0
  %wide = shufflevector <8 x float> %in, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %wide
}

define <4 x float> @_inreg2(float %scalar) nounwind uwtable readnone ssp {
; X86-LABEL: _inreg2:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg2:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %in = insertelement <4 x float> undef, float %scalar, i32 0
  %wide = shufflevector <4 x float> %in, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %wide
}

define <4 x double> @_inreg3(double %scalar) nounwind uwtable readnone ssp {
; X86-LABEL: _inreg3:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg3:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %in = insertelement <4 x double> undef, double %scalar, i32 0
  %wide = shufflevector <4 x double> %in, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %wide
}

define <8 x float> @_inreg8xfloat(<8 x float> %a) {
; X86-LABEL: _inreg8xfloat:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastss %xmm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg8xfloat:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %b = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %b
}

define <4 x float> @_inreg4xfloat(<4 x float> %a) {
; X86-LABEL: _inreg4xfloat:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastss %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg4xfloat:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %b = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %b
}

define <16 x i16> @_inreg16xi16(<16 x i16> %a) {
; X86-LABEL: _inreg16xi16:
; X86:       ## %bb.0:
; X86-NEXT:    vpbroadcastw %xmm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg16xi16:
; X64:       ## %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %ymm0
; X64-NEXT:    retq
  %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> zeroinitializer
  ret <16 x i16> %b
}

define <8 x i16> @_inreg8xi16(<8 x i16> %a) {
; X86-LABEL: _inreg8xi16:
; X86:       ## %bb.0:
; X86-NEXT:    vpbroadcastw %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg8xi16:
; X64:       ## %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
; X64-NEXT:    retq
  %b = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %b
}

define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
; X86-LABEL: _inreg4xi64:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastsd %xmm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg4xi64:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %b
}

define <2 x i64> @_inreg2xi64(<2 x i64> %a) {
; X86-LABEL: _inreg2xi64:
; X86:       ## %bb.0:
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: _inreg2xi64:
; X64:       ## %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %b = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %b
}

define <4 x double> @_inreg4xdouble(<4 x double> %a) {
; X86-LABEL: _inreg4xdouble:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastsd %xmm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg4xdouble:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %b = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %b
}

define <2 x double> @_inreg2xdouble(<2 x double> %a) {
; X86-LABEL: _inreg2xdouble:
; X86:       ## %bb.0:
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: _inreg2xdouble:
; X64:       ## %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %b
}

define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
; X86-LABEL: _inreg8xi32:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastss %xmm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg8xi32:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
  ret <8 x i32> %b
}

define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
; X86-LABEL: _inreg4xi32:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastss %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg4xi32:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %b
}

define <32 x i8> @_inreg32xi8(<32 x i8> %a) {
; X86-LABEL: _inreg32xi8:
; X86:       ## %bb.0:
; X86-NEXT:    vpbroadcastb %xmm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg32xi8:
; X64:       ## %bb.0:
; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
; X64-NEXT:    retq
  %b = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
  ret <32 x i8> %b
}

define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
; X86-LABEL: _inreg16xi8:
; X86:       ## %bb.0:
; X86-NEXT:    vpbroadcastb %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: _inreg16xi8:
; X64:       ## %bb.0:
; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
; X64-NEXT:    retq
  %b = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %b
}

; These tests check that a vbroadcast instruction is used when we have a splat
; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
; (via the insertelements).

define <8 x float> @splat_concat1(float %f) {
; X86-LABEL: splat_concat1:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: splat_concat1:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = insertelement <4 x float> undef, float %f, i32 0
  %2 = insertelement <4 x float> %1, float %f, i32 1
  %3 = insertelement <4 x float> %2, float %f, i32 2
  %4 = insertelement <4 x float> %3, float %f, i32 3
  %5 = shufflevector <4 x float> %4, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %5
}

define <8 x float> @splat_concat2(float %f) {
; X86-LABEL: splat_concat2:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: splat_concat2:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = insertelement <4 x float> undef, float %f, i32 0
  %2 = insertelement <4 x float> %1, float %f, i32 1
  %3 = insertelement <4 x float> %2, float %f, i32 2
  %4 = insertelement <4 x float> %3, float %f, i32 3
  %5 = insertelement <4 x float> undef, float %f, i32 0
  %6 = insertelement <4 x float> %5, float %f, i32 1
  %7 = insertelement <4 x float> %6, float %f, i32 2
  %8 = insertelement <4 x float> %7, float %f, i32 3
  %9 = shufflevector <4 x float> %4, <4 x float> %8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %9
}

define <4 x double> @splat_concat3(double %d) {
; X86-LABEL: splat_concat3:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: splat_concat3:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = insertelement <2 x double> undef, double %d, i32 0
  %2 = insertelement <2 x double> %1, double %d, i32 1
  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %3
}

define <4 x double> @splat_concat4(double %d) {
; X86-LABEL: splat_concat4:
; X86:       ## %bb.0:
; X86-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: splat_concat4:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = insertelement <2 x double> undef, double %d, i32 0
  %2 = insertelement <2 x double> %1, double %d, i32 1
  %3 = insertelement <2 x double> undef, double %d, i32 0
  %4 = insertelement <2 x double> %3, double %d, i32 1
  %5 = shufflevector <2 x double> %2, <2 x double> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %5
}

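; A splat feeding a 512-bit store: with AVX2 the broadcast is split into two
; 256-bit stores, while AVX512VL uses a single zmm broadcast and store.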
define void @broadcast_v16i32(ptr %a, ptr %b) {
; X86-AVX2-LABEL: broadcast_v16i32:
; X86-AVX2:       ## %bb.0:
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT:    vbroadcastss (%ecx), %ymm0
; X86-AVX2-NEXT:    vmovups %ymm0, 32(%eax)
; X86-AVX2-NEXT:    vmovups %ymm0, (%eax)
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X64-AVX2-LABEL: broadcast_v16i32:
; X64-AVX2:       ## %bb.0:
; X64-AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-AVX2-NEXT:    vmovups %ymm0, 32(%rsi)
; X64-AVX2-NEXT:    vmovups %ymm0, (%rsi)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X86-AVX512VL-LABEL: broadcast_v16i32:
; X86-AVX512VL:       ## %bb.0:
; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512VL-NEXT:    vbroadcastss (%ecx), %zmm0
; X86-AVX512VL-NEXT:    vmovups %zmm0, (%eax)
; X86-AVX512VL-NEXT:    vzeroupper
; X86-AVX512VL-NEXT:    retl
;
; X64-AVX512VL-LABEL: broadcast_v16i32:
; X64-AVX512VL:       ## %bb.0:
; X64-AVX512VL-NEXT:    vbroadcastss (%rdi), %zmm0
; X64-AVX512VL-NEXT:    vmovups %zmm0, (%rsi)
; X64-AVX512VL-NEXT:    vzeroupper
; X64-AVX512VL-NEXT:    retq
  %1 = load i32, ptr %a, align 4
  %2 = insertelement <8 x i32> undef, i32 %1, i32 0
  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer
  %4 = shufflevector <8 x i32> undef, <8 x i32> %3, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i32> %4, ptr %b, align 4
  ret void
}

; Test cases for <rdar://problem/16074331>.
; Instruction selection for the broadcast instruction fails if
; the load cannot be folded into the broadcast.
; This happens if the load initially has one use, but other uses are
; created later, or if the SelectionDAG cannot prove that folding the
; load will not create a cycle in the DAG.
; These test cases exercise the latter.

define void @isel_crash_16b(ptr %cV_R.addr) {
; X86-LABEL: isel_crash_16b:
; X86:       ## %bb.0: ## %eintry
; X86-NEXT:    subl $60, %esp
; X86-NEXT:    .cfi_def_cfa_offset 64
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%esp)
; X86-NEXT:    vpbroadcastb (%eax), %xmm1
; X86-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    addl $60, %esp
; X86-NEXT:    retl
;
; X64-LABEL: isel_crash_16b:
; X64:       ## %bb.0: ## %eintry
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    vpbroadcastb (%rdi), %xmm1
; X64-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    retq
eintry:
  %__a.addr.i = alloca <2 x i64>, align 16
  %__b.addr.i = alloca <2 x i64>, align 16
  %vCr = alloca <2 x i64>, align 16
  store <2 x i64> zeroinitializer, ptr %vCr, align 16
  %tmp = load <2 x i64>, ptr %vCr, align 16
  %tmp2 = load i8, ptr %cV_R.addr, align 4
  %splat.splatinsert = insertelement <16 x i8> undef, i8 %tmp2, i32 0
  %splat.splat = shufflevector <16 x i8> %splat.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %tmp3 = bitcast <16 x i8> %splat.splat to <2 x i64>
  store <2 x i64> %tmp, ptr %__a.addr.i, align 16
  store <2 x i64> %tmp3, ptr %__b.addr.i, align 16
  ret void
}

define void @isel_crash_32b(ptr %cV_R.addr) {
; X86-LABEL: isel_crash_32b:
; X86:       ## %bb.0: ## %eintry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-32, %esp
; X86-NEXT:    subl $128, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovaps %ymm0, (%esp)
; X86-NEXT:    vpbroadcastb (%eax), %ymm1
; X86-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: isel_crash_32b:
; X64:       ## %bb.0: ## %eintry
; X64-NEXT:    pushq %rbp
; X64-NEXT:    .cfi_def_cfa_offset 16
; X64-NEXT:    .cfi_offset %rbp, -16
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    .cfi_def_cfa_register %rbp
; X64-NEXT:    andq $-32, %rsp
; X64-NEXT:    subq $128, %rsp
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovaps %ymm0, (%rsp)
; X64-NEXT:    vpbroadcastb (%rdi), %ymm1
; X64-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-NEXT:    movq %rbp, %rsp
; X64-NEXT:    popq %rbp
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
eintry:
  %__a.addr.i = alloca <4 x i64>, align 32
  %__b.addr.i = alloca <4 x i64>, align 32
  %vCr = alloca <4 x i64>, align 32
  store <4 x i64> zeroinitializer, ptr %vCr, align 16
  %tmp = load <4 x i64>, ptr %vCr, align 16
  %tmp2 = load i8, ptr %cV_R.addr, align 4
  %splat.splatinsert = insertelement <32 x i8> undef, i8 %tmp2, i32 0
  %splat.splat = shufflevector <32 x i8> %splat.splatinsert, <32 x i8> undef, <32 x i32> zeroinitializer
  %tmp3 = bitcast <32 x i8> %splat.splat to <4 x i64>
  store <4 x i64> %tmp, ptr %__a.addr.i, align 16
  store <4 x i64> %tmp3, ptr %__b.addr.i, align 16
  ret void
}

define void @isel_crash_8w(ptr %cV_R.addr) {
; X86-LABEL: isel_crash_8w:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    subl $60, %esp
; X86-NEXT:    .cfi_def_cfa_offset 64
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%esp)
; X86-NEXT:    vpbroadcastw (%eax), %xmm1
; X86-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    addl $60, %esp
; X86-NEXT:    retl
;
; X64-LABEL: isel_crash_8w:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    vpbroadcastw (%rdi), %xmm1
; X64-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    retq
entry:
  %__a.addr.i = alloca <2 x i64>, align 16
  %__b.addr.i = alloca <2 x i64>, align 16
  %vCr = alloca <2 x i64>, align 16
  store <2 x i64> zeroinitializer, ptr %vCr, align 16
  %tmp = load <2 x i64>, ptr %vCr, align 16
  %tmp2 = load i16, ptr %cV_R.addr, align 4
  %splat.splatinsert = insertelement <8 x i16> undef, i16 %tmp2, i32 0
  %splat.splat = shufflevector <8 x i16> %splat.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %tmp3 = bitcast <8 x i16> %splat.splat to <2 x i64>
  store <2 x i64> %tmp, ptr %__a.addr.i, align 16
  store <2 x i64> %tmp3, ptr %__b.addr.i, align 16
  ret void
}

define void @isel_crash_16w(ptr %cV_R.addr) {
; X86-LABEL: isel_crash_16w:
; X86:       ## %bb.0: ## %eintry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-32, %esp
; X86-NEXT:    subl $128, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovaps %ymm0, (%esp)
; X86-NEXT:    vpbroadcastw (%eax), %ymm1
; X86-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: isel_crash_16w:
; X64:       ## %bb.0: ## %eintry
; X64-NEXT:    pushq %rbp
; X64-NEXT:    .cfi_def_cfa_offset 16
; X64-NEXT:    .cfi_offset %rbp, -16
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    .cfi_def_cfa_register %rbp
; X64-NEXT:    andq $-32, %rsp
; X64-NEXT:    subq $128, %rsp
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovaps %ymm0, (%rsp)
; X64-NEXT:    vpbroadcastw (%rdi), %ymm1
; X64-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-NEXT:    movq %rbp, %rsp
; X64-NEXT:    popq %rbp
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
eintry:
  %__a.addr.i = alloca <4 x i64>, align 32
  %__b.addr.i = alloca <4 x i64>, align 32
  %vCr = alloca <4 x i64>, align 32
  store <4 x i64> zeroinitializer, ptr %vCr, align 16
  %tmp = load <4 x i64>, ptr %vCr, align 16
  %tmp2 = load i16, ptr %cV_R.addr, align 4
  %splat.splatinsert = insertelement <16 x i16> undef, i16 %tmp2, i32 0
  %splat.splat = shufflevector <16 x i16> %splat.splatinsert, <16 x i16> undef, <16 x i32> zeroinitializer
  %tmp3 = bitcast <16 x i16> %splat.splat to <4 x i64>
  store <4 x i64> %tmp, ptr %__a.addr.i, align 16
  store <4 x i64> %tmp3, ptr %__b.addr.i, align 16
  ret void
}

define void @isel_crash_4d(ptr %cV_R.addr) {
; X86-LABEL: isel_crash_4d:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    subl $60, %esp
; X86-NEXT:    .cfi_def_cfa_offset 64
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%esp)
; X86-NEXT:    vbroadcastss (%eax), %xmm1
; X86-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    addl $60, %esp
; X86-NEXT:    retl
;
; X64-LABEL: isel_crash_4d:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    vbroadcastss (%rdi), %xmm1
; X64-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    retq
entry:
  %__a.addr.i = alloca <2 x i64>, align 16
  %__b.addr.i = alloca <2 x i64>, align 16
  %vCr = alloca <2 x i64>, align 16
  store <2 x i64> zeroinitializer, ptr %vCr, align 16
  %tmp = load <2 x i64>, ptr %vCr, align 16
  %tmp2 = load i32, ptr %cV_R.addr, align 4
  %splat.splatinsert = insertelement <4 x i32> undef, i32 %tmp2, i32 0
  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %tmp3 = bitcast <4 x i32> %splat.splat to <2 x i64>
  store <2 x i64> %tmp, ptr %__a.addr.i, align 16
  store <2 x i64> %tmp3, ptr %__b.addr.i, align 16
  ret void
}

define void @isel_crash_8d(ptr %cV_R.addr) {
; X86-LABEL: isel_crash_8d:
; X86:       ## %bb.0: ## %eintry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-32, %esp
; X86-NEXT:    subl $128, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovaps %ymm0, (%esp)
; X86-NEXT:    vbroadcastss (%eax), %ymm1
; X86-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: isel_crash_8d:
; X64:       ## %bb.0: ## %eintry
; X64-NEXT:    pushq %rbp
; X64-NEXT:    .cfi_def_cfa_offset 16
; X64-NEXT:    .cfi_offset %rbp, -16
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    .cfi_def_cfa_register %rbp
; X64-NEXT:    andq $-32, %rsp
; X64-NEXT:    subq $128, %rsp
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovaps %ymm0, (%rsp)
; X64-NEXT:    vbroadcastss (%rdi), %ymm1
; X64-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
; X64-NEXT:    movq %rbp, %rsp
; X64-NEXT:    popq %rbp
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
eintry:
  %__a.addr.i = alloca <4 x i64>, align 32
  %__b.addr.i = alloca <4 x i64>, align 32
  %vCr = alloca <4 x i64>, align 32
  store <4 x i64> zeroinitializer, ptr %vCr, align 16
  %tmp = load <4 x i64>, ptr %vCr, align 16
  %tmp2 = load i32, ptr %cV_R.addr, align 4
  %splat.splatinsert = insertelement <8 x i32> undef, i32 %tmp2, i32 0
  %splat.splat = shufflevector <8 x i32> %splat.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %tmp3 = bitcast <8 x i32> %splat.splat to <4 x i64>
  store <4 x i64> %tmp, ptr %__a.addr.i, align 16
  store <4 x i64> %tmp3, ptr %__b.addr.i, align 16
  ret void
}

define void @isel_crash_2q(ptr %cV_R.addr) {
; X86-LABEL: isel_crash_2q:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    subl $60, %esp
; X86-NEXT:    .cfi_def_cfa_offset 64
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%esp)
; X86-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
; X86-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    addl $60, %esp
; X86-NEXT:    retl
;
; X64-LABEL: isel_crash_2q:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
; X64-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    retq
entry:
  %__a.addr.i = alloca <2 x i64>, align 16
  %__b.addr.i = alloca <2 x i64>, align 16
  %vCr = alloca <2 x i64>, align 16
  store <2 x i64> zeroinitializer, ptr %vCr, align 16
  %tmp = load <2 x i64>, ptr %vCr, align 16
  %tmp2 = load i64, ptr %cV_R.addr, align 4
  %splat.splatinsert = insertelement <2 x i64> undef, i64 %tmp2, i32 0
  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
  store <2 x i64> %tmp, ptr %__a.addr.i, align 16
  store <2 x i64> %splat.splat, ptr %__b.addr.i, align 16
  ret void
}

define void @isel_crash_4q(ptr %cV_R.addr) {
; X86-LABEL: isel_crash_4q:
; X86:       ## %bb.0: ## %eintry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-32, %esp
; X86-NEXT:    subl $128, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovaps %ymm0, (%esp)
; X86-NEXT:    vbroadcastsd (%eax), %ymm1
; X86-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: isel_crash_4q:
; X64:       ## %bb.0: ## %eintry
; X64-NEXT:    pushq %rbp
; X64-NEXT:    .cfi_def_cfa_offset 16
; X64-NEXT:    .cfi_offset %rbp, -16
; X64-NEXT:    movq %rsp, %rbp
; X64-NEXT:    .cfi_def_cfa_register %rbp
; X64-NEXT:    andq $-32, %rsp
; X64-NEXT:    subq $128, %rsp
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovaps %ymm0, (%rsp)
; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
; X64-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
; X64-NEXT:    movq %rbp, %rsp
; X64-NEXT:    popq %rbp
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
eintry:
  %__a.addr.i = alloca <4 x i64>, align 32
  %__b.addr.i = alloca <4 x i64>, align 32
  %vCr = alloca <4 x i64>, align 32
  store <4 x i64> zeroinitializer, ptr %vCr, align 16
  %tmp = load <4 x i64>, ptr %vCr, align 16
  %tmp2 = load i64, ptr %cV_R.addr, align 4
  %splat.splatinsert = insertelement <4 x i64> undef, i64 %tmp2, i32 0
  %splat.splat = shufflevector <4 x i64> %splat.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
  store <4 x i64> %tmp, ptr %__a.addr.i, align 16
  store <4 x i64> %splat.splat, ptr %__b.addr.i, align 16
  ret void
}

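; Broadcast of a 64-bit value that is passed as an MMX-sized <1 x i64>
; argument.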
define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind {
; X86-LABEL: broadcast_x86_mmx:
; X86:       ## %bb.0: ## %bb
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-AVX2-LABEL: broadcast_x86_mmx:
; X64-AVX2:       ## %bb.0: ## %bb
; X64-AVX2-NEXT:    vmovq %rdi, %xmm0
; X64-AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512VL-LABEL: broadcast_x86_mmx:
; X64-AVX512VL:       ## %bb.0: ## %bb
; X64-AVX512VL-NEXT:    vpbroadcastq %rdi, %xmm0
; X64-AVX512VL-NEXT:    retq
bb:
  %tmp1 = bitcast <1 x i64> %tmp to i64
  %tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0
  %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
  %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %tmp4
}