; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64
define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: A:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: A:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}
define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: A2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    pushl %esi
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    .cfi_offset %esi, -8
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %edx
; X32-NEXT:    movl 4(%ecx), %esi
; X32-NEXT:    vbroadcastsd (%ecx), %ymm0
; X32-NEXT:    movl %edx, (%eax)
; X32-NEXT:    movl %esi, 4(%eax)
; X32-NEXT:    popl %esi
; X32-NEXT:    retl
;
; X64-LABEL: A2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movq %rax, (%rsi)
; X64-NEXT:    vmovq %rax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}
define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: B:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  ret <8 x i32> %vecinit6.i
}
define <8 x i32> @B2(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: B2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
  %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
  %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
  %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
  ret <8 x i32> %vecinit14.i
}
define <8 x i32> @B3(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: B3:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %ecx
; X32-NEXT:    movl %ecx, (%eax)
; X32-NEXT:    vmovd %ecx, %xmm0
; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: B3:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
  %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
  %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
  %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
  ret <8 x i32> %vecinit14.i
}
define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: C:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: C:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 8
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}
define <4 x double> @C2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: C2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vbroadcastsd (%ecx), %ymm0
; X32-NEXT:    vmovlps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: C2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    vmovlps %xmm0, (%rsi)
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 8
  store double %q, double* %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}
define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: D:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  ret <8 x float> %vecinit6.i
}
define <8 x float> @D2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: D2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
  %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
  %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
  %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
  ret <8 x float> %vecinit14.i
}
define <8 x float> @D3(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: D3:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vbroadcastss (%ecx), %ymm0
; X32-NEXT:    vmovss %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: D3:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    vmovss %xmm0, (%rsi)
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
  %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
  %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
  %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
  ret <8 x float> %vecinit14.i
}
;;;; 128-bit versions
define <4 x float> @e(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: e:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: e:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ret <4 x float> %vecinit6.i
}
define <4 x float> @e2(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: e2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vbroadcastss (%ecx), %xmm0
; X32-NEXT:    vmovss %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: e2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    vmovss %xmm0, (%rsi)
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ret <4 x float> %vecinit6.i
}
; Don't broadcast constants on pre-AVX2 hardware.
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X32-NEXT:    retl
;
; X64-LABEL: _e2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X64-NEXT:    retq
entry:
  %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3
  ret <4 x float> %vecinit6.i
}
define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: F:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: F:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
  ret <4 x i32> %vecinit6.i
}
define <4 x i32> @F2(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: F2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %ecx
; X32-NEXT:    movl %ecx, (%eax)
; X32-NEXT:    vmovd %ecx, %xmm0
; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: F2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
  ret <4 x i32> %vecinit6.i
}
; FIXME: Pointer adjusted broadcasts
define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i32_4i32_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4i32_4i32_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X64-NEXT:    retq
entry:
  %ld = load <4 x i32>, <4 x i32>* %ptr
  %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %ret
}
define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_4i32_33333333:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 12(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 12(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x i32>, <4 x i32>* %ptr
  %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x i32> %ret
}
define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_8i32_55555555:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 20(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8i32_8i32_55555555:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x i32>, <8 x i32>* %ptr
  %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %ret
}
define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f32_4f32_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 4(%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4f32_4f32_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 4(%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x float>, <4 x float>* %ptr
  %ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x float> %ret
}
define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_4f32_33333333:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 12(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8f32_4f32_33333333:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 12(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x float>, <4 x float>* %ptr
  %ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x float> %ret
}
define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_8f32_55555555:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 20(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8f32_8f32_55555555:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x float>, <8 x float>* %ptr
  %ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x float> %ret
}
define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2i64_2i64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-NEXT:    retq
entry:
  %ld = load <2 x i64>, <2 x i64>* %ptr
  %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %ret
}
define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_2i64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 8(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <2 x i64>, <2 x i64>* %ptr
  %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i64> %ret
}
define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_4i64_2222:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 16(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4i64_4i64_2222:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x i64>, <4 x i64>* %ptr
  %ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i64> %ret
}
define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2f64_2f64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %ld = load <2 x double>, <2 x double>* %ptr
  %ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %ret
}
define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_2f64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 8(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4f64_2f64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <2 x double>, <2 x double>* %ptr
  %ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x double> %ret
}
define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_4f64_2222:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 16(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4f64_4f64_2222:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x double>, <4 x double>* %ptr
  %ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x double> %ret
}
; Unsupported vbroadcasts
define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: G:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: G:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
  ret <2 x i64> %vecinit2.i
}
define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: G2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    pushl %esi
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    .cfi_offset %esi, -8
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %edx
; X32-NEXT:    movl 4(%ecx), %esi
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    movl %edx, (%eax)
; X32-NEXT:    movl %esi, 4(%eax)
; X32-NEXT:    popl %esi
; X32-NEXT:    retl
;
; X64-LABEL: G2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movq %rax, (%rsi)
; X64-NEXT:    vmovq %rax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
  ret <2 x i64> %vecinit2.i
}
define <4 x i32> @H(<4 x i32> %a) {
; X32-LABEL: H:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: H:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT:    retq
entry:
  %x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %x
}
define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: I:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: I:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 4
  %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
  ret <2 x double> %vecinit2.i
}
define <2 x double> @I2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: I2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    vmovlps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: I2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    vmovlps %xmm0, (%rsi)
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 4
  store double %q, double* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
  ret <2 x double> %vecinit2.i
}
define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vbroadcastss (%ecx), %xmm0
; X32-NEXT:    movl (%eax), %eax
; X32-NEXT:    movl %eax, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: _RR:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    movl (%rsi), %eax
; X64-NEXT:    movl %eax, (%rax)
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ; force a chain
  %j = load i32, i32* %k, align 4
  store i32 %j, i32* undef
  ret <4 x float> %vecinit6.i
}
define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: _RR2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %v = insertelement <4 x float> undef, float %q, i32 0
  %t = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %t
}
; These tests check that a vbroadcast instruction is used when we have a splat
; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
; (via the insertelements).
define <8 x float> @splat_concat1(float* %p) {
; X32-LABEL: splat_concat1:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat1:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load float, float* %p, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %3 = insertelement <4 x float> %2, float %1, i32 1
  %4 = insertelement <4 x float> %3, float %1, i32 2
  %5 = insertelement <4 x float> %4, float %1, i32 3
  %6 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %6
}
define <8 x float> @splat_concat2(float* %p) {
; X32-LABEL: splat_concat2:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat2:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load float, float* %p, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %3 = insertelement <4 x float> %2, float %1, i32 1
  %4 = insertelement <4 x float> %3, float %1, i32 2
  %5 = insertelement <4 x float> %4, float %1, i32 3
  %6 = insertelement <4 x float> undef, float %1, i32 0
  %7 = insertelement <4 x float> %6, float %1, i32 1
  %8 = insertelement <4 x float> %7, float %1, i32 2
  %9 = insertelement <4 x float> %8, float %1, i32 3
  %10 = shufflevector <4 x float> %5, <4 x float> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %10
}
define <4 x double> @splat_concat3(double* %p) {
; X32-LABEL: splat_concat3:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat3:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double %1, i32 1
  %4 = shufflevector <2 x double> %3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %4
}
define <4 x double> @splat_concat4(double* %p) {
; X32-LABEL: splat_concat4:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat4:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double %1, i32 1
  %4 = insertelement <2 x double> undef, double %1, i32 0
  %5 = insertelement <2 x double> %2, double %1, i32 1
  %6 = shufflevector <2 x double> %3, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %6
}
define <4 x double> @broadcast_shuffle_1000(double* %p) {
; X32-LABEL: broadcast_shuffle_1000:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: broadcast_shuffle_1000:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x double> %3
}
define <4 x double> @broadcast_shuffle1032(double* %p) {
; X32-LABEL: broadcast_shuffle1032:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: broadcast_shuffle1032:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p
  %2 = insertelement <2 x double> undef, double %1, i32 1
  %3 = insertelement <2 x double> undef, double %1, i32 0
  %4 = shufflevector <2 x double> %2, <2 x double> %3, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %4
}
define void @broadcast_v16i32(i32* %a, <16 x i32>* %b) {
; X32-LABEL: broadcast_v16i32:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vbroadcastss (%ecx), %ymm0
; X32-NEXT:    vmovups %ymm0, 32(%eax)
; X32-NEXT:    vmovups %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: broadcast_v16i32:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    vmovups %ymm0, 32(%rsi)
; X64-NEXT:    vmovups %ymm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %1 = load i32, i32* %a, align 4
  %2 = insertelement <8 x i32> undef, i32 %1, i32 0
  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer
  %4 = shufflevector <8 x i32> undef, <8 x i32> %3, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i32> %4, <16 x i32>* %b, align 4
  ret void
}
; Broadcast scale factor for xyz vector - slp will have vectorized xy.
define double @broadcast_scale_xyz(double* nocapture readonly, double* nocapture readonly) nounwind {
; X32-LABEL: broadcast_scale_xyz:
; X32:       ## %bb.0:
; X32-NEXT:    subl $12, %esp
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    vmulpd (%eax), %xmm0, %xmm1
; X32-NEXT:    vmulsd 16(%eax), %xmm0, %xmm0
; X32-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X32-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X32-NEXT:    vmovsd %xmm0, (%esp)
; X32-NEXT:    fldl (%esp)
; X32-NEXT:    addl $12, %esp
; X32-NEXT:    retl
;
; X64-LABEL: broadcast_scale_xyz:
; X64:       ## %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    vmulpd (%rsi), %xmm0, %xmm1
; X64-NEXT:    vmulsd 16(%rsi), %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X64-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %3 = bitcast double* %1 to <2 x double>*
  %4 = load <2 x double>, <2 x double>* %3, align 8
  %5 = getelementptr inbounds double, double* %1, i64 2
  %6 = load double, double* %5, align 8
  %7 = load double, double* %0, align 8
  %8 = insertelement <2 x double> undef, double %7, i32 0
  %9 = shufflevector <2 x double> %8, <2 x double> undef, <2 x i32> zeroinitializer
  %10 = fmul <2 x double> %4, %9
  %11 = fmul double %6, %7
  %12 = extractelement <2 x double> %10, i32 0
  %13 = extractelement <2 x double> %10, i32 1
  %14 = fadd double %12, %13
  %15 = fadd double %11, %14
  ret double %15
}
; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies.
define float @broadcast_lifetime() nounwind {
; X32-LABEL: broadcast_lifetime:
; X32:       ## %bb.0:
; X32-NEXT:    pushl %esi
; X32-NEXT:    subl $40, %esp
; X32-NEXT:    leal {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    calll _gfunc
; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    calll _gfunc
; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vsubss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
; X32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    flds {{[0-9]+}}(%esp)
; X32-NEXT:    addl $40, %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    retl
;
; X64-LABEL: broadcast_lifetime:
; X64:       ## %bb.0:
; X64-NEXT:    subq $40, %rsp
; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT:    callq _gfunc
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT:    callq _gfunc
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
; X64-NEXT:    addq $40, %rsp
; X64-NEXT:    retq
  %1 = alloca <4 x float>, align 16
  %2 = alloca <4 x float>, align 16
  %3 = bitcast <4 x float>* %1 to i8*
  %4 = bitcast <4 x float>* %2 to i8*

  call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
  call void @gfunc(<4 x float>* %1)
  %5 = load <4 x float>, <4 x float>* %1, align 16
  call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)

  call void @llvm.lifetime.start.p0i8(i64 16, i8* %4)
  call void @gfunc(<4 x float>* %2)
  %6 = load <4 x float>, <4 x float>* %2, align 16
  call void @llvm.lifetime.end.p0i8(i64 16, i8* %4)

  %7 = extractelement <4 x float> %5, i32 1
  %8 = extractelement <4 x float> %6, i32 1
  %9 = fsub float %8, %7
  ret float %9
}
declare void @gfunc(<4 x float>*)
declare void @llvm.lifetime.start.p0i8(i64, i8*)
declare void @llvm.lifetime.end.p0i8(i64, i8*)