1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64
; @A: splat a loaded i64 into <4 x i64>; both targets should fold the load
; into a single vbroadcastsd.
5 define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
7 ; X32: ## %bb.0: ## %entry
8 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
9 ; X32-NEXT: vbroadcastsd (%eax), %ymm0
13 ; X64: ## %bb.0: ## %entry
14 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
17 %q = load i64, i64* %ptr, align 8
18 %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
19 %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
20 %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
21 %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
22 ret <4 x i64> %vecinit6.i
; @A2: same splat as @A, but the store gives the load a second (chain) user,
; so the load cannot be folded into a broadcast on X64; X32 still broadcasts
; from memory while doing the 2x32-bit copy through GPRs.
25 define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
27 ; X32: ## %bb.0: ## %entry
28 ; X32-NEXT: pushl %esi
29 ; X32-NEXT: .cfi_def_cfa_offset 8
30 ; X32-NEXT: .cfi_offset %esi, -8
31 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
32 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
33 ; X32-NEXT: movl (%ecx), %edx
34 ; X32-NEXT: movl 4(%ecx), %esi
35 ; X32-NEXT: vbroadcastsd (%ecx), %ymm0
36 ; X32-NEXT: movl %edx, (%eax)
37 ; X32-NEXT: movl %esi, 4(%eax)
42 ; X64: ## %bb.0: ## %entry
43 ; X64-NEXT: movq (%rdi), %rax
44 ; X64-NEXT: movq %rax, (%rsi)
45 ; X64-NEXT: vmovq %rax, %xmm0
46 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
47 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
50 %q = load i64, i64* %ptr, align 8
51 store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
52 %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
53 %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
54 %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
55 %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
56 ret <4 x i64> %vecinit6.i
; @B: splat a loaded i32 into the low half of <8 x i32> (lanes 0-3 only);
; still expected to select vbroadcastss (upper lanes are undef).
59 define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp {
61 ; X32: ## %bb.0: ## %entry
62 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
63 ; X32-NEXT: vbroadcastss (%eax), %ymm0
67 ; X64: ## %bb.0: ## %entry
68 ; X64-NEXT: vbroadcastss (%rdi), %ymm0
71 %q = load i32, i32* %ptr, align 4
72 %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
73 %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
74 %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
75 %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
76 ret <8 x i32> %vecinit6.i
; @B2: full 8-lane i32 splat from a loaded scalar; expects vbroadcastss %ymm.
79 define <8 x i32> @B2(i32* %ptr) nounwind uwtable readnone ssp {
81 ; X32: ## %bb.0: ## %entry
82 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
83 ; X32-NEXT: vbroadcastss (%eax), %ymm0
87 ; X64: ## %bb.0: ## %entry
88 ; X64-NEXT: vbroadcastss (%rdi), %ymm0
91 %q = load i32, i32* %ptr, align 4
92 %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
93 %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
94 %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
95 %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
96 %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
97 %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
98 %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
99 %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
100 ret <8 x i32> %vecinit14.i
; @B3: like @B2, but the store chains the load so no memory broadcast forms;
; the splat is done in-register via vmovd + vpshufd + vinsertf128 (AVX1 has
; no register-source vbroadcast).
103 define <8 x i32> @B3(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
105 ; X32: ## %bb.0: ## %entry
106 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
107 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
108 ; X32-NEXT: movl (%ecx), %ecx
109 ; X32-NEXT: movl %ecx, (%eax)
110 ; X32-NEXT: vmovd %ecx, %xmm0
111 ; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
112 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
116 ; X64: ## %bb.0: ## %entry
117 ; X64-NEXT: movl (%rdi), %eax
118 ; X64-NEXT: movl %eax, (%rsi)
119 ; X64-NEXT: vmovd %eax, %xmm0
120 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
121 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
124 %q = load i32, i32* %ptr, align 4
125 store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
126 %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
127 %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
128 %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
129 %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
130 %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
131 %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
132 %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
133 %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
134 ret <8 x i32> %vecinit14.i
; @C: splat a loaded double into <4 x double>; expects vbroadcastsd %ymm.
137 define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp {
139 ; X32: ## %bb.0: ## %entry
140 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
141 ; X32-NEXT: vbroadcastsd (%eax), %ymm0
145 ; X64: ## %bb.0: ## %entry
146 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
149 %q = load double, double* %ptr, align 8
150 %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
151 %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
152 %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
153 %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
154 ret <4 x double> %vecinit6.i
; @C2: chained-load variant of @C; the scalar is kept in xmm0 (vmovsd),
; stored, then splatted in-register with vmovddup + vinsertf128.
157 define <4 x double> @C2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
159 ; X32: ## %bb.0: ## %entry
160 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
161 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
162 ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
163 ; X32-NEXT: vmovsd %xmm0, (%eax)
164 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
165 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
169 ; X64: ## %bb.0: ## %entry
170 ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
171 ; X64-NEXT: vmovsd %xmm0, (%rsi)
172 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
173 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
176 %q = load double, double* %ptr, align 8
177 store double %q, double* %ptr2, align 8 ; to create a chain to prevent broadcast
178 %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
179 %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
180 %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
181 %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
182 ret <4 x double> %vecinit6.i
; @D: splat a loaded float into the low 4 lanes of <8 x float>;
; still expected to select vbroadcastss %ymm.
185 define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp {
187 ; X32: ## %bb.0: ## %entry
188 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
189 ; X32-NEXT: vbroadcastss (%eax), %ymm0
193 ; X64: ## %bb.0: ## %entry
194 ; X64-NEXT: vbroadcastss (%rdi), %ymm0
197 %q = load float, float* %ptr, align 4
198 %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
199 %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
200 %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
201 %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
202 ret <8 x float> %vecinit6.i
; @D2: full 8-lane float splat from a loaded scalar; expects vbroadcastss %ymm.
205 define <8 x float> @D2(float* %ptr) nounwind uwtable readnone ssp {
207 ; X32: ## %bb.0: ## %entry
208 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
209 ; X32-NEXT: vbroadcastss (%eax), %ymm0
213 ; X64: ## %bb.0: ## %entry
214 ; X64-NEXT: vbroadcastss (%rdi), %ymm0
217 %q = load float, float* %ptr, align 4
218 %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
219 %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
220 %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
221 %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
222 %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
223 %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
224 %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
225 %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
226 ret <8 x float> %vecinit14.i
; @D3: chained-load variant of @D2; no memory broadcast — the scalar stays in
; xmm0 and is splatted via vpermilps + vinsertf128.
229 define <8 x float> @D3(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
231 ; X32: ## %bb.0: ## %entry
232 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
233 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
234 ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
235 ; X32-NEXT: vmovss %xmm0, (%eax)
236 ; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
237 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
241 ; X64: ## %bb.0: ## %entry
242 ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
243 ; X64-NEXT: vmovss %xmm0, (%rsi)
244 ; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
245 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
248 %q = load float, float* %ptr, align 4
249 store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
250 %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
251 %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
252 %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
253 %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
254 %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
255 %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
256 %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
257 %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
258 ret <8 x float> %vecinit14.i
261 ;;;; 128-bit versions
; @e: 4-lane float splat from a loaded scalar; expects the xmm form of
; vbroadcastss.
263 define <4 x float> @e(float* %ptr) nounwind uwtable readnone ssp {
265 ; X32: ## %bb.0: ## %entry
266 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
267 ; X32-NEXT: vbroadcastss (%eax), %xmm0
271 ; X64: ## %bb.0: ## %entry
272 ; X64-NEXT: vbroadcastss (%rdi), %xmm0
275 %q = load float, float* %ptr, align 4
276 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
277 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
278 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
279 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
280 ret <4 x float> %vecinit6.i
; @e2: chained-load variant of @e; scalar loaded with vmovss, stored, then
; splatted in-register with vpermilps (no broadcast).
283 define <4 x float> @e2(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
285 ; X32: ## %bb.0: ## %entry
286 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
287 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
288 ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
289 ; X32-NEXT: vmovss %xmm0, (%eax)
290 ; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
294 ; X64: ## %bb.0: ## %entry
295 ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
296 ; X64-NEXT: vmovss %xmm0, (%rsi)
297 ; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
300 %q = load float, float* %ptr, align 4
301 store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
302 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
303 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
304 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
305 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
306 ret <4 x float> %vecinit6.i
309 ; Don't broadcast constants on pre-AVX2 hardware.
; @_e2: a constant splat must be materialized with vmovaps from a constant
; pool, not vbroadcastss (AVX1 lacks constant broadcast support).
310 define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
312 ; X32: ## %bb.0: ## %entry
313 ; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
317 ; X64: ## %bb.0: ## %entry
318 ; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
321 %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
322 %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
323 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2
324 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3
325 ret <4 x float> %vecinit6.i
; @F: 4-lane i32 splat from a loaded scalar; expects vbroadcastss %xmm.
329 define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp {
331 ; X32: ## %bb.0: ## %entry
332 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
333 ; X32-NEXT: vbroadcastss (%eax), %xmm0
337 ; X64: ## %bb.0: ## %entry
338 ; X64-NEXT: vbroadcastss (%rdi), %xmm0
341 %q = load i32, i32* %ptr, align 4
342 %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
343 %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
344 %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
345 %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
346 ret <4 x i32> %vecinit6.i
; @F2: chained-load variant of @F; the load goes through a GPR for the store,
; so the splat is done with vmovd + vpshufd instead of a broadcast.
349 define <4 x i32> @F2(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
351 ; X32: ## %bb.0: ## %entry
352 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
353 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
354 ; X32-NEXT: movl (%ecx), %ecx
355 ; X32-NEXT: movl %ecx, (%eax)
356 ; X32-NEXT: vmovd %ecx, %xmm0
357 ; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
361 ; X64: ## %bb.0: ## %entry
362 ; X64-NEXT: movl (%rdi), %eax
363 ; X64-NEXT: movl %eax, (%rsi)
364 ; X64-NEXT: vmovd %eax, %xmm0
365 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
368 %q = load i32, i32* %ptr, align 4
369 store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
370 %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
371 %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
372 %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
373 %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
374 ret <4 x i32> %vecinit6.i
377 ; FIXME: Pointer adjusted broadcasts
; Splat lane 1 of a loaded <4 x i32>: currently a load-folded vpermilps;
; the FIXME above suggests this could be a vbroadcastss 4(ptr) instead.
379 define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
380 ; X32-LABEL: load_splat_4i32_4i32_1111:
381 ; X32: ## %bb.0: ## %entry
382 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
383 ; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
386 ; X64-LABEL: load_splat_4i32_4i32_1111:
387 ; X64: ## %bb.0: ## %entry
388 ; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
391 %ld = load <4 x i32>, <4 x i32>* %ptr
392 %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; Splat lane 3 of a loaded <4 x i32> into 8 lanes: folded to a
; pointer-adjusted vbroadcastss from offset 12 (= 3 * 4 bytes).
396 define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
397 ; X32-LABEL: load_splat_8i32_4i32_33333333:
398 ; X32: ## %bb.0: ## %entry
399 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
400 ; X32-NEXT: vbroadcastss 12(%eax), %ymm0
403 ; X64-LABEL: load_splat_8i32_4i32_33333333:
404 ; X64: ## %bb.0: ## %entry
405 ; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
408 %ld = load <4 x i32>, <4 x i32>* %ptr
409 %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; Splat lane 5 of a loaded <8 x i32>: pointer-adjusted vbroadcastss from
; offset 20 (= 5 * 4 bytes).
413 define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
414 ; X32-LABEL: load_splat_8i32_8i32_55555555:
415 ; X32: ## %bb.0: ## %entry
416 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
417 ; X32-NEXT: vbroadcastss 20(%eax), %ymm0
420 ; X64-LABEL: load_splat_8i32_8i32_55555555:
421 ; X64: ## %bb.0: ## %entry
422 ; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
425 %ld = load <8 x i32>, <8 x i32>* %ptr
426 %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
; Splat lane 1 of a loaded <4 x float>: pointer-adjusted vbroadcastss from
; offset 4 (float variant does use the broadcast, unlike the i32 case above).
430 define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
431 ; X32-LABEL: load_splat_4f32_4f32_1111:
432 ; X32: ## %bb.0: ## %entry
433 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
434 ; X32-NEXT: vbroadcastss 4(%eax), %xmm0
437 ; X64-LABEL: load_splat_4f32_4f32_1111:
438 ; X64: ## %bb.0: ## %entry
439 ; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
442 %ld = load <4 x float>, <4 x float>* %ptr
443 %ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; Splat lane 3 of a loaded <4 x float> into 8 lanes: vbroadcastss 12(ptr).
447 define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
448 ; X32-LABEL: load_splat_8f32_4f32_33333333:
449 ; X32: ## %bb.0: ## %entry
450 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
451 ; X32-NEXT: vbroadcastss 12(%eax), %ymm0
454 ; X64-LABEL: load_splat_8f32_4f32_33333333:
455 ; X64: ## %bb.0: ## %entry
456 ; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
459 %ld = load <4 x float>, <4 x float>* %ptr
460 %ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; Splat lane 5 of a loaded <8 x float>: vbroadcastss 20(ptr).
464 define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
465 ; X32-LABEL: load_splat_8f32_8f32_55555555:
466 ; X32: ## %bb.0: ## %entry
467 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
468 ; X32-NEXT: vbroadcastss 20(%eax), %ymm0
471 ; X64-LABEL: load_splat_8f32_8f32_55555555:
472 ; X64: ## %bb.0: ## %entry
473 ; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
476 %ld = load <8 x float>, <8 x float>* %ptr
477 %ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
; Splat lane 1 of a loaded <2 x i64>: selected as a load-folded vpermilps
; duplicating dwords [2,3] — AVX1 has no 128-bit vbroadcastsd.
481 define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
482 ; X32-LABEL: load_splat_2i64_2i64_1111:
483 ; X32: ## %bb.0: ## %entry
484 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
485 ; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
488 ; X64-LABEL: load_splat_2i64_2i64_1111:
489 ; X64: ## %bb.0: ## %entry
490 ; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
493 %ld = load <2 x i64>, <2 x i64>* %ptr
494 %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; Splat lane 1 of a loaded <2 x i64> into 4 lanes: vbroadcastsd 8(ptr).
498 define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
499 ; X32-LABEL: load_splat_4i64_2i64_1111:
500 ; X32: ## %bb.0: ## %entry
501 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
502 ; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
505 ; X64-LABEL: load_splat_4i64_2i64_1111:
506 ; X64: ## %bb.0: ## %entry
507 ; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
510 %ld = load <2 x i64>, <2 x i64>* %ptr
511 %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; Splat lane 2 of a loaded <4 x i64>: vbroadcastsd 16(ptr) (= 2 * 8 bytes).
515 define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
516 ; X32-LABEL: load_splat_4i64_4i64_2222:
517 ; X32: ## %bb.0: ## %entry
518 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
519 ; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
522 ; X64-LABEL: load_splat_4i64_4i64_2222:
523 ; X64: ## %bb.0: ## %entry
524 ; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
527 %ld = load <4 x i64>, <4 x i64>* %ptr
528 %ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; Splat lane 1 of a loaded <2 x double>: load-folded vmovddup (the 128-bit
; double-splat idiom on AVX1).
532 define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
533 ; X32-LABEL: load_splat_2f64_2f64_1111:
534 ; X32: ## %bb.0: ## %entry
535 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
536 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
539 ; X64-LABEL: load_splat_2f64_2f64_1111:
540 ; X64: ## %bb.0: ## %entry
541 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
544 %ld = load <2 x double>, <2 x double>* %ptr
545 %ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
546 ret <2 x double> %ret
; Splat lane 1 of a loaded <2 x double> into 4 lanes: vbroadcastsd 8(ptr).
549 define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
550 ; X32-LABEL: load_splat_4f64_2f64_1111:
551 ; X32: ## %bb.0: ## %entry
552 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
553 ; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
556 ; X64-LABEL: load_splat_4f64_2f64_1111:
557 ; X64: ## %bb.0: ## %entry
558 ; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
561 %ld = load <2 x double>, <2 x double>* %ptr
562 %ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
563 ret <4 x double> %ret
; Splat lane 2 of a loaded <4 x double>: vbroadcastsd 16(ptr).
566 define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
567 ; X32-LABEL: load_splat_4f64_4f64_2222:
568 ; X32: ## %bb.0: ## %entry
569 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
570 ; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
573 ; X64-LABEL: load_splat_4f64_4f64_2222:
574 ; X64: ## %bb.0: ## %entry
575 ; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
578 %ld = load <4 x double>, <4 x double>* %ptr
579 %ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
580 ret <4 x double> %ret
583 ; Unsupported vbroadcasts
; @G: 2-lane i64 splat from a loaded scalar — AVX1 has no 128-bit 64-bit
; integer broadcast, so a load-folded vmovddup is used instead.
585 define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
587 ; X32: ## %bb.0: ## %entry
588 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
589 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
593 ; X64: ## %bb.0: ## %entry
594 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
597 %q = load i64, i64* %ptr, align 8
598 %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
599 %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
600 ret <2 x i64> %vecinit2.i
; @G2: chained-load variant of @G; X32 still folds a vmovddup from memory
; while copying through GPRs, X64 splats in-register via vmovq + vpshufd.
603 define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
605 ; X32: ## %bb.0: ## %entry
606 ; X32-NEXT: pushl %esi
607 ; X32-NEXT: .cfi_def_cfa_offset 8
608 ; X32-NEXT: .cfi_offset %esi, -8
609 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
610 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
611 ; X32-NEXT: movl (%ecx), %edx
612 ; X32-NEXT: movl 4(%ecx), %esi
613 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
614 ; X32-NEXT: movl %edx, (%eax)
615 ; X32-NEXT: movl %esi, 4(%eax)
616 ; X32-NEXT: popl %esi
620 ; X64: ## %bb.0: ## %entry
621 ; X64-NEXT: movq (%rdi), %rax
622 ; X64-NEXT: movq %rax, (%rsi)
623 ; X64-NEXT: vmovq %rax, %xmm0
624 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
627 %q = load i64, i64* %ptr, align 8
628 store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
629 %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
630 %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
631 ret <2 x i64> %vecinit2.i
; @H: single-element shuffle (lane 1 -> lane 0, rest undef) of a register
; operand — lowered to an in-register vpermilps, no broadcast applicable.
634 define <4 x i32> @H(<4 x i32> %a) {
636 ; X32: ## %bb.0: ## %entry
637 ; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
641 ; X64: ## %bb.0: ## %entry
642 ; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
645 %x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; @I: 2-lane double splat from a loaded scalar (note the under-aligned
; align 4 load): load-folded vmovddup on both targets.
649 define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp {
651 ; X32: ## %bb.0: ## %entry
652 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
653 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
657 ; X64: ## %bb.0: ## %entry
658 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
661 %q = load double, double* %ptr, align 4
662 %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
663 %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
664 ret <2 x double> %vecinit2.i
; @I2: chained-load variant of @I; scalar kept in xmm0 (vmovsd), stored,
; then splatted with a register-source vmovddup.
667 define <2 x double> @I2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
669 ; X32: ## %bb.0: ## %entry
670 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
671 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
672 ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
673 ; X32-NEXT: vmovsd %xmm0, (%eax)
674 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
678 ; X64: ## %bb.0: ## %entry
679 ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
680 ; X64-NEXT: vmovsd %xmm0, (%rsi)
681 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
684 %q = load double, double* %ptr, align 4
685 store double %q, double* %ptr2, align 4 ; to create a chain to prevent broadcast
686 %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
687 %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
688 ret <2 x double> %vecinit2.i
; @_RR: broadcast of one load must coexist with an unrelated load/store pair
; (the i32 from %k stored to an undef address) without being pessimized.
691 define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
693 ; X32: ## %bb.0: ## %entry
694 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
695 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
696 ; X32-NEXT: vbroadcastss (%ecx), %xmm0
697 ; X32-NEXT: movl (%eax), %eax
698 ; X32-NEXT: movl %eax, (%eax)
702 ; X64: ## %bb.0: ## %entry
703 ; X64-NEXT: vbroadcastss (%rdi), %xmm0
704 ; X64-NEXT: movl (%rsi), %eax
705 ; X64-NEXT: movl %eax, (%rax)
708 %q = load float, float* %ptr, align 4
709 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
710 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
711 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
712 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
714 %j = load i32, i32* %k, align 4
715 store i32 %j, i32* undef
716 ret <4 x float> %vecinit6.i
; @_RR2: splat expressed as insertelement + zeroinitializer shuffle (instead
; of four insertelements) still selects vbroadcastss.
719 define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
721 ; X32: ## %bb.0: ## %entry
722 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
723 ; X32-NEXT: vbroadcastss (%eax), %xmm0
727 ; X64: ## %bb.0: ## %entry
728 ; X64-NEXT: vbroadcastss (%rdi), %xmm0
731 %q = load float, float* %ptr, align 4
732 %v = insertelement <4 x float> undef, float %q, i32 0
733 %t = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
737 ; These tests check that a vbroadcast instruction is used when we have a splat
738 ; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
739 ; (via the insertelements).
; splat_concat1: one BUILD_VECTOR concatenated with itself via shufflevector.
741 define <8 x float> @splat_concat1(float* %p) {
742 ; X32-LABEL: splat_concat1:
744 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
745 ; X32-NEXT: vbroadcastss (%eax), %ymm0
748 ; X64-LABEL: splat_concat1:
750 ; X64-NEXT: vbroadcastss (%rdi), %ymm0
752 %1 = load float, float* %p, align 4
753 %2 = insertelement <4 x float> undef, float %1, i32 0
754 %3 = insertelement <4 x float> %2, float %1, i32 1
755 %4 = insertelement <4 x float> %3, float %1, i32 2
756 %5 = insertelement <4 x float> %4, float %1, i32 3
757 %6 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; splat_concat2: two separately-built identical BUILD_VECTORs concatenated;
; still recognized as a single vbroadcastss.
761 define <8 x float> @splat_concat2(float* %p) {
762 ; X32-LABEL: splat_concat2:
764 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
765 ; X32-NEXT: vbroadcastss (%eax), %ymm0
768 ; X64-LABEL: splat_concat2:
770 ; X64-NEXT: vbroadcastss (%rdi), %ymm0
772 %1 = load float, float* %p, align 4
773 %2 = insertelement <4 x float> undef, float %1, i32 0
774 %3 = insertelement <4 x float> %2, float %1, i32 1
775 %4 = insertelement <4 x float> %3, float %1, i32 2
776 %5 = insertelement <4 x float> %4, float %1, i32 3
777 %6 = insertelement <4 x float> undef, float %1, i32 0
778 %7 = insertelement <4 x float> %6, float %1, i32 1
779 %8 = insertelement <4 x float> %7, float %1, i32 2
780 %9 = insertelement <4 x float> %8, float %1, i32 3
781 %10 = shufflevector <4 x float> %5, <4 x float> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; splat_concat3: double variant of splat_concat1 — expects vbroadcastsd.
785 define <4 x double> @splat_concat3(double* %p) {
786 ; X32-LABEL: splat_concat3:
788 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
789 ; X32-NEXT: vbroadcastsd (%eax), %ymm0
792 ; X64-LABEL: splat_concat3:
794 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
796 %1 = load double, double* %p, align 8
797 %2 = insertelement <2 x double> undef, double %1, i32 0
798 %3 = insertelement <2 x double> %2, double %1, i32 1
799 %4 = shufflevector <2 x double> %3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; splat_concat4: double variant of splat_concat2 (two BUILD_VECTORs);
; expects vbroadcastsd.
803 define <4 x double> @splat_concat4(double* %p) {
804 ; X32-LABEL: splat_concat4:
806 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
807 ; X32-NEXT: vbroadcastsd (%eax), %ymm0
810 ; X64-LABEL: splat_concat4:
812 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
814 %1 = load double, double* %p, align 8
815 %2 = insertelement <2 x double> undef, double %1, i32 0
816 %3 = insertelement <2 x double> %2, double %1, i32 1
817 %4 = insertelement <2 x double> undef, double %1, i32 0
818 %5 = insertelement <2 x double> %2, double %1, i32 1
819 %6 = shufflevector <2 x double> %3, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; broadcast_shuffle_1000: only lane 0 of the source vector is defined, and
; the shuffle mask references lane 1 only through the first result element;
; the whole thing still simplifies to a vbroadcastsd splat of the load.
824 define <4 x double> @broadcast_shuffle_1000(double* %p) {
825 ; X32-LABEL: broadcast_shuffle_1000:
827 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
828 ; X32-NEXT: vbroadcastsd (%eax), %ymm0
831 ; X64-LABEL: broadcast_shuffle_1000:
833 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
835 %1 = load double, double* %p
836 %2 = insertelement <2 x double> undef, double %1, i32 0
837 %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
; broadcast_shuffle1032: the same scalar inserted at different lanes of two
; vectors, then shuffled — still recognized as a splat -> vbroadcastsd.
841 define <4 x double> @broadcast_shuffle1032(double* %p) {
842 ; X32-LABEL: broadcast_shuffle1032:
844 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
845 ; X32-NEXT: vbroadcastsd (%eax), %ymm0
848 ; X64-LABEL: broadcast_shuffle1032:
850 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
852 %1 = load double, double* %p
853 %2 = insertelement <2 x double> undef, double %1, i32 1
854 %3 = insertelement <2 x double> undef, double %1, i32 0
855 %4 = shufflevector <2 x double> %2, <2 x double> %3, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; broadcast_v16i32: a 512-bit splat store on a 256-bit target — one ymm
; broadcast reused for both 32-byte vmovups halves; vzeroupper is emitted
; before returning since ymm state was dirtied.
859 define void @broadcast_v16i32(i32* %a, <16 x i32>* %b) {
860 ; X32-LABEL: broadcast_v16i32:
862 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
863 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
864 ; X32-NEXT: vbroadcastss (%ecx), %ymm0
865 ; X32-NEXT: vmovups %ymm0, 32(%eax)
866 ; X32-NEXT: vmovups %ymm0, (%eax)
867 ; X32-NEXT: vzeroupper
870 ; X64-LABEL: broadcast_v16i32:
872 ; X64-NEXT: vbroadcastss (%rdi), %ymm0
873 ; X64-NEXT: vmovups %ymm0, 32(%rsi)
874 ; X64-NEXT: vmovups %ymm0, (%rsi)
875 ; X64-NEXT: vzeroupper
877 %1 = load i32, i32* %a, align 4
878 %2 = insertelement <8 x i32> undef, i32 %1, i32 0
879 %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer
880 %4 = shufflevector <8 x i32> undef, <8 x i32> %3, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
881 store <16 x i32> %4, <16 x i32>* %b, align 4
886 ; Broadcast scale factor for xyz vector - slp will have vectorized xy.
887 ; FIXME: Load as a broadcast and then use the scalar 0'th element.
; broadcast_scale_xyz: the scalar scale is loaded once (vmovsd), duplicated
; with vmovddup for the vector xy multiply, and reused as a scalar for z —
; the FIXME suggests loading as a broadcast and extracting element 0 instead.
889 define double @broadcast_scale_xyz(double* nocapture readonly, double* nocapture readonly) nounwind {
890 ; X32-LABEL: broadcast_scale_xyz:
892 ; X32-NEXT: subl $12, %esp
893 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
894 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
895 ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
896 ; X32-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
897 ; X32-NEXT: vmulpd (%eax), %xmm1, %xmm1
898 ; X32-NEXT: vmulsd 16(%eax), %xmm0, %xmm0
899 ; X32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
900 ; X32-NEXT: vaddsd %xmm2, %xmm1, %xmm1
901 ; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0
902 ; X32-NEXT: vmovsd %xmm0, (%esp)
903 ; X32-NEXT: fldl (%esp)
904 ; X32-NEXT: addl $12, %esp
907 ; X64-LABEL: broadcast_scale_xyz:
909 ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
910 ; X64-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
911 ; X64-NEXT: vmulpd (%rsi), %xmm1, %xmm1
912 ; X64-NEXT: vmulsd 16(%rsi), %xmm0, %xmm0
913 ; X64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
914 ; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm1
915 ; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
917 %3 = bitcast double* %1 to <2 x double>*
918 %4 = load <2 x double>, <2 x double>* %3, align 8
919 %5 = getelementptr inbounds double, double* %1, i64 2
920 %6 = load double, double* %5, align 8
921 %7 = load double, double* %0, align 8
922 %8 = insertelement <2 x double> undef, double %7, i32 0
923 %9 = shufflevector <2 x double> %8, <2 x double> undef, <2 x i32> zeroinitializer
924 %10 = fmul <2 x double> %4, %9
925 %11 = fmul double %6, %7
926 %12 = extractelement <2 x double> %10, i32 0
927 %13 = extractelement <2 x double> %10, i32 1
928 %14 = fadd double %12, %13
929 %15 = fadd double %11, %14
934 ; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies.
; broadcast_lifetime: two allocas with explicit lifetime markers are filled
; by calls to @gfunc; the two loads must stay distinct (spill/reload of the
; first value) and must not be merged into one broadcast across the second
; lifetime.start, which would read a dead slot.
936 define float @broadcast_lifetime() nounwind {
937 ; X32-LABEL: broadcast_lifetime:
939 ; X32-NEXT: pushl %esi
940 ; X32-NEXT: subl $40, %esp
941 ; X32-NEXT: leal {{[0-9]+}}(%esp), %esi
942 ; X32-NEXT: movl %esi, (%esp)
943 ; X32-NEXT: calll _gfunc
944 ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
945 ; X32-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
946 ; X32-NEXT: movl %esi, (%esp)
947 ; X32-NEXT: calll _gfunc
948 ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
949 ; X32-NEXT: vsubss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
950 ; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
951 ; X32-NEXT: flds {{[0-9]+}}(%esp)
952 ; X32-NEXT: addl $40, %esp
953 ; X32-NEXT: popl %esi
956 ; X64-LABEL: broadcast_lifetime:
958 ; X64-NEXT: subq $40, %rsp
959 ; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
960 ; X64-NEXT: callq _gfunc
961 ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
962 ; X64-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
963 ; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
964 ; X64-NEXT: callq _gfunc
965 ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
966 ; X64-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
967 ; X64-NEXT: addq $40, %rsp
969 %1 = alloca <4 x float>, align 16
970 %2 = alloca <4 x float>, align 16
971 %3 = bitcast <4 x float>* %1 to i8*
972 %4 = bitcast <4 x float>* %2 to i8*
974 call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
975 call void @gfunc(<4 x float>* %1)
976 %5 = load <4 x float>, <4 x float>* %1, align 16
977 call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
979 call void @llvm.lifetime.start.p0i8(i64 16, i8* %4)
980 call void @gfunc(<4 x float>* %2)
981 %6 = load <4 x float>, <4 x float>* %2, align 16
982 call void @llvm.lifetime.end.p0i8(i64 16, i8* %4)
984 %7 = extractelement <4 x float> %5, i32 1
985 %8 = extractelement <4 x float> %6, i32 1
986 %9 = fsub float %8, %7
; External helper and lifetime intrinsics used by @broadcast_lifetime above.
990 declare void @gfunc(<4 x float>*)
991 declare void @llvm.lifetime.start.p0i8(i64, i8*)
992 declare void @llvm.lifetime.end.p0i8(i64, i8*)