1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=avx512vl | FileCheck %s
4 ; Test that we can unfold constant pool loads when we're using avx512's
5 ; ability to fold a broadcast load into an operation.
; Loop adds splat(2) to 1024 i32s in <16 x i32> steps; expects the constant
; hoisted into zmm0 via vpbroadcastd with the load folded into vpaddd.
define void @bcast_unfold_add_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp4 = bitcast i32* %tmp3 to <16 x i32>*
; align 4 keeps the vector access unaligned (vmovdqu64 in the CHECKs).
%tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
%tmp6 = add nsw <16 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp3 to <16 x i32>*
store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 16
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; Same add-splat(2) loop at <8 x i32> (ymm) width.
define void @bcast_unfold_add_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB1_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp4 = bitcast i32* %tmp3 to <8 x i32>*
%tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
%tmp6 = add nsw <8 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp3 to <8 x i32>*
store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; Same add-splat(2) loop at <4 x i32> (xmm) width; no vzeroupper needed.
define void @bcast_unfold_add_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB2_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB2_1
; CHECK-NEXT: # %bb.2: # %bb10
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
%tmp6 = add nsw <4 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp3 to <4 x i32>*
store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; Adds splat(2) to 1024 i64s in <8 x i64> (zmm) steps; expects vpbroadcastq
; hoisted with the load folded into vpaddq.
define void @bcast_unfold_add_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB3_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB3_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp4 = bitcast i64* %tmp3 to <8 x i64>*
%tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
%tmp6 = add nsw <8 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
%tmp7 = bitcast i64* %tmp3 to <8 x i64>*
store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; Same add-splat(2) i64 loop at <4 x i64> (ymm) width.
define void @bcast_unfold_add_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB4_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB4_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp4 = bitcast i64* %tmp3 to <4 x i64>*
%tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
%tmp6 = add nsw <4 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2>
%tmp7 = bitcast i64* %tmp3 to <4 x i64>*
store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; Same add-splat(2) i64 loop at <2 x i64> (xmm) width; per the CHECKs the
; constant is materialized with vmovdqa rather than a broadcast here.
define void @bcast_unfold_add_v2i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_add_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB5_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB5_1
; CHECK-NEXT: # %bb.2: # %bb10
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp4 = bitcast i64* %tmp3 to <2 x i64>*
%tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
%tmp6 = add nsw <2 x i64> %tmp5, <i64 2, i64 2>
%tmp7 = bitcast i64* %tmp3 to <2 x i64>*
store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 2
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; Multiplies 1024 i32s by splat(3) in <16 x i32> steps; expects vpbroadcastd
; hoisted with the load folded into vpmulld.
define void @bcast_unfold_mul_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB6_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB6_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp4 = bitcast i32* %tmp3 to <16 x i32>*
%tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
%tmp6 = mul nsw <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%tmp7 = bitcast i32* %tmp3 to <16 x i32>*
store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 16
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; Same mul-by-splat(3) loop at <8 x i32> (ymm) width.
define void @bcast_unfold_mul_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB7_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB7_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp4 = bitcast i32* %tmp3 to <8 x i32>*
%tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
%tmp6 = mul nsw <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%tmp7 = bitcast i32* %tmp3 to <8 x i32>*
store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; Same mul-by-splat(3) loop at <4 x i32> (xmm) width.
define void @bcast_unfold_mul_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB8_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB8_1
; CHECK-NEXT: # %bb.2: # %bb10
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
%tmp6 = mul nsw <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
%tmp7 = bitcast i32* %tmp3 to <4 x i32>*
store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; i64 multiply by splat(3): per the CHECKs there is no single multiply
; instruction used here; codegen expands x*3 as x + (x+x) with two vpaddq,
; so no broadcast constant is materialized at all.
define void @bcast_unfold_mul_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB9_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm0, %zmm1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB9_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp4 = bitcast i64* %tmp3 to <8 x i64>*
%tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
%tmp6 = mul nsw <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
%tmp7 = bitcast i64* %tmp3 to <8 x i64>*
store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; Same i64 mul-by-3 expansion (x + (x+x)) at <4 x i64> (ymm) width.
define void @bcast_unfold_mul_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB10_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm1
; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB10_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp4 = bitcast i64* %tmp3 to <4 x i64>*
%tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
%tmp6 = mul nsw <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
%tmp7 = bitcast i64* %tmp3 to <4 x i64>*
store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; Same i64 mul-by-3 expansion (x + (x+x)) at <2 x i64> (xmm) width.
define void @bcast_unfold_mul_v2i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB11_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm0
; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm1
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovdqu %xmm0, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB11_1
; CHECK-NEXT: # %bb.2: # %bb10
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp4 = bitcast i64* %tmp3 to <2 x i64>*
%tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
%tmp6 = mul nsw <2 x i64> %tmp5, <i64 3, i64 3>
%tmp7 = bitcast i64* %tmp3 to <2 x i64>*
store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 2
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; ORs splat(3) into 1024 i32s in <16 x i32> steps; expects vpbroadcastd
; hoisted with the load folded into vpord.
define void @bcast_unfold_or_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB12_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB12_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp4 = bitcast i32* %tmp3 to <16 x i32>*
%tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
%tmp6 = or <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%tmp7 = bitcast i32* %tmp3 to <16 x i32>*
store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 16
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; OR splat(3) at <8 x i32> width; per the CHECKs this is selected in the
; FP domain (vbroadcastss/vorps/vmovups) rather than the integer domain.
define void @bcast_unfold_or_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB13_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB13_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp4 = bitcast i32* %tmp3 to <8 x i32>*
%tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
%tmp6 = or <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%tmp7 = bitcast i32* %tmp3 to <8 x i32>*
store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; OR splat(3) at <4 x i32> (xmm) width, FP-domain selection per the CHECKs.
define void @bcast_unfold_or_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB14_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB14_1
; CHECK-NEXT: # %bb.2: # %bb10
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
%tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
%tmp6 = or <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
%tmp7 = bitcast i32* %tmp3 to <4 x i32>*
store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; ORs splat(3) into 1024 i64s in <8 x i64> (zmm) steps via vpbroadcastq/vporq.
define void @bcast_unfold_or_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB15_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vporq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB15_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp4 = bitcast i64* %tmp3 to <8 x i64>*
%tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
%tmp6 = or <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
%tmp7 = bitcast i64* %tmp3 to <8 x i64>*
store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; OR splat(3) i64 at <4 x i64> width, FP-domain (vbroadcastsd/vorps) per CHECKs.
define void @bcast_unfold_or_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB16_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB16_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp4 = bitcast i64* %tmp3 to <4 x i64>*
%tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
%tmp6 = or <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
%tmp7 = bitcast i64* %tmp3 to <4 x i64>*
store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; OR splat(3) i64 at <2 x i64> width; constant loaded with vmovaps (no
; 128-bit broadcast used here) per the CHECKs.
define void @bcast_unfold_or_v2i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_or_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB17_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB17_1
; CHECK-NEXT: # %bb.2: # %bb10
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
%tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp4 = bitcast i64* %tmp3 to <2 x i64>*
%tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
%tmp6 = or <2 x i64> %tmp5, <i64 3, i64 3>
%tmp7 = bitcast i64* %tmp3 to <2 x i64>*
store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 2
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb2
; fneg over <16 x float> chunks: expects a hoisted broadcast of -0.0 (the
; sign-bit mask) and the load folded into vpxord.
define void @bcast_unfold_fneg_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB18_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpxord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB18_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
%tmp3 = bitcast float* %tmp2 to <16 x float>*
%tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
%tmp5 = fneg <16 x float> %tmp4
%tmp6 = bitcast float* %tmp2 to <16 x float>*
store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
%tmp7 = add i64 %tmp, 16
%tmp8 = icmp eq i64 %tmp7, 1024
br i1 %tmp8, label %bb9, label %bb1
; Same fneg-as-sign-bit-xor test at <8 x float> (ymm) width.
define void @bcast_unfold_fneg_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB19_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vxorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB19_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
%tmp3 = bitcast float* %tmp2 to <8 x float>*
%tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
%tmp5 = fneg <8 x float> %tmp4
%tmp6 = bitcast float* %tmp2 to <8 x float>*
store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
%tmp7 = add i64 %tmp, 8
%tmp8 = icmp eq i64 %tmp7, 1024
br i1 %tmp8, label %bb9, label %bb1
; Same fneg-as-sign-bit-xor test at <4 x float> (xmm) width.
define void @bcast_unfold_fneg_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB20_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vxorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB20_1
; CHECK-NEXT: # %bb.2: # %bb9
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
%tmp3 = bitcast float* %tmp2 to <4 x float>*
%tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
%tmp5 = fneg <4 x float> %tmp4
%tmp6 = bitcast float* %tmp2 to <4 x float>*
store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
%tmp7 = add i64 %tmp, 4
%tmp8 = icmp eq i64 %tmp7, 1024
br i1 %tmp8, label %bb9, label %bb1
; fneg over <8 x double> (zmm) chunks via broadcast of -0.0 and vpxorq.
define void @bcast_unfold_fneg_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB21_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpxorq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB21_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
%tmp3 = bitcast double* %tmp2 to <8 x double>*
%tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
%tmp5 = fneg <8 x double> %tmp4
%tmp6 = bitcast double* %tmp2 to <8 x double>*
store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
%tmp7 = add i64 %tmp, 8
%tmp8 = icmp eq i64 %tmp7, 1024
br i1 %tmp8, label %bb9, label %bb1
; Same double fneg test at <4 x double> (ymm) width.
define void @bcast_unfold_fneg_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB22_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vxorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB22_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
%tmp3 = bitcast double* %tmp2 to <4 x double>*
%tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
%tmp5 = fneg <4 x double> %tmp4
%tmp6 = bitcast double* %tmp2 to <4 x double>*
store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
%tmp7 = add i64 %tmp, 4
%tmp8 = icmp eq i64 %tmp7, 1024
br i1 %tmp8, label %bb9, label %bb1
; Same double fneg test at <2 x double> (xmm) width; mask comes from a
; vmovaps constant-pool load rather than a broadcast per the CHECKs.
define void @bcast_unfold_fneg_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB23_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vxorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB23_1
; CHECK-NEXT: # %bb.2: # %bb9
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
%tmp3 = bitcast double* %tmp2 to <2 x double>*
%tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
%tmp5 = fneg <2 x double> %tmp4
%tmp6 = bitcast double* %tmp2 to <2 x double>*
store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
%tmp7 = add i64 %tmp, 2
%tmp8 = icmp eq i64 %tmp7, 1024
br i1 %tmp8, label %bb9, label %bb1
; fabs over <16 x float> chunks: expects a hoisted broadcast of the
; clear-sign-bit mask (0x7fffffff, printed as NaN) folded into vpandd.
define void @bcast_unfold_fabs_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB24_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpandd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB24_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
%tmp3 = bitcast float* %tmp2 to <16 x float>*
%tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
%tmp5 = call <16 x float> @llvm.fabs.v16f32(<16 x float> %tmp4)
%tmp6 = bitcast float* %tmp2 to <16 x float>*
store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
%tmp7 = add i64 %tmp, 16
%tmp8 = icmp eq i64 %tmp7, 1024
br i1 %tmp8, label %bb9, label %bb1
852 ; Function Attrs: nounwind readnone speculatable willreturn
853 declare <16 x float> @llvm.fabs.v16f32(<16 x float>) #0
; Same fabs-as-and-mask test at <8 x float> (ymm) width.
define void @bcast_unfold_fabs_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB25_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vandps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB25_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
%tmp3 = bitcast float* %tmp2 to <8 x float>*
%tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
%tmp5 = call <8 x float> @llvm.fabs.v8f32(<8 x float> %tmp4)
%tmp6 = bitcast float* %tmp2 to <8 x float>*
store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
%tmp7 = add i64 %tmp, 8
%tmp8 = icmp eq i64 %tmp7, 1024
br i1 %tmp8, label %bb9, label %bb1
889 ; Function Attrs: nounwind readnone speculatable willreturn
890 declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #0
; Same fabs-as-and-mask test at <4 x float> (xmm) width.
define void @bcast_unfold_fabs_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB26_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vandps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB26_1
; CHECK-NEXT: # %bb.2: # %bb9
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
%tmp3 = bitcast float* %tmp2 to <4 x float>*
%tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
%tmp5 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %tmp4)
%tmp6 = bitcast float* %tmp2 to <4 x float>*
store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
%tmp7 = add i64 %tmp, 4
%tmp8 = icmp eq i64 %tmp7, 1024
br i1 %tmp8, label %bb9, label %bb1
925 ; Function Attrs: nounwind readnone speculatable willreturn
926 declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #0
; fabs over <8 x double> (zmm) chunks via broadcast mask and vpandq.
define void @bcast_unfold_fabs_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB27_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpandq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB27_1
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
%tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
%tmp3 = bitcast double* %tmp2 to <8 x double>*
%tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
%tmp5 = call <8 x double> @llvm.fabs.v8f64(<8 x double> %tmp4)
%tmp6 = bitcast double* %tmp2 to <8 x double>*
store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
%tmp7 = add i64 %tmp, 8
%tmp8 = icmp eq i64 %tmp7, 1024
br i1 %tmp8, label %bb9, label %bb1
962 ; Function Attrs: nounwind readnone speculatable willreturn
963 declare <8 x double> @llvm.fabs.v8f64(<8 x double>) #0
; <4 x double> fabs: same sign-mask AND as the 512-bit case, but at 256 bits
; the mask is materialized with vbroadcastsd and the AND uses vandps on ymm.
965 define void @bcast_unfold_fabs_v4f64(double* %arg) {
966 ; CHECK-LABEL: bcast_unfold_fabs_v4f64:
967 ; CHECK: # %bb.0: # %bb
968 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
969 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
970 ; CHECK-NEXT: .p2align 4, 0x90
971 ; CHECK-NEXT: .LBB28_1: # %bb1
972 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
973 ; CHECK-NEXT: vandps 8192(%rdi,%rax), %ymm0, %ymm1
974 ; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
975 ; CHECK-NEXT: addq $32, %rax
976 ; CHECK-NEXT: jne .LBB28_1
977 ; CHECK-NEXT: # %bb.2: # %bb9
978 ; CHECK-NEXT: vzeroupper
; 1024 doubles, 4 per iteration (32 bytes per step of rax).
983 bb1: ; preds = %bb1, %bb
984 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
985 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
986 %tmp3 = bitcast double* %tmp2 to <4 x double>*
987 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
988 %tmp5 = call <4 x double> @llvm.fabs.v4f64(<4 x double> %tmp4)
989 %tmp6 = bitcast double* %tmp2 to <4 x double>*
990 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
991 %tmp7 = add i64 %tmp, 4
992 %tmp8 = icmp eq i64 %tmp7, 1024
993 br i1 %tmp8, label %bb9, label %bb1
999 ; Function Attrs: nounwind readnone speculatable willreturn
1000 declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #0
; <2 x double> fabs: at 128 bits the mask comes from a full constant-pool
; vector load (vmovaps) rather than a broadcast. No vzeroupper needed since
; only xmm registers are used.
1002 define void @bcast_unfold_fabs_v2f64(double* %arg) {
1003 ; CHECK-LABEL: bcast_unfold_fabs_v2f64:
1004 ; CHECK: # %bb.0: # %bb
1005 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1006 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN]
1007 ; CHECK-NEXT: .p2align 4, 0x90
1008 ; CHECK-NEXT: .LBB29_1: # %bb1
1009 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1010 ; CHECK-NEXT: vandps 8192(%rdi,%rax), %xmm0, %xmm1
1011 ; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
1012 ; CHECK-NEXT: addq $16, %rax
1013 ; CHECK-NEXT: jne .LBB29_1
1014 ; CHECK-NEXT: # %bb.2: # %bb9
; 1024 doubles, 2 per iteration (16 bytes per step of rax).
1019 bb1: ; preds = %bb1, %bb
1020 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1021 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1022 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1023 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1024 %tmp5 = call <2 x double> @llvm.fabs.v2f64(<2 x double> %tmp4)
1025 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1026 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1027 %tmp7 = add i64 %tmp, 2
1028 %tmp8 = icmp eq i64 %tmp7, 1024
1029 br i1 %tmp8, label %bb9, label %bb1
1035 ; Function Attrs: nounwind readnone speculatable willreturn
1036 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #0
; <16 x float> fadd with a splatted 2.0: the broadcast is unfolded (hoisted)
; into zmm0 before the loop, letting the data load fold into vaddps.
1038 define void @bcast_unfold_fadd_v16f32(float* nocapture %arg) {
1039 ; CHECK-LABEL: bcast_unfold_fadd_v16f32:
1040 ; CHECK: # %bb.0: # %bb
1041 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1042 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1043 ; CHECK-NEXT: .p2align 4, 0x90
1044 ; CHECK-NEXT: .LBB30_1: # %bb1
1045 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1046 ; CHECK-NEXT: vaddps 4096(%rdi,%rax), %zmm0, %zmm1
1047 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1048 ; CHECK-NEXT: addq $64, %rax
1049 ; CHECK-NEXT: jne .LBB30_1
1050 ; CHECK-NEXT: # %bb.2: # %bb9
1051 ; CHECK-NEXT: vzeroupper
; 1024 floats, 16 per iteration (64 bytes per step of rax).
1056 bb1: ; preds = %bb1, %bb
1057 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1058 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1059 %tmp3 = bitcast float* %tmp2 to <16 x float>*
1060 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
1061 %tmp5 = fadd <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1062 %tmp6 = bitcast float* %tmp2 to <16 x float>*
1063 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
1064 %tmp7 = add i64 %tmp, 16
1065 %tmp8 = icmp eq i64 %tmp7, 1024
1066 br i1 %tmp8, label %bb9, label %bb1
; <8 x float> fadd + splatted 2.0: 256-bit variant of the test above; the
; hoisted broadcast lives in ymm0 and the load folds into vaddps.
1072 define void @bcast_unfold_fadd_v8f32(float* nocapture %arg) {
1073 ; CHECK-LABEL: bcast_unfold_fadd_v8f32:
1074 ; CHECK: # %bb.0: # %bb
1075 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1076 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1077 ; CHECK-NEXT: .p2align 4, 0x90
1078 ; CHECK-NEXT: .LBB31_1: # %bb1
1079 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1080 ; CHECK-NEXT: vaddps 4096(%rdi,%rax), %ymm0, %ymm1
1081 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1082 ; CHECK-NEXT: addq $32, %rax
1083 ; CHECK-NEXT: jne .LBB31_1
1084 ; CHECK-NEXT: # %bb.2: # %bb9
1085 ; CHECK-NEXT: vzeroupper
; 1024 floats, 8 per iteration (32 bytes per step of rax).
1090 bb1: ; preds = %bb1, %bb
1091 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1092 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1093 %tmp3 = bitcast float* %tmp2 to <8 x float>*
1094 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
1095 %tmp5 = fadd <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1096 %tmp6 = bitcast float* %tmp2 to <8 x float>*
1097 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
1098 %tmp7 = add i64 %tmp, 8
1099 %tmp8 = icmp eq i64 %tmp7, 1024
1100 br i1 %tmp8, label %bb9, label %bb1
; <4 x float> fadd + splatted 2.0: 128-bit variant; broadcast hoisted into
; xmm0, load folded into vaddps. No vzeroupper (xmm only).
1106 define void @bcast_unfold_fadd_v4f32(float* nocapture %arg) {
1107 ; CHECK-LABEL: bcast_unfold_fadd_v4f32:
1108 ; CHECK: # %bb.0: # %bb
1109 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1110 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1111 ; CHECK-NEXT: .p2align 4, 0x90
1112 ; CHECK-NEXT: .LBB32_1: # %bb1
1113 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1114 ; CHECK-NEXT: vaddps 4096(%rdi,%rax), %xmm0, %xmm1
1115 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1116 ; CHECK-NEXT: addq $16, %rax
1117 ; CHECK-NEXT: jne .LBB32_1
1118 ; CHECK-NEXT: # %bb.2: # %bb9
; 1024 floats, 4 per iteration (16 bytes per step of rax).
1123 bb1: ; preds = %bb1, %bb
1124 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1125 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1126 %tmp3 = bitcast float* %tmp2 to <4 x float>*
1127 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1128 %tmp5 = fadd <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1129 %tmp6 = bitcast float* %tmp2 to <4 x float>*
1130 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
1131 %tmp7 = add i64 %tmp, 4
1132 %tmp8 = icmp eq i64 %tmp7, 1024
1133 br i1 %tmp8, label %bb9, label %bb1
; <8 x double> fadd + splatted 2.0: 512-bit double variant; vbroadcastsd is
; hoisted into zmm0 and the data load folds into vaddpd.
1139 define void @bcast_unfold_fadd_v8f64(double* nocapture %arg) {
1140 ; CHECK-LABEL: bcast_unfold_fadd_v8f64:
1141 ; CHECK: # %bb.0: # %bb
1142 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1143 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1144 ; CHECK-NEXT: .p2align 4, 0x90
1145 ; CHECK-NEXT: .LBB33_1: # %bb1
1146 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1147 ; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %zmm0, %zmm1
1148 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
1149 ; CHECK-NEXT: addq $64, %rax
1150 ; CHECK-NEXT: jne .LBB33_1
1151 ; CHECK-NEXT: # %bb.2: # %bb9
1152 ; CHECK-NEXT: vzeroupper
; 1024 doubles, 8 per iteration (64 bytes per step of rax).
1157 bb1: ; preds = %bb1, %bb
1158 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1159 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1160 %tmp3 = bitcast double* %tmp2 to <8 x double>*
1161 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
1162 %tmp5 = fadd <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1163 %tmp6 = bitcast double* %tmp2 to <8 x double>*
1164 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
1165 %tmp7 = add i64 %tmp, 8
1166 %tmp8 = icmp eq i64 %tmp7, 1024
1167 br i1 %tmp8, label %bb9, label %bb1
; <4 x double> fadd + splatted 2.0: 256-bit double variant; broadcast hoisted
; into ymm0, load folded into vaddpd.
1173 define void @bcast_unfold_fadd_v4f64(double* nocapture %arg) {
1174 ; CHECK-LABEL: bcast_unfold_fadd_v4f64:
1175 ; CHECK: # %bb.0: # %bb
1176 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1177 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1178 ; CHECK-NEXT: .p2align 4, 0x90
1179 ; CHECK-NEXT: .LBB34_1: # %bb1
1180 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1181 ; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %ymm0, %ymm1
1182 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1183 ; CHECK-NEXT: addq $32, %rax
1184 ; CHECK-NEXT: jne .LBB34_1
1185 ; CHECK-NEXT: # %bb.2: # %bb9
1186 ; CHECK-NEXT: vzeroupper
; 1024 doubles, 4 per iteration (32 bytes per step of rax).
1191 bb1: ; preds = %bb1, %bb
1192 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1193 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1194 %tmp3 = bitcast double* %tmp2 to <4 x double>*
1195 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
1196 %tmp5 = fadd <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1197 %tmp6 = bitcast double* %tmp2 to <4 x double>*
1198 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
1199 %tmp7 = add i64 %tmp, 4
1200 %tmp8 = icmp eq i64 %tmp7, 1024
1201 br i1 %tmp8, label %bb9, label %bb1
; <2 x double> fadd + splatted 2.0: 128-bit case uses a full constant-pool
; load (vmovapd) for the splat instead of a broadcast; load folds into vaddpd.
1207 define void @bcast_unfold_fadd_v2f64(double* nocapture %arg) {
1208 ; CHECK-LABEL: bcast_unfold_fadd_v2f64:
1209 ; CHECK: # %bb.0: # %bb
1210 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1211 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
1212 ; CHECK-NEXT: .p2align 4, 0x90
1213 ; CHECK-NEXT: .LBB35_1: # %bb1
1214 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1215 ; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %xmm0, %xmm1
1216 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1217 ; CHECK-NEXT: addq $16, %rax
1218 ; CHECK-NEXT: jne .LBB35_1
1219 ; CHECK-NEXT: # %bb.2: # %bb9
; 1024 doubles, 2 per iteration (16 bytes per step of rax).
1224 bb1: ; preds = %bb1, %bb
1225 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1226 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1227 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1228 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1229 %tmp5 = fadd <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
1230 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1231 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1232 %tmp7 = add i64 %tmp, 2
1233 %tmp8 = icmp eq i64 %tmp7, 1024
1234 br i1 %tmp8, label %bb9, label %bb1
; <16 x float> fmul with a splatted 3.0: broadcast hoisted into zmm0; since
; fmul is commutative the data load folds into vmulps just like fadd.
1240 define void @bcast_unfold_fmul_v16f32(float* nocapture %arg) {
1241 ; CHECK-LABEL: bcast_unfold_fmul_v16f32:
1242 ; CHECK: # %bb.0: # %bb
1243 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1244 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1245 ; CHECK-NEXT: .p2align 4, 0x90
1246 ; CHECK-NEXT: .LBB36_1: # %bb1
1247 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1248 ; CHECK-NEXT: vmulps 4096(%rdi,%rax), %zmm0, %zmm1
1249 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1250 ; CHECK-NEXT: addq $64, %rax
1251 ; CHECK-NEXT: jne .LBB36_1
1252 ; CHECK-NEXT: # %bb.2: # %bb9
1253 ; CHECK-NEXT: vzeroupper
; 1024 floats, 16 per iteration (64 bytes per step of rax).
1258 bb1: ; preds = %bb1, %bb
1259 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1260 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1261 %tmp3 = bitcast float* %tmp2 to <16 x float>*
1262 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
1263 %tmp5 = fmul <16 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
1264 %tmp6 = bitcast float* %tmp2 to <16 x float>*
1265 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
1266 %tmp7 = add i64 %tmp, 16
1267 %tmp8 = icmp eq i64 %tmp7, 1024
1268 br i1 %tmp8, label %bb9, label %bb1
; <8 x float> fmul + splatted 3.0: 256-bit variant; broadcast hoisted into
; ymm0, load folded into vmulps.
1274 define void @bcast_unfold_fmul_v8f32(float* nocapture %arg) {
1275 ; CHECK-LABEL: bcast_unfold_fmul_v8f32:
1276 ; CHECK: # %bb.0: # %bb
1277 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1278 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1279 ; CHECK-NEXT: .p2align 4, 0x90
1280 ; CHECK-NEXT: .LBB37_1: # %bb1
1281 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1282 ; CHECK-NEXT: vmulps 4096(%rdi,%rax), %ymm0, %ymm1
1283 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1284 ; CHECK-NEXT: addq $32, %rax
1285 ; CHECK-NEXT: jne .LBB37_1
1286 ; CHECK-NEXT: # %bb.2: # %bb9
1287 ; CHECK-NEXT: vzeroupper
; 1024 floats, 8 per iteration (32 bytes per step of rax).
1292 bb1: ; preds = %bb1, %bb
1293 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1294 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1295 %tmp3 = bitcast float* %tmp2 to <8 x float>*
1296 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
1297 %tmp5 = fmul <8 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
1298 %tmp6 = bitcast float* %tmp2 to <8 x float>*
1299 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
1300 %tmp7 = add i64 %tmp, 8
1301 %tmp8 = icmp eq i64 %tmp7, 1024
1302 br i1 %tmp8, label %bb9, label %bb1
; <4 x float> fmul + splatted 3.0: 128-bit variant; broadcast hoisted into
; xmm0, load folded into vmulps. No vzeroupper (xmm only).
1308 define void @bcast_unfold_fmul_v4f32(float* nocapture %arg) {
1309 ; CHECK-LABEL: bcast_unfold_fmul_v4f32:
1310 ; CHECK: # %bb.0: # %bb
1311 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1312 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1313 ; CHECK-NEXT: .p2align 4, 0x90
1314 ; CHECK-NEXT: .LBB38_1: # %bb1
1315 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1316 ; CHECK-NEXT: vmulps 4096(%rdi,%rax), %xmm0, %xmm1
1317 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1318 ; CHECK-NEXT: addq $16, %rax
1319 ; CHECK-NEXT: jne .LBB38_1
1320 ; CHECK-NEXT: # %bb.2: # %bb9
; 1024 floats, 4 per iteration (16 bytes per step of rax).
1325 bb1: ; preds = %bb1, %bb
1326 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1327 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1328 %tmp3 = bitcast float* %tmp2 to <4 x float>*
1329 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1330 %tmp5 = fmul <4 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
1331 %tmp6 = bitcast float* %tmp2 to <4 x float>*
1332 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
1333 %tmp7 = add i64 %tmp, 4
1334 %tmp8 = icmp eq i64 %tmp7, 1024
1335 br i1 %tmp8, label %bb9, label %bb1
; <8 x double> fmul + splatted 3.0: 512-bit double variant; vbroadcastsd
; hoisted into zmm0, load folded into vmulpd.
1341 define void @bcast_unfold_fmul_v8f64(double* nocapture %arg) {
1342 ; CHECK-LABEL: bcast_unfold_fmul_v8f64:
1343 ; CHECK: # %bb.0: # %bb
1344 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1345 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1346 ; CHECK-NEXT: .p2align 4, 0x90
1347 ; CHECK-NEXT: .LBB39_1: # %bb1
1348 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1349 ; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %zmm0, %zmm1
1350 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
1351 ; CHECK-NEXT: addq $64, %rax
1352 ; CHECK-NEXT: jne .LBB39_1
1353 ; CHECK-NEXT: # %bb.2: # %bb9
1354 ; CHECK-NEXT: vzeroupper
; 1024 doubles, 8 per iteration (64 bytes per step of rax).
1359 bb1: ; preds = %bb1, %bb
1360 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1361 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1362 %tmp3 = bitcast double* %tmp2 to <8 x double>*
1363 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
1364 %tmp5 = fmul <8 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
1365 %tmp6 = bitcast double* %tmp2 to <8 x double>*
1366 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
1367 %tmp7 = add i64 %tmp, 8
1368 %tmp8 = icmp eq i64 %tmp7, 1024
1369 br i1 %tmp8, label %bb9, label %bb1
; <4 x double> fmul + splatted 3.0: 256-bit double variant; broadcast hoisted
; into ymm0, load folded into vmulpd.
1375 define void @bcast_unfold_fmul_v4f64(double* nocapture %arg) {
1376 ; CHECK-LABEL: bcast_unfold_fmul_v4f64:
1377 ; CHECK: # %bb.0: # %bb
1378 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1379 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1380 ; CHECK-NEXT: .p2align 4, 0x90
1381 ; CHECK-NEXT: .LBB40_1: # %bb1
1382 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1383 ; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %ymm0, %ymm1
1384 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1385 ; CHECK-NEXT: addq $32, %rax
1386 ; CHECK-NEXT: jne .LBB40_1
1387 ; CHECK-NEXT: # %bb.2: # %bb9
1388 ; CHECK-NEXT: vzeroupper
; 1024 doubles, 4 per iteration (32 bytes per step of rax).
1393 bb1: ; preds = %bb1, %bb
1394 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1395 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1396 %tmp3 = bitcast double* %tmp2 to <4 x double>*
1397 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
1398 %tmp5 = fmul <4 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
1399 %tmp6 = bitcast double* %tmp2 to <4 x double>*
1400 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
1401 %tmp7 = add i64 %tmp, 4
1402 %tmp8 = icmp eq i64 %tmp7, 1024
1403 br i1 %tmp8, label %bb9, label %bb1
; <2 x double> fmul + splatted 3.0: 128-bit case uses a full constant-pool
; load (vmovapd) for the splat; the data load still folds into vmulpd.
1409 define void @bcast_unfold_fmul_v2f64(double* nocapture %arg) {
1410 ; CHECK-LABEL: bcast_unfold_fmul_v2f64:
1411 ; CHECK: # %bb.0: # %bb
1412 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1413 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
1414 ; CHECK-NEXT: .p2align 4, 0x90
1415 ; CHECK-NEXT: .LBB41_1: # %bb1
1416 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1417 ; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %xmm0, %xmm1
1418 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1419 ; CHECK-NEXT: addq $16, %rax
1420 ; CHECK-NEXT: jne .LBB41_1
1421 ; CHECK-NEXT: # %bb.2: # %bb9
; 1024 doubles, 2 per iteration (16 bytes per step of rax).
1426 bb1: ; preds = %bb1, %bb
1427 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1428 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1429 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1430 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1431 %tmp5 = fmul <2 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00>
1432 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1433 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1434 %tmp7 = add i64 %tmp, 2
1435 %tmp8 = icmp eq i64 %tmp7, 1024
1436 br i1 %tmp8, label %bb9, label %bb1
; <16 x float> fdiv by a splatted 2.0. Unlike fadd/fmul, the data load is NOT
; folded: vdivps computes src1/src2 and the loaded data is the dividend
; (src1), but only the src2 (divisor) operand can come from memory — so the
; loop emits a separate vmovups followed by a register-register vdivps.
1442 define void @bcast_unfold_fdiv_v16f32(float* nocapture %arg) {
1443 ; CHECK-LABEL: bcast_unfold_fdiv_v16f32:
1444 ; CHECK: # %bb.0: # %bb
1445 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1446 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1447 ; CHECK-NEXT: .p2align 4, 0x90
1448 ; CHECK-NEXT: .LBB42_1: # %bb1
1449 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1450 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
1451 ; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm1
1452 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1453 ; CHECK-NEXT: addq $64, %rax
1454 ; CHECK-NEXT: jne .LBB42_1
1455 ; CHECK-NEXT: # %bb.2: # %bb9
1456 ; CHECK-NEXT: vzeroupper
; 1024 floats, 16 per iteration (64 bytes per step of rax).
1461 bb1: ; preds = %bb1, %bb
1462 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1463 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1464 %tmp3 = bitcast float* %tmp2 to <16 x float>*
1465 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
1466 %tmp5 = fdiv <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1467 %tmp6 = bitcast float* %tmp2 to <16 x float>*
1468 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
1469 %tmp7 = add i64 %tmp, 16
1470 %tmp8 = icmp eq i64 %tmp7, 1024
1471 br i1 %tmp8, label %bb9, label %bb1
; <8 x float> fdiv by splatted 2.0: 256-bit variant. Data load stays separate
; (vmovups) because the loaded value is the dividend and cannot occupy the
; memory-operand (divisor) position of vdivps.
1477 define void @bcast_unfold_fdiv_v8f32(float* nocapture %arg) {
1478 ; CHECK-LABEL: bcast_unfold_fdiv_v8f32:
1479 ; CHECK: # %bb.0: # %bb
1480 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1481 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1482 ; CHECK-NEXT: .p2align 4, 0x90
1483 ; CHECK-NEXT: .LBB43_1: # %bb1
1484 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1485 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
1486 ; CHECK-NEXT: vdivps %ymm0, %ymm1, %ymm1
1487 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1488 ; CHECK-NEXT: addq $32, %rax
1489 ; CHECK-NEXT: jne .LBB43_1
1490 ; CHECK-NEXT: # %bb.2: # %bb9
1491 ; CHECK-NEXT: vzeroupper
; 1024 floats, 8 per iteration (32 bytes per step of rax).
1496 bb1: ; preds = %bb1, %bb
1497 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1498 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1499 %tmp3 = bitcast float* %tmp2 to <8 x float>*
1500 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
1501 %tmp5 = fdiv <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1502 %tmp6 = bitcast float* %tmp2 to <8 x float>*
1503 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
1504 %tmp7 = add i64 %tmp, 8
1505 %tmp8 = icmp eq i64 %tmp7, 1024
1506 br i1 %tmp8, label %bb9, label %bb1
; <4 x float> fdiv by splatted 2.0: 128-bit variant; separate load + vdivps
; as in the wider fdiv tests. No vzeroupper (xmm only).
1512 define void @bcast_unfold_fdiv_v4f32(float* nocapture %arg) {
1513 ; CHECK-LABEL: bcast_unfold_fdiv_v4f32:
1514 ; CHECK: # %bb.0: # %bb
1515 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1516 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1517 ; CHECK-NEXT: .p2align 4, 0x90
1518 ; CHECK-NEXT: .LBB44_1: # %bb1
1519 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1520 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
1521 ; CHECK-NEXT: vdivps %xmm0, %xmm1, %xmm1
1522 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1523 ; CHECK-NEXT: addq $16, %rax
1524 ; CHECK-NEXT: jne .LBB44_1
1525 ; CHECK-NEXT: # %bb.2: # %bb9
; 1024 floats, 4 per iteration (16 bytes per step of rax).
1530 bb1: ; preds = %bb1, %bb
1531 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1532 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1533 %tmp3 = bitcast float* %tmp2 to <4 x float>*
1534 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1535 %tmp5 = fdiv <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1536 %tmp6 = bitcast float* %tmp2 to <4 x float>*
1537 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
1538 %tmp7 = add i64 %tmp, 4
1539 %tmp8 = icmp eq i64 %tmp7, 1024
1540 br i1 %tmp8, label %bb9, label %bb1
; <8 x double> fdiv by splatted 2.0: 512-bit double variant; separate vmovupd
; load + register vdivpd, since the loaded data is the dividend.
1546 define void @bcast_unfold_fdiv_v8f64(double* nocapture %arg) {
1547 ; CHECK-LABEL: bcast_unfold_fdiv_v8f64:
1548 ; CHECK: # %bb.0: # %bb
1549 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1550 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1551 ; CHECK-NEXT: .p2align 4, 0x90
1552 ; CHECK-NEXT: .LBB45_1: # %bb1
1553 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1554 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
1555 ; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm1
1556 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
1557 ; CHECK-NEXT: addq $64, %rax
1558 ; CHECK-NEXT: jne .LBB45_1
1559 ; CHECK-NEXT: # %bb.2: # %bb9
1560 ; CHECK-NEXT: vzeroupper
; 1024 doubles, 8 per iteration (64 bytes per step of rax).
1565 bb1: ; preds = %bb1, %bb
1566 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1567 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1568 %tmp3 = bitcast double* %tmp2 to <8 x double>*
1569 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
1570 %tmp5 = fdiv <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1571 %tmp6 = bitcast double* %tmp2 to <8 x double>*
1572 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
1573 %tmp7 = add i64 %tmp, 8
1574 %tmp8 = icmp eq i64 %tmp7, 1024
1575 br i1 %tmp8, label %bb9, label %bb1
; <4 x double> fdiv by splatted 2.0: 256-bit double variant; separate vmovupd
; load + register vdivpd, since the loaded data is the dividend.
1581 define void @bcast_unfold_fdiv_v4f64(double* nocapture %arg) {
1582 ; CHECK-LABEL: bcast_unfold_fdiv_v4f64:
1583 ; CHECK: # %bb.0: # %bb
1584 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1585 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1586 ; CHECK-NEXT: .p2align 4, 0x90
1587 ; CHECK-NEXT: .LBB46_1: # %bb1
1588 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1589 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
1590 ; CHECK-NEXT: vdivpd %ymm0, %ymm1, %ymm1
1591 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1592 ; CHECK-NEXT: addq $32, %rax
1593 ; CHECK-NEXT: jne .LBB46_1
1594 ; CHECK-NEXT: # %bb.2: # %bb9
1595 ; CHECK-NEXT: vzeroupper
; 1024 doubles, 4 per iteration (32 bytes per step of rax).
1600 bb1: ; preds = %bb1, %bb
1601 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1602 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1603 %tmp3 = bitcast double* %tmp2 to <4 x double>*
1604 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
1605 %tmp5 = fdiv <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1606 %tmp6 = bitcast double* %tmp2 to <4 x double>*
1607 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
1608 %tmp7 = add i64 %tmp, 4
1609 %tmp8 = icmp eq i64 %tmp7, 1024
1610 br i1 %tmp8, label %bb9, label %bb1
1616 define void @bcast_unfold_fdiv_v2f64(double* nocapture %arg) {
1617 ; CHECK-LABEL: bcast_unfold_fdiv_v2f64:
1618 ; CHECK: # %bb.0: # %bb
1619 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1620 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
1621 ; CHECK-NEXT: .p2align 4, 0x90
1622 ; CHECK-NEXT: .LBB47_1: # %bb1
1623 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1624 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
1625 ; CHECK-NEXT: vdivpd %xmm0, %xmm1, %xmm1
1626 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1627 ; CHECK-NEXT: addq $16, %rax
1628 ; CHECK-NEXT: jne .LBB47_1
1629 ; CHECK-NEXT: # %bb.2: # %bb9
1634 bb1: ; preds = %bb1, %bb
1635 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1636 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1637 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1638 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1639 %tmp5 = fdiv <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
1640 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1641 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1642 %tmp7 = add i64 %tmp, 2
1643 %tmp8 = icmp eq i64 %tmp7, 1024
1644 br i1 %tmp8, label %bb9, label %bb1