1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=avx512vl | FileCheck %s
4 ; Test that we can unfold constant pool loads when we're using avx512's
5 ; ability to fold a broadcast load into an operation.
7 define void @bcast_unfold_add_v16i32(i32* %arg) {
8 ; CHECK-LABEL: bcast_unfold_add_v16i32:
9 ; CHECK: # %bb.0: # %bb
10 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
11 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
12 ; CHECK-NEXT: .p2align 4, 0x90
13 ; CHECK-NEXT: .LBB0_1: # %bb2
14 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
15 ; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %zmm0, %zmm1
16 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
17 ; CHECK-NEXT: addq $64, %rax
18 ; CHECK-NEXT: jne .LBB0_1
19 ; CHECK-NEXT: # %bb.2: # %bb10
20 ; CHECK-NEXT: vzeroupper
25 bb2: ; preds = %bb2, %bb
26 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
27 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
28 %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
29 %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
30 %tmp6 = add nsw <16 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
31 %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
32 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
33 %tmp8 = add i64 %tmp, 16
34 %tmp9 = icmp eq i64 %tmp8, 1024
35 br i1 %tmp9, label %bb10, label %bb2
; Same as v16i32 test but at 256-bit width: splat(2) hoisted as a ymm
; broadcast, vpaddd folds the per-iteration load.
41 define void @bcast_unfold_add_v8i32(i32* %arg) {
42 ; CHECK-LABEL: bcast_unfold_add_v8i32:
43 ; CHECK: # %bb.0: # %bb
44 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
45 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
46 ; CHECK-NEXT: .p2align 4, 0x90
47 ; CHECK-NEXT: .LBB1_1: # %bb2
48 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
49 ; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %ymm0, %ymm1
50 ; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
51 ; CHECK-NEXT: addq $32, %rax
52 ; CHECK-NEXT: jne .LBB1_1
53 ; CHECK-NEXT: # %bb.2: # %bb10
54 ; CHECK-NEXT: vzeroupper
; Loop body: load <8 x i32>, add splat(2), store in place; 8 lanes/iter.
59 bb2: ; preds = %bb2, %bb
60 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
61 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
62 %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
63 %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
64 %tmp6 = add nsw <8 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
65 %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
66 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
67 %tmp8 = add i64 %tmp, 8
68 %tmp9 = icmp eq i64 %tmp8, 1024
69 br i1 %tmp9, label %bb10, label %bb2
; 128-bit variant: splat(2) hoisted as an xmm broadcast; no vzeroupper is
; expected since only xmm registers are touched.
75 define void @bcast_unfold_add_v4i32(i32* %arg) {
76 ; CHECK-LABEL: bcast_unfold_add_v4i32:
77 ; CHECK: # %bb.0: # %bb
78 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
79 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
80 ; CHECK-NEXT: .p2align 4, 0x90
81 ; CHECK-NEXT: .LBB2_1: # %bb2
82 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
83 ; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %xmm0, %xmm1
84 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
85 ; CHECK-NEXT: addq $16, %rax
86 ; CHECK-NEXT: jne .LBB2_1
87 ; CHECK-NEXT: # %bb.2: # %bb10
; Loop body: load <4 x i32>, add splat(2), store in place; 4 lanes/iter.
92 bb2: ; preds = %bb2, %bb
93 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
94 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
95 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
96 %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
97 %tmp6 = add nsw <4 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2>
98 %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
99 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
100 %tmp8 = add i64 %tmp, 4
101 %tmp9 = icmp eq i64 %tmp8, 1024
102 br i1 %tmp9, label %bb10, label %bb2
; i64 element variant: splat(2) hoisted via vpbroadcastq zmm0, per-iteration
; load folded into vpaddq.
108 define void @bcast_unfold_add_v8i64(i64* %arg) {
109 ; CHECK-LABEL: bcast_unfold_add_v8i64:
110 ; CHECK: # %bb.0: # %bb
111 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
112 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
113 ; CHECK-NEXT: .p2align 4, 0x90
114 ; CHECK-NEXT: .LBB3_1: # %bb2
115 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
116 ; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %zmm0, %zmm1
117 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
118 ; CHECK-NEXT: addq $64, %rax
119 ; CHECK-NEXT: jne .LBB3_1
120 ; CHECK-NEXT: # %bb.2: # %bb10
121 ; CHECK-NEXT: vzeroupper
; Loop body: load <8 x i64>, add splat(2), store in place; 8 lanes/iter.
126 bb2: ; preds = %bb2, %bb
127 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
128 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
129 %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
130 %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
131 %tmp6 = add nsw <8 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
132 %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
133 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
134 %tmp8 = add i64 %tmp, 8
135 %tmp9 = icmp eq i64 %tmp8, 1024
136 br i1 %tmp9, label %bb10, label %bb2
; 256-bit i64 variant: vpbroadcastq ymm0 hoisted, vpaddq folds the load.
142 define void @bcast_unfold_add_v4i64(i64* %arg) {
143 ; CHECK-LABEL: bcast_unfold_add_v4i64:
144 ; CHECK: # %bb.0: # %bb
145 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
146 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
147 ; CHECK-NEXT: .p2align 4, 0x90
148 ; CHECK-NEXT: .LBB4_1: # %bb2
149 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
150 ; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %ymm0, %ymm1
151 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
152 ; CHECK-NEXT: addq $32, %rax
153 ; CHECK-NEXT: jne .LBB4_1
154 ; CHECK-NEXT: # %bb.2: # %bb10
155 ; CHECK-NEXT: vzeroupper
; Loop body: load <4 x i64>, add splat(2), store in place; 4 lanes/iter.
160 bb2: ; preds = %bb2, %bb
161 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
162 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
163 %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
164 %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
165 %tmp6 = add nsw <4 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2>
166 %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
167 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
168 %tmp8 = add i64 %tmp, 4
169 %tmp9 = icmp eq i64 %tmp8, 1024
170 br i1 %tmp9, label %bb10, label %bb2
; 128-bit <2 x i64> case: note the constant is materialized with a plain
; vmovdqa constant-pool load ([2,2]) rather than a broadcast, but it is still
; hoisted out of the loop so vpaddq can fold the per-iteration load.
176 define void @bcast_unfold_add_v2i64(i64* %arg) {
177 ; CHECK-LABEL: bcast_unfold_add_v2i64:
178 ; CHECK: # %bb.0: # %bb
179 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
180 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
181 ; CHECK-NEXT: .p2align 4, 0x90
182 ; CHECK-NEXT: .LBB5_1: # %bb2
183 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
184 ; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %xmm0, %xmm1
185 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
186 ; CHECK-NEXT: addq $16, %rax
187 ; CHECK-NEXT: jne .LBB5_1
188 ; CHECK-NEXT: # %bb.2: # %bb10
; Loop body: load <2 x i64>, add splat(2), store in place; 2 lanes/iter.
193 bb2: ; preds = %bb2, %bb
194 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
195 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
196 %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
197 %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
198 %tmp6 = add nsw <2 x i64> %tmp5, <i64 2, i64 2>
199 %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
200 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
201 %tmp8 = add i64 %tmp, 2
202 %tmp9 = icmp eq i64 %tmp8, 1024
203 br i1 %tmp9, label %bb10, label %bb2
; mul by splat(3) over <16 x i32>: splat hoisted via vpbroadcastd zmm0,
; vpmulld folds the per-iteration load.
209 define void @bcast_unfold_mul_v16i32(i32* %arg) {
210 ; CHECK-LABEL: bcast_unfold_mul_v16i32:
211 ; CHECK: # %bb.0: # %bb
212 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
213 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
214 ; CHECK-NEXT: .p2align 4, 0x90
215 ; CHECK-NEXT: .LBB6_1: # %bb2
216 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
217 ; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %zmm0, %zmm1
218 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
219 ; CHECK-NEXT: addq $64, %rax
220 ; CHECK-NEXT: jne .LBB6_1
221 ; CHECK-NEXT: # %bb.2: # %bb10
222 ; CHECK-NEXT: vzeroupper
; Loop body: load <16 x i32>, mul by splat(3), store in place.
227 bb2: ; preds = %bb2, %bb
228 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
229 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
230 %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
231 %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
232 %tmp6 = mul nsw <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
233 %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
234 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
235 %tmp8 = add i64 %tmp, 16
236 %tmp9 = icmp eq i64 %tmp8, 1024
237 br i1 %tmp9, label %bb10, label %bb2
; 256-bit mul-by-3 variant: vpbroadcastd ymm0 hoisted, vpmulld folds the load.
243 define void @bcast_unfold_mul_v8i32(i32* %arg) {
244 ; CHECK-LABEL: bcast_unfold_mul_v8i32:
245 ; CHECK: # %bb.0: # %bb
246 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
247 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
248 ; CHECK-NEXT: .p2align 4, 0x90
249 ; CHECK-NEXT: .LBB7_1: # %bb2
250 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
251 ; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %ymm0, %ymm1
252 ; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
253 ; CHECK-NEXT: addq $32, %rax
254 ; CHECK-NEXT: jne .LBB7_1
255 ; CHECK-NEXT: # %bb.2: # %bb10
256 ; CHECK-NEXT: vzeroupper
; Loop body: load <8 x i32>, mul by splat(3), store in place.
261 bb2: ; preds = %bb2, %bb
262 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
263 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
264 %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
265 %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
266 %tmp6 = mul nsw <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
267 %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
268 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
269 %tmp8 = add i64 %tmp, 8
270 %tmp9 = icmp eq i64 %tmp8, 1024
271 br i1 %tmp9, label %bb10, label %bb2
; 128-bit mul-by-3 variant: vpbroadcastd xmm0 hoisted; no vzeroupper needed.
277 define void @bcast_unfold_mul_v4i32(i32* %arg) {
278 ; CHECK-LABEL: bcast_unfold_mul_v4i32:
279 ; CHECK: # %bb.0: # %bb
280 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
281 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,3,3,3]
282 ; CHECK-NEXT: .p2align 4, 0x90
283 ; CHECK-NEXT: .LBB8_1: # %bb2
284 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
285 ; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %xmm0, %xmm1
286 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
287 ; CHECK-NEXT: addq $16, %rax
288 ; CHECK-NEXT: jne .LBB8_1
289 ; CHECK-NEXT: # %bb.2: # %bb10
; Loop body: load <4 x i32>, mul by splat(3), store in place.
294 bb2: ; preds = %bb2, %bb
295 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
296 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
297 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
298 %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
299 %tmp6 = mul nsw <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
300 %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
301 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
302 %tmp8 = add i64 %tmp, 4
303 %tmp9 = icmp eq i64 %tmp8, 1024
304 br i1 %tmp9, label %bb10, label %bb2
; i64 mul-by-3: no broadcast appears here — the codegen in the CHECK lines
; decomposes x*3 into vpaddq x,x (=2x) followed by vpaddq 2x,x, so each
; iteration reloads the vector and no loop-invariant constant is hoisted.
; (Presumably because a native 512-bit vpmullq is not available with only
; avx512vl in the RUN line — TODO confirm against the target features.)
310 define void @bcast_unfold_mul_v8i64(i64* %arg) {
311 ; CHECK-LABEL: bcast_unfold_mul_v8i64:
312 ; CHECK: # %bb.0: # %bb
313 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
314 ; CHECK-NEXT: .p2align 4, 0x90
315 ; CHECK-NEXT: .LBB9_1: # %bb2
316 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
317 ; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
318 ; CHECK-NEXT: vpaddq %zmm0, %zmm0, %zmm1
319 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
320 ; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
321 ; CHECK-NEXT: addq $64, %rax
322 ; CHECK-NEXT: jne .LBB9_1
323 ; CHECK-NEXT: # %bb.2: # %bb10
324 ; CHECK-NEXT: vzeroupper
; Loop body: load <8 x i64>, mul by splat(3), store in place.
329 bb2: ; preds = %bb2, %bb
330 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
331 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
332 %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
333 %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
334 %tmp6 = mul nsw <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
335 %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
336 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
337 %tmp8 = add i64 %tmp, 8
338 %tmp9 = icmp eq i64 %tmp8, 1024
339 br i1 %tmp9, label %bb10, label %bb2
; 256-bit i64 mul-by-3: same add-add decomposition as the v8i64 case
; (x + (x+x)); no constant is broadcast or hoisted.
345 define void @bcast_unfold_mul_v4i64(i64* %arg) {
346 ; CHECK-LABEL: bcast_unfold_mul_v4i64:
347 ; CHECK: # %bb.0: # %bb
348 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
349 ; CHECK-NEXT: .p2align 4, 0x90
350 ; CHECK-NEXT: .LBB10_1: # %bb2
351 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
352 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
353 ; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm1
354 ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
355 ; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
356 ; CHECK-NEXT: addq $32, %rax
357 ; CHECK-NEXT: jne .LBB10_1
358 ; CHECK-NEXT: # %bb.2: # %bb10
359 ; CHECK-NEXT: vzeroupper
; Loop body: load <4 x i64>, mul by splat(3), store in place.
364 bb2: ; preds = %bb2, %bb
365 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
366 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
367 %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
368 %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
369 %tmp6 = mul nsw <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
370 %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
371 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
372 %tmp8 = add i64 %tmp, 4
373 %tmp9 = icmp eq i64 %tmp8, 1024
374 br i1 %tmp9, label %bb10, label %bb2
; 128-bit i64 mul-by-3: add-add decomposition in xmm; no broadcast hoisted,
; and no vzeroupper since only xmm registers are used.
380 define void @bcast_unfold_mul_v2i64(i64* %arg) {
381 ; CHECK-LABEL: bcast_unfold_mul_v2i64:
382 ; CHECK: # %bb.0: # %bb
383 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
384 ; CHECK-NEXT: .p2align 4, 0x90
385 ; CHECK-NEXT: .LBB11_1: # %bb2
386 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
387 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm0
388 ; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm1
389 ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
390 ; CHECK-NEXT: vmovdqu %xmm0, 8192(%rdi,%rax)
391 ; CHECK-NEXT: addq $16, %rax
392 ; CHECK-NEXT: jne .LBB11_1
393 ; CHECK-NEXT: # %bb.2: # %bb10
; Loop body: load <2 x i64>, mul by splat(3), store in place.
398 bb2: ; preds = %bb2, %bb
399 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
400 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
401 %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
402 %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
403 %tmp6 = mul nsw <2 x i64> %tmp5, <i64 3, i64 3>
404 %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
405 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
406 %tmp8 = add i64 %tmp, 2
407 %tmp9 = icmp eq i64 %tmp8, 1024
408 br i1 %tmp9, label %bb10, label %bb2
; or with splat(3) over <16 x i32>: splat hoisted via vpbroadcastd zmm0,
; the EVEX vpord folds the per-iteration load.
414 define void @bcast_unfold_or_v16i32(i32* %arg) {
415 ; CHECK-LABEL: bcast_unfold_or_v16i32:
416 ; CHECK: # %bb.0: # %bb
417 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
418 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
419 ; CHECK-NEXT: .p2align 4, 0x90
420 ; CHECK-NEXT: .LBB12_1: # %bb2
421 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
422 ; CHECK-NEXT: vpord 4096(%rdi,%rax), %zmm0, %zmm1
423 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
424 ; CHECK-NEXT: addq $64, %rax
425 ; CHECK-NEXT: jne .LBB12_1
426 ; CHECK-NEXT: # %bb.2: # %bb10
427 ; CHECK-NEXT: vzeroupper
; Loop body: load <16 x i32>, or with splat(3), store in place.
432 bb2: ; preds = %bb2, %bb
433 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
434 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
435 %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
436 %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
437 %tmp6 = or <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
438 %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
439 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
440 %tmp8 = add i64 %tmp, 16
441 %tmp9 = icmp eq i64 %tmp8, 1024
442 br i1 %tmp9, label %bb10, label %bb2
; 256-bit or-with-3: at this width the CHECK lines use the FP-domain forms
; (vbroadcastss / vorps / vmovups) rather than the integer EVEX ops.
448 define void @bcast_unfold_or_v8i32(i32* %arg) {
449 ; CHECK-LABEL: bcast_unfold_or_v8i32:
450 ; CHECK: # %bb.0: # %bb
451 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
452 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
453 ; CHECK-NEXT: .p2align 4, 0x90
454 ; CHECK-NEXT: .LBB13_1: # %bb2
455 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
456 ; CHECK-NEXT: vorps 4096(%rdi,%rax), %ymm0, %ymm1
457 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
458 ; CHECK-NEXT: addq $32, %rax
459 ; CHECK-NEXT: jne .LBB13_1
460 ; CHECK-NEXT: # %bb.2: # %bb10
461 ; CHECK-NEXT: vzeroupper
; Loop body: load <8 x i32>, or with splat(3), store in place.
466 bb2: ; preds = %bb2, %bb
467 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
468 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
469 %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
470 %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
471 %tmp6 = or <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
472 %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
473 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
474 %tmp8 = add i64 %tmp, 8
475 %tmp9 = icmp eq i64 %tmp8, 1024
476 br i1 %tmp9, label %bb10, label %bb2
; 128-bit or-with-3: FP-domain vbroadcastss/vorps as in the v8i32 case;
; no vzeroupper since only xmm is used.
482 define void @bcast_unfold_or_v4i32(i32* %arg) {
483 ; CHECK-LABEL: bcast_unfold_or_v4i32:
484 ; CHECK: # %bb.0: # %bb
485 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
486 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
487 ; CHECK-NEXT: .p2align 4, 0x90
488 ; CHECK-NEXT: .LBB14_1: # %bb2
489 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
490 ; CHECK-NEXT: vorps 4096(%rdi,%rax), %xmm0, %xmm1
491 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
492 ; CHECK-NEXT: addq $16, %rax
493 ; CHECK-NEXT: jne .LBB14_1
494 ; CHECK-NEXT: # %bb.2: # %bb10
; Loop body: load <4 x i32>, or with splat(3), store in place.
499 bb2: ; preds = %bb2, %bb
500 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
501 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
502 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
503 %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
504 %tmp6 = or <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
505 %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
506 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
507 %tmp8 = add i64 %tmp, 4
508 %tmp9 = icmp eq i64 %tmp8, 1024
509 br i1 %tmp9, label %bb10, label %bb2
; 512-bit i64 or-with-3: vpbroadcastq zmm0 hoisted, vporq folds the load.
515 define void @bcast_unfold_or_v8i64(i64* %arg) {
516 ; CHECK-LABEL: bcast_unfold_or_v8i64:
517 ; CHECK: # %bb.0: # %bb
518 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
519 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3]
520 ; CHECK-NEXT: .p2align 4, 0x90
521 ; CHECK-NEXT: .LBB15_1: # %bb2
522 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
523 ; CHECK-NEXT: vporq 8192(%rdi,%rax), %zmm0, %zmm1
524 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
525 ; CHECK-NEXT: addq $64, %rax
526 ; CHECK-NEXT: jne .LBB15_1
527 ; CHECK-NEXT: # %bb.2: # %bb10
528 ; CHECK-NEXT: vzeroupper
; Loop body: load <8 x i64>, or with splat(3), store in place.
533 bb2: ; preds = %bb2, %bb
534 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
535 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
536 %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
537 %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
538 %tmp6 = or <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
539 %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
540 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
541 %tmp8 = add i64 %tmp, 8
542 %tmp9 = icmp eq i64 %tmp8, 1024
543 br i1 %tmp9, label %bb10, label %bb2
; 256-bit i64 or-with-3: FP-domain vbroadcastsd/vorps forms are used here.
549 define void @bcast_unfold_or_v4i64(i64* %arg) {
550 ; CHECK-LABEL: bcast_unfold_or_v4i64:
551 ; CHECK: # %bb.0: # %bb
552 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
553 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3]
554 ; CHECK-NEXT: .p2align 4, 0x90
555 ; CHECK-NEXT: .LBB16_1: # %bb2
556 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
557 ; CHECK-NEXT: vorps 8192(%rdi,%rax), %ymm0, %ymm1
558 ; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
559 ; CHECK-NEXT: addq $32, %rax
560 ; CHECK-NEXT: jne .LBB16_1
561 ; CHECK-NEXT: # %bb.2: # %bb10
562 ; CHECK-NEXT: vzeroupper
; Loop body: load <4 x i64>, or with splat(3), store in place.
567 bb2: ; preds = %bb2, %bb
568 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
569 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
570 %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
571 %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
572 %tmp6 = or <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
573 %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
574 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
575 %tmp8 = add i64 %tmp, 4
576 %tmp9 = icmp eq i64 %tmp8, 1024
577 br i1 %tmp9, label %bb10, label %bb2
; 128-bit <2 x i64> or-with-3: constant comes from a plain vmovaps
; constant-pool load ([3,3]) rather than a broadcast; still hoisted so vorps
; can fold the per-iteration load.
583 define void @bcast_unfold_or_v2i64(i64* %arg) {
584 ; CHECK-LABEL: bcast_unfold_or_v2i64:
585 ; CHECK: # %bb.0: # %bb
586 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
587 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [3,3]
588 ; CHECK-NEXT: .p2align 4, 0x90
589 ; CHECK-NEXT: .LBB17_1: # %bb2
590 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
591 ; CHECK-NEXT: vorps 8192(%rdi,%rax), %xmm0, %xmm1
592 ; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
593 ; CHECK-NEXT: addq $16, %rax
594 ; CHECK-NEXT: jne .LBB17_1
595 ; CHECK-NEXT: # %bb.2: # %bb10
; Loop body: load <2 x i64>, or with splat(3), store in place.
600 bb2: ; preds = %bb2, %bb
601 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
602 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
603 %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
604 %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
605 %tmp6 = or <2 x i64> %tmp5, <i64 3, i64 3>
606 %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
607 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
608 %tmp8 = add i64 %tmp, 2
609 %tmp9 = icmp eq i64 %tmp8, 1024
610 br i1 %tmp9, label %bb10, label %bb2
; fneg over <16 x float>: lowered as a sign-bit XOR — the -0.0 sign-mask is
; hoisted via vpbroadcastd zmm0 and vpxord folds the per-iteration load.
616 define void @bcast_unfold_fneg_v16f32(float* %arg) {
617 ; CHECK-LABEL: bcast_unfold_fneg_v16f32:
618 ; CHECK: # %bb.0: # %bb
619 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
620 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
621 ; CHECK-NEXT: .p2align 4, 0x90
622 ; CHECK-NEXT: .LBB18_1: # %bb1
623 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
624 ; CHECK-NEXT: vpxord 4096(%rdi,%rax), %zmm0, %zmm1
625 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
626 ; CHECK-NEXT: addq $64, %rax
627 ; CHECK-NEXT: jne .LBB18_1
628 ; CHECK-NEXT: # %bb.2: # %bb9
629 ; CHECK-NEXT: vzeroupper
; Loop body: load <16 x float>, negate, store in place.
634 bb1: ; preds = %bb1, %bb
635 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
636 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
637 %tmp3 = bitcast float* %tmp2 to <16 x float>*
638 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
639 %tmp5 = fneg <16 x float> %tmp4
640 %tmp6 = bitcast float* %tmp2 to <16 x float>*
641 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
642 %tmp7 = add i64 %tmp, 16
643 %tmp8 = icmp eq i64 %tmp7, 1024
644 br i1 %tmp8, label %bb9, label %bb1
; 256-bit fneg: sign-bit XOR with FP-domain forms (vbroadcastss / vxorps).
650 define void @bcast_unfold_fneg_v8f32(float* %arg) {
651 ; CHECK-LABEL: bcast_unfold_fneg_v8f32:
652 ; CHECK: # %bb.0: # %bb
653 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
654 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
655 ; CHECK-NEXT: .p2align 4, 0x90
656 ; CHECK-NEXT: .LBB19_1: # %bb1
657 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
658 ; CHECK-NEXT: vxorps 4096(%rdi,%rax), %ymm0, %ymm1
659 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
660 ; CHECK-NEXT: addq $32, %rax
661 ; CHECK-NEXT: jne .LBB19_1
662 ; CHECK-NEXT: # %bb.2: # %bb9
663 ; CHECK-NEXT: vzeroupper
; Loop body: load <8 x float>, negate, store in place.
668 bb1: ; preds = %bb1, %bb
669 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
670 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
671 %tmp3 = bitcast float* %tmp2 to <8 x float>*
672 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
673 %tmp5 = fneg <8 x float> %tmp4
674 %tmp6 = bitcast float* %tmp2 to <8 x float>*
675 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
676 %tmp7 = add i64 %tmp, 8
677 %tmp8 = icmp eq i64 %tmp7, 1024
678 br i1 %tmp8, label %bb9, label %bb1
; 128-bit fneg: sign-bit XOR in xmm; no vzeroupper needed.
684 define void @bcast_unfold_fneg_v4f32(float* %arg) {
685 ; CHECK-LABEL: bcast_unfold_fneg_v4f32:
686 ; CHECK: # %bb.0: # %bb
687 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
688 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
689 ; CHECK-NEXT: .p2align 4, 0x90
690 ; CHECK-NEXT: .LBB20_1: # %bb1
691 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
692 ; CHECK-NEXT: vxorps 4096(%rdi,%rax), %xmm0, %xmm1
693 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
694 ; CHECK-NEXT: addq $16, %rax
695 ; CHECK-NEXT: jne .LBB20_1
696 ; CHECK-NEXT: # %bb.2: # %bb9
; Loop body: load <4 x float>, negate, store in place.
701 bb1: ; preds = %bb1, %bb
702 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
703 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
704 %tmp3 = bitcast float* %tmp2 to <4 x float>*
705 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
706 %tmp5 = fneg <4 x float> %tmp4
707 %tmp6 = bitcast float* %tmp2 to <4 x float>*
708 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
709 %tmp7 = add i64 %tmp, 4
710 %tmp8 = icmp eq i64 %tmp7, 1024
711 br i1 %tmp8, label %bb9, label %bb1
; double fneg at 512-bit: sign-bit XOR with vpbroadcastq -0.0 and vpxorq.
717 define void @bcast_unfold_fneg_v8f64(double* %arg) {
718 ; CHECK-LABEL: bcast_unfold_fneg_v8f64:
719 ; CHECK: # %bb.0: # %bb
720 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
721 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
722 ; CHECK-NEXT: .p2align 4, 0x90
723 ; CHECK-NEXT: .LBB21_1: # %bb1
724 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
725 ; CHECK-NEXT: vpxorq 8192(%rdi,%rax), %zmm0, %zmm1
726 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
727 ; CHECK-NEXT: addq $64, %rax
728 ; CHECK-NEXT: jne .LBB21_1
729 ; CHECK-NEXT: # %bb.2: # %bb9
730 ; CHECK-NEXT: vzeroupper
; Loop body: load <8 x double>, negate, store in place.
735 bb1: ; preds = %bb1, %bb
736 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
737 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
738 %tmp3 = bitcast double* %tmp2 to <8 x double>*
739 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
740 %tmp5 = fneg <8 x double> %tmp4
741 %tmp6 = bitcast double* %tmp2 to <8 x double>*
742 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
743 %tmp7 = add i64 %tmp, 8
744 %tmp8 = icmp eq i64 %tmp7, 1024
745 br i1 %tmp8, label %bb9, label %bb1
; 256-bit double fneg: FP-domain vbroadcastsd + vxorps sign-bit flip.
751 define void @bcast_unfold_fneg_v4f64(double* %arg) {
752 ; CHECK-LABEL: bcast_unfold_fneg_v4f64:
753 ; CHECK: # %bb.0: # %bb
754 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
755 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
756 ; CHECK-NEXT: .p2align 4, 0x90
757 ; CHECK-NEXT: .LBB22_1: # %bb1
758 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
759 ; CHECK-NEXT: vxorps 8192(%rdi,%rax), %ymm0, %ymm1
760 ; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
761 ; CHECK-NEXT: addq $32, %rax
762 ; CHECK-NEXT: jne .LBB22_1
763 ; CHECK-NEXT: # %bb.2: # %bb9
764 ; CHECK-NEXT: vzeroupper
; Loop body: load <4 x double>, negate, store in place.
769 bb1: ; preds = %bb1, %bb
770 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
771 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
772 %tmp3 = bitcast double* %tmp2 to <4 x double>*
773 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
774 %tmp5 = fneg <4 x double> %tmp4
775 %tmp6 = bitcast double* %tmp2 to <4 x double>*
776 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
777 %tmp7 = add i64 %tmp, 4
778 %tmp8 = icmp eq i64 %tmp7, 1024
779 br i1 %tmp8, label %bb9, label %bb1
; 128-bit <2 x double> fneg: sign-mask comes from a plain vmovaps
; constant-pool load ([-0.0,-0.0]) rather than a broadcast.
785 define void @bcast_unfold_fneg_v2f64(double* %arg) {
786 ; CHECK-LABEL: bcast_unfold_fneg_v2f64:
787 ; CHECK: # %bb.0: # %bb
788 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
789 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0]
790 ; CHECK-NEXT: .p2align 4, 0x90
791 ; CHECK-NEXT: .LBB23_1: # %bb1
792 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
793 ; CHECK-NEXT: vxorps 8192(%rdi,%rax), %xmm0, %xmm1
794 ; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
795 ; CHECK-NEXT: addq $16, %rax
796 ; CHECK-NEXT: jne .LBB23_1
797 ; CHECK-NEXT: # %bb.2: # %bb9
; Loop body: load <2 x double>, negate, store in place.
802 bb1: ; preds = %bb1, %bb
803 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
804 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
805 %tmp3 = bitcast double* %tmp2 to <2 x double>*
806 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
807 %tmp5 = fneg <2 x double> %tmp4
808 %tmp6 = bitcast double* %tmp2 to <2 x double>*
809 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
810 %tmp7 = add i64 %tmp, 2
811 %tmp8 = icmp eq i64 %tmp7, 1024
812 br i1 %tmp8, label %bb9, label %bb1
; fabs over <16 x float>: lowered as an AND with a broadcast sign-clearing
; mask (the mask value prints as NaN in the CHECK line), folded into vpandd.
818 define void @bcast_unfold_fabs_v16f32(float* %arg) {
819 ; CHECK-LABEL: bcast_unfold_fabs_v16f32:
820 ; CHECK: # %bb.0: # %bb
821 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
822 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
823 ; CHECK-NEXT: .p2align 4, 0x90
824 ; CHECK-NEXT: .LBB24_1: # %bb1
825 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
826 ; CHECK-NEXT: vpandd 4096(%rdi,%rax), %zmm0, %zmm1
827 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
828 ; CHECK-NEXT: addq $64, %rax
829 ; CHECK-NEXT: jne .LBB24_1
830 ; CHECK-NEXT: # %bb.2: # %bb9
831 ; CHECK-NEXT: vzeroupper
; Loop body: load <16 x float>, fabs via intrinsic, store in place.
836 bb1: ; preds = %bb1, %bb
837 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
838 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
839 %tmp3 = bitcast float* %tmp2 to <16 x float>*
840 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
841 %tmp5 = call <16 x float> @llvm.fabs.v16f32(<16 x float> %tmp4)
842 %tmp6 = bitcast float* %tmp2 to <16 x float>*
843 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
844 %tmp7 = add i64 %tmp, 16
845 %tmp8 = icmp eq i64 %tmp7, 1024
846 br i1 %tmp8, label %bb9, label %bb1
852 ; Function Attrs: nounwind readnone speculatable willreturn
853 declare <16 x float> @llvm.fabs.v16f32(<16 x float>) #0
; 256-bit fabs: sign-clearing AND with FP-domain forms (vbroadcastss/vandps).
855 define void @bcast_unfold_fabs_v8f32(float* %arg) {
856 ; CHECK-LABEL: bcast_unfold_fabs_v8f32:
857 ; CHECK: # %bb.0: # %bb
858 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
859 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
860 ; CHECK-NEXT: .p2align 4, 0x90
861 ; CHECK-NEXT: .LBB25_1: # %bb1
862 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
863 ; CHECK-NEXT: vandps 4096(%rdi,%rax), %ymm0, %ymm1
864 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
865 ; CHECK-NEXT: addq $32, %rax
866 ; CHECK-NEXT: jne .LBB25_1
867 ; CHECK-NEXT: # %bb.2: # %bb9
868 ; CHECK-NEXT: vzeroupper
; Loop body: load <8 x float>, fabs via intrinsic, store in place.
873 bb1: ; preds = %bb1, %bb
874 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
875 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
876 %tmp3 = bitcast float* %tmp2 to <8 x float>*
877 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
878 %tmp5 = call <8 x float> @llvm.fabs.v8f32(<8 x float> %tmp4)
879 %tmp6 = bitcast float* %tmp2 to <8 x float>*
880 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
881 %tmp7 = add i64 %tmp, 8
882 %tmp8 = icmp eq i64 %tmp7, 1024
883 br i1 %tmp8, label %bb9, label %bb1
889 ; Function Attrs: nounwind readnone speculatable willreturn
890 declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #0
; 128-bit fabs: sign-clearing AND in xmm; no vzeroupper needed.
892 define void @bcast_unfold_fabs_v4f32(float* %arg) {
893 ; CHECK-LABEL: bcast_unfold_fabs_v4f32:
894 ; CHECK: # %bb.0: # %bb
895 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
896 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
897 ; CHECK-NEXT: .p2align 4, 0x90
898 ; CHECK-NEXT: .LBB26_1: # %bb1
899 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
900 ; CHECK-NEXT: vandps 4096(%rdi,%rax), %xmm0, %xmm1
901 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
902 ; CHECK-NEXT: addq $16, %rax
903 ; CHECK-NEXT: jne .LBB26_1
904 ; CHECK-NEXT: # %bb.2: # %bb9
; Loop body: load <4 x float>, fabs via intrinsic, store in place.
909 bb1: ; preds = %bb1, %bb
910 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
911 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
912 %tmp3 = bitcast float* %tmp2 to <4 x float>*
913 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
914 %tmp5 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %tmp4)
915 %tmp6 = bitcast float* %tmp2 to <4 x float>*
916 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
917 %tmp7 = add i64 %tmp, 4
918 %tmp8 = icmp eq i64 %tmp7, 1024
919 br i1 %tmp8, label %bb9, label %bb1
925 ; Function Attrs: nounwind readnone speculatable willreturn
926 declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #0
; double fabs at 512-bit: sign-clearing AND via vpbroadcastq mask + vpandq.
928 define void @bcast_unfold_fabs_v8f64(double* %arg) {
929 ; CHECK-LABEL: bcast_unfold_fabs_v8f64:
930 ; CHECK: # %bb.0: # %bb
931 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
932 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
933 ; CHECK-NEXT: .p2align 4, 0x90
934 ; CHECK-NEXT: .LBB27_1: # %bb1
935 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
936 ; CHECK-NEXT: vpandq 8192(%rdi,%rax), %zmm0, %zmm1
937 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
938 ; CHECK-NEXT: addq $64, %rax
939 ; CHECK-NEXT: jne .LBB27_1
940 ; CHECK-NEXT: # %bb.2: # %bb9
941 ; CHECK-NEXT: vzeroupper
; Loop body: load <8 x double>, fabs via intrinsic, store in place.
946 bb1: ; preds = %bb1, %bb
947 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
948 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
949 %tmp3 = bitcast double* %tmp2 to <8 x double>*
950 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
951 %tmp5 = call <8 x double> @llvm.fabs.v8f64(<8 x double> %tmp4)
952 %tmp6 = bitcast double* %tmp2 to <8 x double>*
953 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
954 %tmp7 = add i64 %tmp, 8
955 %tmp8 = icmp eq i64 %tmp7, 1024
956 br i1 %tmp8, label %bb9, label %bb1
962 ; Function Attrs: nounwind readnone speculatable willreturn
963 declare <8 x double> @llvm.fabs.v8f64(<8 x double>) #0
965 define void @bcast_unfold_fabs_v4f64(double* %arg) {
966 ; CHECK-LABEL: bcast_unfold_fabs_v4f64:
967 ; CHECK: # %bb.0: # %bb
968 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
969 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
970 ; CHECK-NEXT: .p2align 4, 0x90
971 ; CHECK-NEXT: .LBB28_1: # %bb1
972 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
973 ; CHECK-NEXT: vandps 8192(%rdi,%rax), %ymm0, %ymm1
974 ; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
975 ; CHECK-NEXT: addq $32, %rax
976 ; CHECK-NEXT: jne .LBB28_1
977 ; CHECK-NEXT: # %bb.2: # %bb9
978 ; CHECK-NEXT: vzeroupper
983 bb1: ; preds = %bb1, %bb
984 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
985 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
986 %tmp3 = bitcast double* %tmp2 to <4 x double>*
987 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
988 %tmp5 = call <4 x double> @llvm.fabs.v4f64(<4 x double> %tmp4)
989 %tmp6 = bitcast double* %tmp2 to <4 x double>*
990 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
991 %tmp7 = add i64 %tmp, 4
992 %tmp8 = icmp eq i64 %tmp7, 1024
993 br i1 %tmp8, label %bb9, label %bb1
999 ; Function Attrs: nounwind readnone speculatable willreturn
1000 declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #0
1002 define void @bcast_unfold_fabs_v2f64(double* %arg) {
1003 ; CHECK-LABEL: bcast_unfold_fabs_v2f64:
1004 ; CHECK: # %bb.0: # %bb
1005 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1006 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN]
1007 ; CHECK-NEXT: .p2align 4, 0x90
1008 ; CHECK-NEXT: .LBB29_1: # %bb1
1009 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1010 ; CHECK-NEXT: vandps 8192(%rdi,%rax), %xmm0, %xmm1
1011 ; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
1012 ; CHECK-NEXT: addq $16, %rax
1013 ; CHECK-NEXT: jne .LBB29_1
1014 ; CHECK-NEXT: # %bb.2: # %bb9
1019 bb1: ; preds = %bb1, %bb
1020 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1021 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1022 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1023 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1024 %tmp5 = call <2 x double> @llvm.fabs.v2f64(<2 x double> %tmp4)
1025 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1026 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1027 %tmp7 = add i64 %tmp, 2
1028 %tmp8 = icmp eq i64 %tmp7, 1024
1029 br i1 %tmp8, label %bb9, label %bb1
1035 ; Function Attrs: nounwind readnone speculatable willreturn
1036 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #0
1038 define void @bcast_unfold_fadd_v16f32(float* nocapture %arg) {
1039 ; CHECK-LABEL: bcast_unfold_fadd_v16f32:
1040 ; CHECK: # %bb.0: # %bb
1041 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1042 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1043 ; CHECK-NEXT: .p2align 4, 0x90
1044 ; CHECK-NEXT: .LBB30_1: # %bb1
1045 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1046 ; CHECK-NEXT: vaddps 4096(%rdi,%rax), %zmm0, %zmm1
1047 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1048 ; CHECK-NEXT: addq $64, %rax
1049 ; CHECK-NEXT: jne .LBB30_1
1050 ; CHECK-NEXT: # %bb.2: # %bb9
1051 ; CHECK-NEXT: vzeroupper
1056 bb1: ; preds = %bb1, %bb
1057 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1058 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1059 %tmp3 = bitcast float* %tmp2 to <16 x float>*
1060 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
1061 %tmp5 = fadd <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1062 %tmp6 = bitcast float* %tmp2 to <16 x float>*
1063 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
1064 %tmp7 = add i64 %tmp, 16
1065 %tmp8 = icmp eq i64 %tmp7, 1024
1066 br i1 %tmp8, label %bb9, label %bb1
1072 define void @bcast_unfold_fadd_v8f32(float* nocapture %arg) {
1073 ; CHECK-LABEL: bcast_unfold_fadd_v8f32:
1074 ; CHECK: # %bb.0: # %bb
1075 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1076 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1077 ; CHECK-NEXT: .p2align 4, 0x90
1078 ; CHECK-NEXT: .LBB31_1: # %bb1
1079 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1080 ; CHECK-NEXT: vaddps 4096(%rdi,%rax), %ymm0, %ymm1
1081 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1082 ; CHECK-NEXT: addq $32, %rax
1083 ; CHECK-NEXT: jne .LBB31_1
1084 ; CHECK-NEXT: # %bb.2: # %bb9
1085 ; CHECK-NEXT: vzeroupper
1090 bb1: ; preds = %bb1, %bb
1091 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1092 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1093 %tmp3 = bitcast float* %tmp2 to <8 x float>*
1094 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
1095 %tmp5 = fadd <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1096 %tmp6 = bitcast float* %tmp2 to <8 x float>*
1097 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
1098 %tmp7 = add i64 %tmp, 8
1099 %tmp8 = icmp eq i64 %tmp7, 1024
1100 br i1 %tmp8, label %bb9, label %bb1
1106 define void @bcast_unfold_fadd_v4f32(float* nocapture %arg) {
1107 ; CHECK-LABEL: bcast_unfold_fadd_v4f32:
1108 ; CHECK: # %bb.0: # %bb
1109 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1110 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1111 ; CHECK-NEXT: .p2align 4, 0x90
1112 ; CHECK-NEXT: .LBB32_1: # %bb1
1113 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1114 ; CHECK-NEXT: vaddps 4096(%rdi,%rax), %xmm0, %xmm1
1115 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1116 ; CHECK-NEXT: addq $16, %rax
1117 ; CHECK-NEXT: jne .LBB32_1
1118 ; CHECK-NEXT: # %bb.2: # %bb9
1123 bb1: ; preds = %bb1, %bb
1124 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1125 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1126 %tmp3 = bitcast float* %tmp2 to <4 x float>*
1127 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1128 %tmp5 = fadd <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1129 %tmp6 = bitcast float* %tmp2 to <4 x float>*
1130 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
1131 %tmp7 = add i64 %tmp, 4
1132 %tmp8 = icmp eq i64 %tmp7, 1024
1133 br i1 %tmp8, label %bb9, label %bb1
1139 define void @bcast_unfold_fadd_v8f64(double* nocapture %arg) {
1140 ; CHECK-LABEL: bcast_unfold_fadd_v8f64:
1141 ; CHECK: # %bb.0: # %bb
1142 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1143 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1144 ; CHECK-NEXT: .p2align 4, 0x90
1145 ; CHECK-NEXT: .LBB33_1: # %bb1
1146 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1147 ; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %zmm0, %zmm1
1148 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
1149 ; CHECK-NEXT: addq $64, %rax
1150 ; CHECK-NEXT: jne .LBB33_1
1151 ; CHECK-NEXT: # %bb.2: # %bb9
1152 ; CHECK-NEXT: vzeroupper
1157 bb1: ; preds = %bb1, %bb
1158 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1159 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1160 %tmp3 = bitcast double* %tmp2 to <8 x double>*
1161 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
1162 %tmp5 = fadd <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1163 %tmp6 = bitcast double* %tmp2 to <8 x double>*
1164 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
1165 %tmp7 = add i64 %tmp, 8
1166 %tmp8 = icmp eq i64 %tmp7, 1024
1167 br i1 %tmp8, label %bb9, label %bb1
1173 define void @bcast_unfold_fadd_v4f64(double* nocapture %arg) {
1174 ; CHECK-LABEL: bcast_unfold_fadd_v4f64:
1175 ; CHECK: # %bb.0: # %bb
1176 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1177 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1178 ; CHECK-NEXT: .p2align 4, 0x90
1179 ; CHECK-NEXT: .LBB34_1: # %bb1
1180 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1181 ; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %ymm0, %ymm1
1182 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1183 ; CHECK-NEXT: addq $32, %rax
1184 ; CHECK-NEXT: jne .LBB34_1
1185 ; CHECK-NEXT: # %bb.2: # %bb9
1186 ; CHECK-NEXT: vzeroupper
1191 bb1: ; preds = %bb1, %bb
1192 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1193 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1194 %tmp3 = bitcast double* %tmp2 to <4 x double>*
1195 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
1196 %tmp5 = fadd <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1197 %tmp6 = bitcast double* %tmp2 to <4 x double>*
1198 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
1199 %tmp7 = add i64 %tmp, 4
1200 %tmp8 = icmp eq i64 %tmp7, 1024
1201 br i1 %tmp8, label %bb9, label %bb1
1207 define void @bcast_unfold_fadd_v2f64(double* nocapture %arg) {
1208 ; CHECK-LABEL: bcast_unfold_fadd_v2f64:
1209 ; CHECK: # %bb.0: # %bb
1210 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1211 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
1212 ; CHECK-NEXT: .p2align 4, 0x90
1213 ; CHECK-NEXT: .LBB35_1: # %bb1
1214 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1215 ; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %xmm0, %xmm1
1216 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1217 ; CHECK-NEXT: addq $16, %rax
1218 ; CHECK-NEXT: jne .LBB35_1
1219 ; CHECK-NEXT: # %bb.2: # %bb9
1224 bb1: ; preds = %bb1, %bb
1225 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1226 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1227 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1228 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1229 %tmp5 = fadd <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
1230 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1231 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1232 %tmp7 = add i64 %tmp, 2
1233 %tmp8 = icmp eq i64 %tmp7, 1024
1234 br i1 %tmp8, label %bb9, label %bb1
1240 define void @bcast_unfold_fmul_v16f32(float* nocapture %arg) {
1241 ; CHECK-LABEL: bcast_unfold_fmul_v16f32:
1242 ; CHECK: # %bb.0: # %bb
1243 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1244 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1245 ; CHECK-NEXT: .p2align 4, 0x90
1246 ; CHECK-NEXT: .LBB36_1: # %bb1
1247 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1248 ; CHECK-NEXT: vmulps 4096(%rdi,%rax), %zmm0, %zmm1
1249 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1250 ; CHECK-NEXT: addq $64, %rax
1251 ; CHECK-NEXT: jne .LBB36_1
1252 ; CHECK-NEXT: # %bb.2: # %bb9
1253 ; CHECK-NEXT: vzeroupper
1258 bb1: ; preds = %bb1, %bb
1259 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1260 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1261 %tmp3 = bitcast float* %tmp2 to <16 x float>*
1262 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
1263 %tmp5 = fmul <16 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
1264 %tmp6 = bitcast float* %tmp2 to <16 x float>*
1265 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
1266 %tmp7 = add i64 %tmp, 16
1267 %tmp8 = icmp eq i64 %tmp7, 1024
1268 br i1 %tmp8, label %bb9, label %bb1
1274 define void @bcast_unfold_fmul_v8f32(float* nocapture %arg) {
1275 ; CHECK-LABEL: bcast_unfold_fmul_v8f32:
1276 ; CHECK: # %bb.0: # %bb
1277 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1278 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1279 ; CHECK-NEXT: .p2align 4, 0x90
1280 ; CHECK-NEXT: .LBB37_1: # %bb1
1281 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1282 ; CHECK-NEXT: vmulps 4096(%rdi,%rax), %ymm0, %ymm1
1283 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1284 ; CHECK-NEXT: addq $32, %rax
1285 ; CHECK-NEXT: jne .LBB37_1
1286 ; CHECK-NEXT: # %bb.2: # %bb9
1287 ; CHECK-NEXT: vzeroupper
1292 bb1: ; preds = %bb1, %bb
1293 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1294 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1295 %tmp3 = bitcast float* %tmp2 to <8 x float>*
1296 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
1297 %tmp5 = fmul <8 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
1298 %tmp6 = bitcast float* %tmp2 to <8 x float>*
1299 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
1300 %tmp7 = add i64 %tmp, 8
1301 %tmp8 = icmp eq i64 %tmp7, 1024
1302 br i1 %tmp8, label %bb9, label %bb1
1308 define void @bcast_unfold_fmul_v4f32(float* nocapture %arg) {
1309 ; CHECK-LABEL: bcast_unfold_fmul_v4f32:
1310 ; CHECK: # %bb.0: # %bb
1311 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1312 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1313 ; CHECK-NEXT: .p2align 4, 0x90
1314 ; CHECK-NEXT: .LBB38_1: # %bb1
1315 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1316 ; CHECK-NEXT: vmulps 4096(%rdi,%rax), %xmm0, %xmm1
1317 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1318 ; CHECK-NEXT: addq $16, %rax
1319 ; CHECK-NEXT: jne .LBB38_1
1320 ; CHECK-NEXT: # %bb.2: # %bb9
1325 bb1: ; preds = %bb1, %bb
1326 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1327 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1328 %tmp3 = bitcast float* %tmp2 to <4 x float>*
1329 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1330 %tmp5 = fmul <4 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
1331 %tmp6 = bitcast float* %tmp2 to <4 x float>*
1332 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
1333 %tmp7 = add i64 %tmp, 4
1334 %tmp8 = icmp eq i64 %tmp7, 1024
1335 br i1 %tmp8, label %bb9, label %bb1
1341 define void @bcast_unfold_fmul_v8f64(double* nocapture %arg) {
1342 ; CHECK-LABEL: bcast_unfold_fmul_v8f64:
1343 ; CHECK: # %bb.0: # %bb
1344 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1345 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1346 ; CHECK-NEXT: .p2align 4, 0x90
1347 ; CHECK-NEXT: .LBB39_1: # %bb1
1348 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1349 ; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %zmm0, %zmm1
1350 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
1351 ; CHECK-NEXT: addq $64, %rax
1352 ; CHECK-NEXT: jne .LBB39_1
1353 ; CHECK-NEXT: # %bb.2: # %bb9
1354 ; CHECK-NEXT: vzeroupper
1359 bb1: ; preds = %bb1, %bb
1360 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1361 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1362 %tmp3 = bitcast double* %tmp2 to <8 x double>*
1363 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
1364 %tmp5 = fmul <8 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
1365 %tmp6 = bitcast double* %tmp2 to <8 x double>*
1366 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
1367 %tmp7 = add i64 %tmp, 8
1368 %tmp8 = icmp eq i64 %tmp7, 1024
1369 br i1 %tmp8, label %bb9, label %bb1
1375 define void @bcast_unfold_fmul_v4f64(double* nocapture %arg) {
1376 ; CHECK-LABEL: bcast_unfold_fmul_v4f64:
1377 ; CHECK: # %bb.0: # %bb
1378 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1379 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1380 ; CHECK-NEXT: .p2align 4, 0x90
1381 ; CHECK-NEXT: .LBB40_1: # %bb1
1382 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1383 ; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %ymm0, %ymm1
1384 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1385 ; CHECK-NEXT: addq $32, %rax
1386 ; CHECK-NEXT: jne .LBB40_1
1387 ; CHECK-NEXT: # %bb.2: # %bb9
1388 ; CHECK-NEXT: vzeroupper
1393 bb1: ; preds = %bb1, %bb
1394 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1395 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1396 %tmp3 = bitcast double* %tmp2 to <4 x double>*
1397 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
1398 %tmp5 = fmul <4 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
1399 %tmp6 = bitcast double* %tmp2 to <4 x double>*
1400 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
1401 %tmp7 = add i64 %tmp, 4
1402 %tmp8 = icmp eq i64 %tmp7, 1024
1403 br i1 %tmp8, label %bb9, label %bb1
1409 define void @bcast_unfold_fmul_v2f64(double* nocapture %arg) {
1410 ; CHECK-LABEL: bcast_unfold_fmul_v2f64:
1411 ; CHECK: # %bb.0: # %bb
1412 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1413 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
1414 ; CHECK-NEXT: .p2align 4, 0x90
1415 ; CHECK-NEXT: .LBB41_1: # %bb1
1416 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1417 ; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %xmm0, %xmm1
1418 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1419 ; CHECK-NEXT: addq $16, %rax
1420 ; CHECK-NEXT: jne .LBB41_1
1421 ; CHECK-NEXT: # %bb.2: # %bb9
1426 bb1: ; preds = %bb1, %bb
1427 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1428 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1429 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1430 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1431 %tmp5 = fmul <2 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00>
1432 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1433 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1434 %tmp7 = add i64 %tmp, 2
1435 %tmp8 = icmp eq i64 %tmp7, 1024
1436 br i1 %tmp8, label %bb9, label %bb1
1442 define void @bcast_unfold_fdiv_v16f32(float* nocapture %arg) {
1443 ; CHECK-LABEL: bcast_unfold_fdiv_v16f32:
1444 ; CHECK: # %bb.0: # %bb
1445 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1446 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1447 ; CHECK-NEXT: .p2align 4, 0x90
1448 ; CHECK-NEXT: .LBB42_1: # %bb1
1449 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1450 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
1451 ; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm1
1452 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1453 ; CHECK-NEXT: addq $64, %rax
1454 ; CHECK-NEXT: jne .LBB42_1
1455 ; CHECK-NEXT: # %bb.2: # %bb9
1456 ; CHECK-NEXT: vzeroupper
1461 bb1: ; preds = %bb1, %bb
1462 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1463 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1464 %tmp3 = bitcast float* %tmp2 to <16 x float>*
1465 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
1466 %tmp5 = fdiv <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1467 %tmp6 = bitcast float* %tmp2 to <16 x float>*
1468 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
1469 %tmp7 = add i64 %tmp, 16
1470 %tmp8 = icmp eq i64 %tmp7, 1024
1471 br i1 %tmp8, label %bb9, label %bb1
1477 define void @bcast_unfold_fdiv_v8f32(float* nocapture %arg) {
1478 ; CHECK-LABEL: bcast_unfold_fdiv_v8f32:
1479 ; CHECK: # %bb.0: # %bb
1480 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1481 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1482 ; CHECK-NEXT: .p2align 4, 0x90
1483 ; CHECK-NEXT: .LBB43_1: # %bb1
1484 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1485 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
1486 ; CHECK-NEXT: vdivps %ymm0, %ymm1, %ymm1
1487 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1488 ; CHECK-NEXT: addq $32, %rax
1489 ; CHECK-NEXT: jne .LBB43_1
1490 ; CHECK-NEXT: # %bb.2: # %bb9
1491 ; CHECK-NEXT: vzeroupper
1496 bb1: ; preds = %bb1, %bb
1497 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1498 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1499 %tmp3 = bitcast float* %tmp2 to <8 x float>*
1500 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
1501 %tmp5 = fdiv <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1502 %tmp6 = bitcast float* %tmp2 to <8 x float>*
1503 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
1504 %tmp7 = add i64 %tmp, 8
1505 %tmp8 = icmp eq i64 %tmp7, 1024
1506 br i1 %tmp8, label %bb9, label %bb1
1512 define void @bcast_unfold_fdiv_v4f32(float* nocapture %arg) {
1513 ; CHECK-LABEL: bcast_unfold_fdiv_v4f32:
1514 ; CHECK: # %bb.0: # %bb
1515 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1516 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1517 ; CHECK-NEXT: .p2align 4, 0x90
1518 ; CHECK-NEXT: .LBB44_1: # %bb1
1519 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1520 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
1521 ; CHECK-NEXT: vdivps %xmm0, %xmm1, %xmm1
1522 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1523 ; CHECK-NEXT: addq $16, %rax
1524 ; CHECK-NEXT: jne .LBB44_1
1525 ; CHECK-NEXT: # %bb.2: # %bb9
1530 bb1: ; preds = %bb1, %bb
1531 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1532 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1533 %tmp3 = bitcast float* %tmp2 to <4 x float>*
1534 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1535 %tmp5 = fdiv <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1536 %tmp6 = bitcast float* %tmp2 to <4 x float>*
1537 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
1538 %tmp7 = add i64 %tmp, 4
1539 %tmp8 = icmp eq i64 %tmp7, 1024
1540 br i1 %tmp8, label %bb9, label %bb1
1546 define void @bcast_unfold_fdiv_v8f64(double* nocapture %arg) {
1547 ; CHECK-LABEL: bcast_unfold_fdiv_v8f64:
1548 ; CHECK: # %bb.0: # %bb
1549 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1550 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1551 ; CHECK-NEXT: .p2align 4, 0x90
1552 ; CHECK-NEXT: .LBB45_1: # %bb1
1553 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1554 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
1555 ; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm1
1556 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
1557 ; CHECK-NEXT: addq $64, %rax
1558 ; CHECK-NEXT: jne .LBB45_1
1559 ; CHECK-NEXT: # %bb.2: # %bb9
1560 ; CHECK-NEXT: vzeroupper
1565 bb1: ; preds = %bb1, %bb
1566 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1567 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1568 %tmp3 = bitcast double* %tmp2 to <8 x double>*
1569 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
1570 %tmp5 = fdiv <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1571 %tmp6 = bitcast double* %tmp2 to <8 x double>*
1572 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
1573 %tmp7 = add i64 %tmp, 8
1574 %tmp8 = icmp eq i64 %tmp7, 1024
1575 br i1 %tmp8, label %bb9, label %bb1
1581 define void @bcast_unfold_fdiv_v4f64(double* nocapture %arg) {
1582 ; CHECK-LABEL: bcast_unfold_fdiv_v4f64:
1583 ; CHECK: # %bb.0: # %bb
1584 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1585 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1586 ; CHECK-NEXT: .p2align 4, 0x90
1587 ; CHECK-NEXT: .LBB46_1: # %bb1
1588 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1589 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
1590 ; CHECK-NEXT: vdivpd %ymm0, %ymm1, %ymm1
1591 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1592 ; CHECK-NEXT: addq $32, %rax
1593 ; CHECK-NEXT: jne .LBB46_1
1594 ; CHECK-NEXT: # %bb.2: # %bb9
1595 ; CHECK-NEXT: vzeroupper
1600 bb1: ; preds = %bb1, %bb
1601 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1602 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1603 %tmp3 = bitcast double* %tmp2 to <4 x double>*
1604 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
1605 %tmp5 = fdiv <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1606 %tmp6 = bitcast double* %tmp2 to <4 x double>*
1607 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
1608 %tmp7 = add i64 %tmp, 4
1609 %tmp8 = icmp eq i64 %tmp7, 1024
1610 br i1 %tmp8, label %bb9, label %bb1
1616 define void @bcast_unfold_fdiv_v2f64(double* nocapture %arg) {
1617 ; CHECK-LABEL: bcast_unfold_fdiv_v2f64:
1618 ; CHECK: # %bb.0: # %bb
1619 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1620 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
1621 ; CHECK-NEXT: .p2align 4, 0x90
1622 ; CHECK-NEXT: .LBB47_1: # %bb1
1623 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1624 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
1625 ; CHECK-NEXT: vdivpd %xmm0, %xmm1, %xmm1
1626 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1627 ; CHECK-NEXT: addq $16, %rax
1628 ; CHECK-NEXT: jne .LBB47_1
1629 ; CHECK-NEXT: # %bb.2: # %bb9
1634 bb1: ; preds = %bb1, %bb
1635 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1636 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1637 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1638 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1639 %tmp5 = fdiv <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
1640 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1641 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1642 %tmp7 = add i64 %tmp, 2
1643 %tmp8 = icmp eq i64 %tmp7, 1024
1644 br i1 %tmp8, label %bb9, label %bb1
1650 define void @bcast_unfold_fma213_v4f32(float* %arg) {
1651 ; CHECK-LABEL: bcast_unfold_fma213_v4f32:
1652 ; CHECK: # %bb.0: # %bb
1653 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1654 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1655 ; CHECK-NEXT: .p2align 4, 0x90
1656 ; CHECK-NEXT: .LBB48_1: # %bb2
1657 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1658 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
1659 ; CHECK-NEXT: vfmadd213ps {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
1660 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1661 ; CHECK-NEXT: addq $16, %rax
1662 ; CHECK-NEXT: jne .LBB48_1
1663 ; CHECK-NEXT: # %bb.2: # %bb11
1668 bb2: ; preds = %bb2, %bb
1669 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
1670 %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
1671 %tmp4 = bitcast float* %tmp3 to <4 x float>*
1672 %tmp5 = load <4 x float>, <4 x float>* %tmp4, align 4
1673 %tmp6 = fmul contract <4 x float> %tmp5, %tmp5
1674 %tmp7 = fadd contract <4 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1675 %tmp8 = bitcast float* %tmp3 to <4 x float>*
1676 store <4 x float> %tmp7, <4 x float>* %tmp8, align 4
1677 %tmp9 = add i64 %tmp, 4
1678 %tmp10 = icmp eq i64 %tmp9, 1024
1679 br i1 %tmp10, label %bb11, label %bb2
1681 bb11: ; preds = %bb2
1685 define void @bcast_unfold_fma231_v4f32(float* %arg) {
1686 ; CHECK-LABEL: bcast_unfold_fma231_v4f32:
1687 ; CHECK: # %bb.0: # %bb
1688 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1689 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1690 ; CHECK-NEXT: .p2align 4, 0x90
1691 ; CHECK-NEXT: .LBB49_1: # %bb1
1692 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1693 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
1694 ; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
1695 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1696 ; CHECK-NEXT: addq $16, %rax
1697 ; CHECK-NEXT: jne .LBB49_1
1698 ; CHECK-NEXT: # %bb.2: # %bb10
1703 bb1: ; preds = %bb1, %bb
1704 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
1705 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1706 %tmp3 = bitcast float* %tmp2 to <4 x float>*
1707 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1708 %tmp5 = fmul contract <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1709 %tmp6 = fadd contract <4 x float> %tmp4, %tmp5
1710 %tmp7 = bitcast float* %tmp2 to <4 x float>*
1711 store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
1712 %tmp8 = add i64 %tmp, 4
1713 %tmp9 = icmp eq i64 %tmp8, 1024
1714 br i1 %tmp9, label %bb10, label %bb1
1716 bb10: ; preds = %bb1
1720 define void @bcast_unfold_fma213_v8f32(float* %arg) {
1721 ; CHECK-LABEL: bcast_unfold_fma213_v8f32:
1722 ; CHECK: # %bb.0: # %bb
1723 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1724 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1725 ; CHECK-NEXT: .p2align 4, 0x90
1726 ; CHECK-NEXT: .LBB50_1: # %bb2
1727 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1728 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
1729 ; CHECK-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
1730 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1731 ; CHECK-NEXT: addq $32, %rax
1732 ; CHECK-NEXT: jne .LBB50_1
1733 ; CHECK-NEXT: # %bb.2: # %bb11
1734 ; CHECK-NEXT: vzeroupper
1739 bb2: ; preds = %bb2, %bb
1740 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
1741 %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
1742 %tmp4 = bitcast float* %tmp3 to <8 x float>*
1743 %tmp5 = load <8 x float>, <8 x float>* %tmp4, align 4
1744 %tmp6 = fmul contract <8 x float> %tmp5, %tmp5
1745 %tmp7 = fadd contract <8 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1746 %tmp8 = bitcast float* %tmp3 to <8 x float>*
1747 store <8 x float> %tmp7, <8 x float>* %tmp8, align 4
1748 %tmp9 = add i64 %tmp, 8
1749 %tmp10 = icmp eq i64 %tmp9, 1024
1750 br i1 %tmp10, label %bb11, label %bb2
1752 bb11: ; preds = %bb2
1756 define void @bcast_unfold_fma231_v8f32(float* %arg) {
1757 ; CHECK-LABEL: bcast_unfold_fma231_v8f32:
1758 ; CHECK: # %bb.0: # %bb
1759 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1760 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1761 ; CHECK-NEXT: .p2align 4, 0x90
1762 ; CHECK-NEXT: .LBB51_1: # %bb1
1763 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1764 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
1765 ; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
1766 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1767 ; CHECK-NEXT: addq $32, %rax
1768 ; CHECK-NEXT: jne .LBB51_1
1769 ; CHECK-NEXT: # %bb.2: # %bb10
1770 ; CHECK-NEXT: vzeroupper
1775 bb1: ; preds = %bb1, %bb
1776 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
1777 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1778 %tmp3 = bitcast float* %tmp2 to <8 x float>*
1779 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
1780 %tmp5 = fmul contract <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1781 %tmp6 = fadd contract <8 x float> %tmp4, %tmp5
1782 %tmp7 = bitcast float* %tmp2 to <8 x float>*
1783 store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
1784 %tmp8 = add i64 %tmp, 8
1785 %tmp9 = icmp eq i64 %tmp8, 1024
1786 br i1 %tmp9, label %bb10, label %bb1
1788 bb10: ; preds = %bb1
; v16f32 "213" FMA shape: (%tmp5 * %tmp5) + splat 2.0. Verifies the splat addend
; is unfolded into one vbroadcastss into zmm0 outside the loop instead of a
; per-iteration broadcast-load folded into vfmadd213ps.
1792 define void @bcast_unfold_fma213_v16f32(float* %arg) {
1793 ; CHECK-LABEL: bcast_unfold_fma213_v16f32:
1794 ; CHECK: # %bb.0: # %bb
1795 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1796 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1797 ; CHECK-NEXT: .p2align 4, 0x90
1798 ; CHECK-NEXT: .LBB52_1: # %bb2
1799 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1800 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
1801 ; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
1802 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1803 ; CHECK-NEXT: addq $64, %rax
1804 ; CHECK-NEXT: jne .LBB52_1
1805 ; CHECK-NEXT: # %bb.2: # %bb11
1806 ; CHECK-NEXT: vzeroupper
1811 bb2: ; preds = %bb2, %bb
1812 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
1813 %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
1814 %tmp4 = bitcast float* %tmp3 to <16 x float>*
1815 %tmp5 = load <16 x float>, <16 x float>* %tmp4, align 4
; x*x followed by + splat constant: contractible into a 213-form FMA.
1816 %tmp6 = fmul contract <16 x float> %tmp5, %tmp5
1817 %tmp7 = fadd contract <16 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1818 %tmp8 = bitcast float* %tmp3 to <16 x float>*
1819 store <16 x float> %tmp7, <16 x float>* %tmp8, align 4
1820 %tmp9 = add i64 %tmp, 16
1821 %tmp10 = icmp eq i64 %tmp9, 1024
1822 br i1 %tmp10, label %bb11, label %bb2
1824 bb11: ; preds = %bb2
; v16f32 "231" FMA shape: %tmp4 + (%tmp4 * splat 2.0). Verifies the splat
; multiplicand is unfolded into a loop-invariant vbroadcastss into zmm0 and
; reused by vfmadd231ps.
1828 define void @bcast_unfold_fma231_v16f32(float* %arg) {
1829 ; CHECK-LABEL: bcast_unfold_fma231_v16f32:
1830 ; CHECK: # %bb.0: # %bb
1831 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1832 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1833 ; CHECK-NEXT: .p2align 4, 0x90
1834 ; CHECK-NEXT: .LBB53_1: # %bb1
1835 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1836 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
1837 ; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
1838 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1839 ; CHECK-NEXT: addq $64, %rax
1840 ; CHECK-NEXT: jne .LBB53_1
1841 ; CHECK-NEXT: # %bb.2: # %bb10
1842 ; CHECK-NEXT: vzeroupper
1847 bb1: ; preds = %bb1, %bb
1848 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
1849 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1850 %tmp3 = bitcast float* %tmp2 to <16 x float>*
1851 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
1852 %tmp5 = fmul contract <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1853 %tmp6 = fadd contract <16 x float> %tmp4, %tmp5
1854 %tmp7 = bitcast float* %tmp2 to <16 x float>*
1855 store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
1856 %tmp8 = add i64 %tmp, 16
1857 %tmp9 = icmp eq i64 %tmp8, 1024
1858 br i1 %tmp9, label %bb10, label %bb1
1860 bb10: ; preds = %bb1
; v2f64 "213" FMA shape: (%tmp5 * %tmp5) + splat 2.0. In the 128-bit f64 case
; the CHECK lines show the constant materialized with vmovapd from the constant
; pool (not a broadcast), still hoisted out of the loop.
1864 define void @bcast_unfold_fma213_v2f64(double* %arg) {
1865 ; CHECK-LABEL: bcast_unfold_fma213_v2f64:
1866 ; CHECK: # %bb.0: # %bb
1867 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1868 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
1869 ; CHECK-NEXT: .p2align 4, 0x90
1870 ; CHECK-NEXT: .LBB54_1: # %bb2
1871 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1872 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
1873 ; CHECK-NEXT: vfmadd213pd {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
1874 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1875 ; CHECK-NEXT: addq $16, %rax
1876 ; CHECK-NEXT: jne .LBB54_1
1877 ; CHECK-NEXT: # %bb.2: # %bb11
1882 bb2: ; preds = %bb2, %bb
1883 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
1884 %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
1885 %tmp4 = bitcast double* %tmp3 to <2 x double>*
; NOTE(review): this load is align 4 while the store below and the sibling f64
; tests use align 8 — presumably unintentional; harmless here since codegen
; uses unaligned vmovupd either way.
1886 %tmp5 = load <2 x double>, <2 x double>* %tmp4, align 4
1887 %tmp6 = fmul contract <2 x double> %tmp5, %tmp5
1888 %tmp7 = fadd contract <2 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00>
1889 %tmp8 = bitcast double* %tmp3 to <2 x double>*
1890 store <2 x double> %tmp7, <2 x double>* %tmp8, align 8
1891 %tmp9 = add i64 %tmp, 2
1892 %tmp10 = icmp eq i64 %tmp9, 1024
1893 br i1 %tmp10, label %bb11, label %bb2
1895 bb11: ; preds = %bb2
; v2f64 "231" FMA shape: %tmp4 + (%tmp4 * splat 2.0). 128-bit f64 case: the
; constant comes from a loop-invariant vmovapd constant-pool load into xmm0,
; reused by vfmadd231pd.
1899 define void @bcast_unfold_fma231_v2f64(double* %arg) {
1900 ; CHECK-LABEL: bcast_unfold_fma231_v2f64:
1901 ; CHECK: # %bb.0: # %bb
1902 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1903 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
1904 ; CHECK-NEXT: .p2align 4, 0x90
1905 ; CHECK-NEXT: .LBB55_1: # %bb1
1906 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1907 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
1908 ; CHECK-NEXT: vfmadd231pd {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
1909 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1910 ; CHECK-NEXT: addq $16, %rax
1911 ; CHECK-NEXT: jne .LBB55_1
1912 ; CHECK-NEXT: # %bb.2: # %bb10
1917 bb1: ; preds = %bb1, %bb
1918 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
1919 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1920 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1921 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1922 %tmp5 = fmul contract <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
1923 %tmp6 = fadd contract <2 x double> %tmp4, %tmp5
1924 %tmp7 = bitcast double* %tmp2 to <2 x double>*
1925 store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
1926 %tmp8 = add i64 %tmp, 2
1927 %tmp9 = icmp eq i64 %tmp8, 1024
1928 br i1 %tmp9, label %bb10, label %bb1
1930 bb10: ; preds = %bb1
; v4f64 "213" FMA shape: (%tmp5 * %tmp5) + splat 2.0. Verifies the splat addend
; is unfolded into a loop-invariant vbroadcastsd into ymm0 used by vfmadd213pd.
1934 define void @bcast_unfold_fma213_v4f64(double* %arg) {
1935 ; CHECK-LABEL: bcast_unfold_fma213_v4f64:
1936 ; CHECK: # %bb.0: # %bb
1937 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1938 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1939 ; CHECK-NEXT: .p2align 4, 0x90
1940 ; CHECK-NEXT: .LBB56_1: # %bb2
1941 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1942 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
1943 ; CHECK-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
1944 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1945 ; CHECK-NEXT: addq $32, %rax
1946 ; CHECK-NEXT: jne .LBB56_1
1947 ; CHECK-NEXT: # %bb.2: # %bb11
1948 ; CHECK-NEXT: vzeroupper
1953 bb2: ; preds = %bb2, %bb
1954 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
1955 %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
1956 %tmp4 = bitcast double* %tmp3 to <4 x double>*
1957 %tmp5 = load <4 x double>, <4 x double>* %tmp4, align 8
1958 %tmp6 = fmul contract <4 x double> %tmp5, %tmp5
1959 %tmp7 = fadd contract <4 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1960 %tmp8 = bitcast double* %tmp3 to <4 x double>*
1961 store <4 x double> %tmp7, <4 x double>* %tmp8, align 8
1962 %tmp9 = add i64 %tmp, 4
1963 %tmp10 = icmp eq i64 %tmp9, 1024
1964 br i1 %tmp10, label %bb11, label %bb2
1966 bb11: ; preds = %bb2
; v4f64 "231" FMA shape: %tmp4 + (%tmp4 * splat 2.0). Verifies the splat
; multiplicand is unfolded into a hoisted vbroadcastsd into ymm0 used by
; vfmadd231pd.
1970 define void @bcast_unfold_fma231_v4f64(double* %arg) {
1971 ; CHECK-LABEL: bcast_unfold_fma231_v4f64:
1972 ; CHECK: # %bb.0: # %bb
1973 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1974 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1975 ; CHECK-NEXT: .p2align 4, 0x90
1976 ; CHECK-NEXT: .LBB57_1: # %bb1
1977 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1978 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
1979 ; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
1980 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1981 ; CHECK-NEXT: addq $32, %rax
1982 ; CHECK-NEXT: jne .LBB57_1
1983 ; CHECK-NEXT: # %bb.2: # %bb10
1984 ; CHECK-NEXT: vzeroupper
1989 bb1: ; preds = %bb1, %bb
1990 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
1991 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1992 %tmp3 = bitcast double* %tmp2 to <4 x double>*
1993 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
1994 %tmp5 = fmul contract <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1995 %tmp6 = fadd contract <4 x double> %tmp4, %tmp5
1996 %tmp7 = bitcast double* %tmp2 to <4 x double>*
1997 store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
1998 %tmp8 = add i64 %tmp, 4
1999 %tmp9 = icmp eq i64 %tmp8, 1024
2000 br i1 %tmp9, label %bb10, label %bb1
2002 bb10: ; preds = %bb1
; v8f64 "213" FMA shape: (%tmp5 * %tmp5) + splat 2.0. Verifies the splat addend
; is unfolded into a loop-invariant vbroadcastsd into zmm0 used by vfmadd213pd.
2006 define void @bcast_unfold_fma213_v8f64(double* %arg) {
2007 ; CHECK-LABEL: bcast_unfold_fma213_v8f64:
2008 ; CHECK: # %bb.0: # %bb
2009 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2010 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2011 ; CHECK-NEXT: .p2align 4, 0x90
2012 ; CHECK-NEXT: .LBB58_1: # %bb2
2013 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2014 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
2015 ; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
2016 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
2017 ; CHECK-NEXT: addq $64, %rax
2018 ; CHECK-NEXT: jne .LBB58_1
2019 ; CHECK-NEXT: # %bb.2: # %bb11
2020 ; CHECK-NEXT: vzeroupper
2025 bb2: ; preds = %bb2, %bb
2026 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
2027 %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
2028 %tmp4 = bitcast double* %tmp3 to <8 x double>*
2029 %tmp5 = load <8 x double>, <8 x double>* %tmp4, align 8
2030 %tmp6 = fmul contract <8 x double> %tmp5, %tmp5
2031 %tmp7 = fadd contract <8 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2032 %tmp8 = bitcast double* %tmp3 to <8 x double>*
2033 store <8 x double> %tmp7, <8 x double>* %tmp8, align 8
2034 %tmp9 = add i64 %tmp, 8
2035 %tmp10 = icmp eq i64 %tmp9, 1024
2036 br i1 %tmp10, label %bb11, label %bb2
2038 bb11: ; preds = %bb2
; v8f64 "231" FMA shape: %tmp4 + (%tmp4 * splat 2.0). Verifies the splat
; multiplicand is unfolded into a hoisted vbroadcastsd into zmm0 used by
; vfmadd231pd.
2042 define void @bcast_unfold_fma231_v8f64(double* %arg) {
2043 ; CHECK-LABEL: bcast_unfold_fma231_v8f64:
2044 ; CHECK: # %bb.0: # %bb
2045 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2046 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2047 ; CHECK-NEXT: .p2align 4, 0x90
2048 ; CHECK-NEXT: .LBB59_1: # %bb1
2049 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2050 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
2051 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
2052 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
2053 ; CHECK-NEXT: addq $64, %rax
2054 ; CHECK-NEXT: jne .LBB59_1
2055 ; CHECK-NEXT: # %bb.2: # %bb10
2056 ; CHECK-NEXT: vzeroupper
2061 bb1: ; preds = %bb1, %bb
2062 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2063 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2064 %tmp3 = bitcast double* %tmp2 to <8 x double>*
2065 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
2066 %tmp5 = fmul contract <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2067 %tmp6 = fadd contract <8 x double> %tmp4, %tmp5
2068 %tmp7 = bitcast double* %tmp2 to <8 x double>*
2069 store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
2070 %tmp8 = add i64 %tmp, 8
2071 %tmp9 = icmp eq i64 %tmp8, 1024
2072 br i1 %tmp9, label %bb10, label %bb1
2074 bb10: ; preds = %bb1
; v4f32 fmax pattern: fcmp ogt + select against splat 2.0 should lower to
; vmaxps with the splat constant unfolded into a single vbroadcastss into xmm0
; hoisted above the loop.
2078 define void @bcast_unfold_fmax_v4f32(float* %arg) {
2079 ; CHECK-LABEL: bcast_unfold_fmax_v4f32:
2080 ; CHECK: # %bb.0: # %bb
2081 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2082 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2083 ; CHECK-NEXT: .p2align 4, 0x90
2084 ; CHECK-NEXT: .LBB60_1: # %bb1
2085 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2086 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
2087 ; CHECK-NEXT: vmaxps %xmm0, %xmm1, %xmm1
2088 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
2089 ; CHECK-NEXT: addq $16, %rax
2090 ; CHECK-NEXT: jne .LBB60_1
2091 ; CHECK-NEXT: # %bb.2: # %bb10
2096 bb1: ; preds = %bb1, %bb
2097 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2098 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2099 %tmp3 = bitcast float* %tmp2 to <4 x float>*
2100 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
; ogt + select(x, 2.0) is the canonical IR spelling of fmax(x, 2.0).
2101 %tmp5 = fcmp ogt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2102 %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2103 %tmp7 = bitcast float* %tmp2 to <4 x float>*
2104 store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
2105 %tmp8 = add i64 %tmp, 4
2106 %tmp9 = icmp eq i64 %tmp8, 1024
2107 br i1 %tmp9, label %bb10, label %bb1
2109 bb10: ; preds = %bb1
; v8f32 fmax pattern: fcmp ogt + select against splat 2.0 -> vmaxps with the
; constant unfolded into a loop-invariant vbroadcastss into ymm0.
2113 define void @bcast_unfold_fmax_v8f32(float* %arg) {
2114 ; CHECK-LABEL: bcast_unfold_fmax_v8f32:
2115 ; CHECK: # %bb.0: # %bb
2116 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2117 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2118 ; CHECK-NEXT: .p2align 4, 0x90
2119 ; CHECK-NEXT: .LBB61_1: # %bb1
2120 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2121 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
2122 ; CHECK-NEXT: vmaxps %ymm0, %ymm1, %ymm1
2123 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
2124 ; CHECK-NEXT: addq $32, %rax
2125 ; CHECK-NEXT: jne .LBB61_1
2126 ; CHECK-NEXT: # %bb.2: # %bb10
2127 ; CHECK-NEXT: vzeroupper
2132 bb1: ; preds = %bb1, %bb
2133 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2134 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2135 %tmp3 = bitcast float* %tmp2 to <8 x float>*
2136 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
2137 %tmp5 = fcmp ogt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2138 %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2139 %tmp7 = bitcast float* %tmp2 to <8 x float>*
2140 store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
2141 %tmp8 = add i64 %tmp, 8
2142 %tmp9 = icmp eq i64 %tmp8, 1024
2143 br i1 %tmp9, label %bb10, label %bb1
2145 bb10: ; preds = %bb1
; v16f32 fmax pattern: fcmp ogt + select against splat 2.0 -> vmaxps with the
; constant unfolded into a loop-invariant vbroadcastss into zmm0.
2149 define void @bcast_unfold_fmax_v16f32(float* %arg) {
2150 ; CHECK-LABEL: bcast_unfold_fmax_v16f32:
2151 ; CHECK: # %bb.0: # %bb
2152 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2153 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2154 ; CHECK-NEXT: .p2align 4, 0x90
2155 ; CHECK-NEXT: .LBB62_1: # %bb1
2156 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2157 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
2158 ; CHECK-NEXT: vmaxps %zmm0, %zmm1, %zmm1
2159 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
2160 ; CHECK-NEXT: addq $64, %rax
2161 ; CHECK-NEXT: jne .LBB62_1
2162 ; CHECK-NEXT: # %bb.2: # %bb10
2163 ; CHECK-NEXT: vzeroupper
2168 bb1: ; preds = %bb1, %bb
2169 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2170 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2171 %tmp3 = bitcast float* %tmp2 to <16 x float>*
2172 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
2173 %tmp5 = fcmp ogt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2174 %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2175 %tmp7 = bitcast float* %tmp2 to <16 x float>*
2176 store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
2177 %tmp8 = add i64 %tmp, 16
2178 %tmp9 = icmp eq i64 %tmp8, 1024
2179 br i1 %tmp9, label %bb10, label %bb1
2181 bb10: ; preds = %bb1
; v2f64 fmax pattern: fcmp ogt + select against splat 2.0 -> vmaxpd. 128-bit
; f64 case: the constant comes from a vmovapd constant-pool load, hoisted out
; of the loop.
2185 define void @bcast_unfold_fmax_v2f64(double* %arg) {
2186 ; CHECK-LABEL: bcast_unfold_fmax_v2f64:
2187 ; CHECK: # %bb.0: # %bb
2188 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2189 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
2190 ; CHECK-NEXT: .p2align 4, 0x90
2191 ; CHECK-NEXT: .LBB63_1: # %bb1
2192 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2193 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
2194 ; CHECK-NEXT: vmaxpd %xmm0, %xmm1, %xmm1
2195 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
2196 ; CHECK-NEXT: addq $16, %rax
2197 ; CHECK-NEXT: jne .LBB63_1
2198 ; CHECK-NEXT: # %bb.2: # %bb10
2203 bb1: ; preds = %bb1, %bb
2204 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2205 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2206 %tmp3 = bitcast double* %tmp2 to <2 x double>*
2207 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
2208 %tmp5 = fcmp ogt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
2209 %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
2210 %tmp7 = bitcast double* %tmp2 to <2 x double>*
2211 store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
2212 %tmp8 = add i64 %tmp, 2
2213 %tmp9 = icmp eq i64 %tmp8, 1024
2214 br i1 %tmp9, label %bb10, label %bb1
2216 bb10: ; preds = %bb1
; v4f64 fmax pattern: fcmp ogt + select against splat 2.0 -> vmaxpd with the
; constant unfolded into a loop-invariant vbroadcastsd into ymm0.
2220 define void @bcast_unfold_fmax_v4f64(double* %arg) {
2221 ; CHECK-LABEL: bcast_unfold_fmax_v4f64:
2222 ; CHECK: # %bb.0: # %bb
2223 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2224 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2225 ; CHECK-NEXT: .p2align 4, 0x90
2226 ; CHECK-NEXT: .LBB64_1: # %bb1
2227 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2228 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
2229 ; CHECK-NEXT: vmaxpd %ymm0, %ymm1, %ymm1
2230 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
2231 ; CHECK-NEXT: addq $32, %rax
2232 ; CHECK-NEXT: jne .LBB64_1
2233 ; CHECK-NEXT: # %bb.2: # %bb10
2234 ; CHECK-NEXT: vzeroupper
2239 bb1: ; preds = %bb1, %bb
2240 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2241 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2242 %tmp3 = bitcast double* %tmp2 to <4 x double>*
2243 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
2244 %tmp5 = fcmp ogt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2245 %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2246 %tmp7 = bitcast double* %tmp2 to <4 x double>*
2247 store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
2248 %tmp8 = add i64 %tmp, 4
2249 %tmp9 = icmp eq i64 %tmp8, 1024
2250 br i1 %tmp9, label %bb10, label %bb1
2252 bb10: ; preds = %bb1
; v8f64 fmax pattern: fcmp ogt + select against splat 2.0 -> vmaxpd with the
; constant unfolded into a loop-invariant vbroadcastsd into zmm0.
2256 define void @bcast_unfold_fmax_v8f64(double* %arg) {
2257 ; CHECK-LABEL: bcast_unfold_fmax_v8f64:
2258 ; CHECK: # %bb.0: # %bb
2259 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2260 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2261 ; CHECK-NEXT: .p2align 4, 0x90
2262 ; CHECK-NEXT: .LBB65_1: # %bb1
2263 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2264 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
2265 ; CHECK-NEXT: vmaxpd %zmm0, %zmm1, %zmm1
2266 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
2267 ; CHECK-NEXT: addq $64, %rax
2268 ; CHECK-NEXT: jne .LBB65_1
2269 ; CHECK-NEXT: # %bb.2: # %bb10
2270 ; CHECK-NEXT: vzeroupper
2275 bb1: ; preds = %bb1, %bb
2276 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2277 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2278 %tmp3 = bitcast double* %tmp2 to <8 x double>*
2279 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
2280 %tmp5 = fcmp ogt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2281 %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2282 %tmp7 = bitcast double* %tmp2 to <8 x double>*
2283 store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
2284 %tmp8 = add i64 %tmp, 8
2285 %tmp9 = icmp eq i64 %tmp8, 1024
2286 br i1 %tmp9, label %bb10, label %bb1
2288 bb10: ; preds = %bb1
; v4f32 fmin pattern: fcmp olt + select against splat 2.0 -> vminps with the
; constant unfolded into a single vbroadcastss into xmm0 hoisted above the loop.
2292 define void @bcast_unfold_fmin_v4f32(float* %arg) {
2293 ; CHECK-LABEL: bcast_unfold_fmin_v4f32:
2294 ; CHECK: # %bb.0: # %bb
2295 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2296 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2297 ; CHECK-NEXT: .p2align 4, 0x90
2298 ; CHECK-NEXT: .LBB66_1: # %bb1
2299 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2300 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
2301 ; CHECK-NEXT: vminps %xmm0, %xmm1, %xmm1
2302 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
2303 ; CHECK-NEXT: addq $16, %rax
2304 ; CHECK-NEXT: jne .LBB66_1
2305 ; CHECK-NEXT: # %bb.2: # %bb10
2310 bb1: ; preds = %bb1, %bb
2311 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2312 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2313 %tmp3 = bitcast float* %tmp2 to <4 x float>*
2314 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
; olt + select(x, 2.0) is the canonical IR spelling of fmin(x, 2.0).
2315 %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2316 %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2317 %tmp7 = bitcast float* %tmp2 to <4 x float>*
2318 store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
2319 %tmp8 = add i64 %tmp, 4
2320 %tmp9 = icmp eq i64 %tmp8, 1024
2321 br i1 %tmp9, label %bb10, label %bb1
2323 bb10: ; preds = %bb1
; v8f32 fmin pattern: fcmp olt + select against splat 2.0 -> vminps with the
; constant unfolded into a loop-invariant vbroadcastss into ymm0.
2327 define void @bcast_unfold_fmin_v8f32(float* %arg) {
2328 ; CHECK-LABEL: bcast_unfold_fmin_v8f32:
2329 ; CHECK: # %bb.0: # %bb
2330 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2331 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2332 ; CHECK-NEXT: .p2align 4, 0x90
2333 ; CHECK-NEXT: .LBB67_1: # %bb1
2334 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2335 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
2336 ; CHECK-NEXT: vminps %ymm0, %ymm1, %ymm1
2337 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
2338 ; CHECK-NEXT: addq $32, %rax
2339 ; CHECK-NEXT: jne .LBB67_1
2340 ; CHECK-NEXT: # %bb.2: # %bb10
2341 ; CHECK-NEXT: vzeroupper
2346 bb1: ; preds = %bb1, %bb
2347 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2348 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2349 %tmp3 = bitcast float* %tmp2 to <8 x float>*
2350 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
2351 %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2352 %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2353 %tmp7 = bitcast float* %tmp2 to <8 x float>*
2354 store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
2355 %tmp8 = add i64 %tmp, 8
2356 %tmp9 = icmp eq i64 %tmp8, 1024
2357 br i1 %tmp9, label %bb10, label %bb1
2359 bb10: ; preds = %bb1
; v16f32 fmin pattern: fcmp olt + select against splat 2.0 -> vminps with the
; constant unfolded into a loop-invariant vbroadcastss into zmm0.
2363 define void @bcast_unfold_fmin_v16f32(float* %arg) {
2364 ; CHECK-LABEL: bcast_unfold_fmin_v16f32:
2365 ; CHECK: # %bb.0: # %bb
2366 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2367 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2368 ; CHECK-NEXT: .p2align 4, 0x90
2369 ; CHECK-NEXT: .LBB68_1: # %bb1
2370 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2371 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
2372 ; CHECK-NEXT: vminps %zmm0, %zmm1, %zmm1
2373 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
2374 ; CHECK-NEXT: addq $64, %rax
2375 ; CHECK-NEXT: jne .LBB68_1
2376 ; CHECK-NEXT: # %bb.2: # %bb10
2377 ; CHECK-NEXT: vzeroupper
2382 bb1: ; preds = %bb1, %bb
2383 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2384 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2385 %tmp3 = bitcast float* %tmp2 to <16 x float>*
2386 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
2387 %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2388 %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2389 %tmp7 = bitcast float* %tmp2 to <16 x float>*
2390 store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
2391 %tmp8 = add i64 %tmp, 16
2392 %tmp9 = icmp eq i64 %tmp8, 1024
2393 br i1 %tmp9, label %bb10, label %bb1
2395 bb10: ; preds = %bb1
; v2f64 fmin pattern: fcmp olt + select against splat 2.0 -> vminpd. 128-bit
; f64 case: the constant comes from a vmovapd constant-pool load, hoisted out
; of the loop.
2399 define void @bcast_unfold_fmin_v2f64(double* %arg) {
2400 ; CHECK-LABEL: bcast_unfold_fmin_v2f64:
2401 ; CHECK: # %bb.0: # %bb
2402 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2403 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
2404 ; CHECK-NEXT: .p2align 4, 0x90
2405 ; CHECK-NEXT: .LBB69_1: # %bb1
2406 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2407 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
2408 ; CHECK-NEXT: vminpd %xmm0, %xmm1, %xmm1
2409 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
2410 ; CHECK-NEXT: addq $16, %rax
2411 ; CHECK-NEXT: jne .LBB69_1
2412 ; CHECK-NEXT: # %bb.2: # %bb10
2417 bb1: ; preds = %bb1, %bb
2418 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2419 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2420 %tmp3 = bitcast double* %tmp2 to <2 x double>*
2421 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
2422 %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
2423 %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
2424 %tmp7 = bitcast double* %tmp2 to <2 x double>*
2425 store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
2426 %tmp8 = add i64 %tmp, 2
2427 %tmp9 = icmp eq i64 %tmp8, 1024
2428 br i1 %tmp9, label %bb10, label %bb1
2430 bb10: ; preds = %bb1
; v4f64 fmin pattern: fcmp olt + select against splat 2.0 -> vminpd with the
; constant unfolded into a loop-invariant vbroadcastsd into ymm0.
2434 define void @bcast_unfold_fmin_v4f64(double* %arg) {
2435 ; CHECK-LABEL: bcast_unfold_fmin_v4f64:
2436 ; CHECK: # %bb.0: # %bb
2437 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2438 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2439 ; CHECK-NEXT: .p2align 4, 0x90
2440 ; CHECK-NEXT: .LBB70_1: # %bb1
2441 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2442 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
2443 ; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm1
2444 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
2445 ; CHECK-NEXT: addq $32, %rax
2446 ; CHECK-NEXT: jne .LBB70_1
2447 ; CHECK-NEXT: # %bb.2: # %bb10
2448 ; CHECK-NEXT: vzeroupper
2453 bb1: ; preds = %bb1, %bb
2454 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2455 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2456 %tmp3 = bitcast double* %tmp2 to <4 x double>*
2457 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
2458 %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2459 %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2460 %tmp7 = bitcast double* %tmp2 to <4 x double>*
2461 store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
2462 %tmp8 = add i64 %tmp, 4
2463 %tmp9 = icmp eq i64 %tmp8, 1024
2464 br i1 %tmp9, label %bb10, label %bb1
2466 bb10: ; preds = %bb1
; v8f64 fmin pattern: fcmp olt + select against splat 2.0 -> vminpd with the
; constant unfolded into a loop-invariant vbroadcastsd into zmm0.
2470 define void @bcast_unfold_fmin_v8f64(double* %arg) {
2471 ; CHECK-LABEL: bcast_unfold_fmin_v8f64:
2472 ; CHECK: # %bb.0: # %bb
2473 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2474 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2475 ; CHECK-NEXT: .p2align 4, 0x90
2476 ; CHECK-NEXT: .LBB71_1: # %bb1
2477 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2478 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
2479 ; CHECK-NEXT: vminpd %zmm0, %zmm1, %zmm1
2480 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
2481 ; CHECK-NEXT: addq $64, %rax
2482 ; CHECK-NEXT: jne .LBB71_1
2483 ; CHECK-NEXT: # %bb.2: # %bb10
2484 ; CHECK-NEXT: vzeroupper
2489 bb1: ; preds = %bb1, %bb
2490 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2491 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2492 %tmp3 = bitcast double* %tmp2 to <8 x double>*
2493 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
2494 %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2495 %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2496 %tmp7 = bitcast double* %tmp2 to <8 x double>*
2497 store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
2498 %tmp8 = add i64 %tmp, 8
2499 %tmp9 = icmp eq i64 %tmp8, 1024
2500 br i1 %tmp9, label %bb10, label %bb1
2502 bb10: ; preds = %bb1
; v4i32 signed-min pattern: icmp slt + select against splat 2 -> vpminsd with
; the splat unfolded into a single vpbroadcastd into xmm0 hoisted above the
; loop; the loaded data is the folded memory operand of vpminsd.
2506 define void @bcast_unfold_smin_v4i32(i32* %arg) {
2507 ; CHECK-LABEL: bcast_unfold_smin_v4i32:
2508 ; CHECK: # %bb.0: # %bb
2509 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2510 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
2511 ; CHECK-NEXT: .p2align 4, 0x90
2512 ; CHECK-NEXT: .LBB72_1: # %bb1
2513 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2514 ; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %xmm0, %xmm1
2515 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
2516 ; CHECK-NEXT: addq $16, %rax
2517 ; CHECK-NEXT: jne .LBB72_1
2518 ; CHECK-NEXT: # %bb.2: # %bb10
2523 bb1: ; preds = %bb1, %bb
2524 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2525 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2526 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
2527 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
; slt + select(x, 2) is the canonical IR spelling of smin(x, 2).
2528 %tmp5 = icmp slt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
2529 %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
2530 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
2531 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
2532 %tmp8 = add i64 %tmp, 4
2533 %tmp9 = icmp eq i64 %tmp8, 1024
2534 br i1 %tmp9, label %bb10, label %bb1
2536 bb10: ; preds = %bb1
; v8i32 signed-min pattern: icmp slt + select against splat 2 -> vpminsd with
; the splat unfolded into a loop-invariant vpbroadcastd into ymm0; the loaded
; data is the folded memory operand of vpminsd.
2540 define void @bcast_unfold_smin_v8i32(i32* %arg) {
2541 ; CHECK-LABEL: bcast_unfold_smin_v8i32:
2542 ; CHECK: # %bb.0: # %bb
2543 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2544 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
2545 ; CHECK-NEXT: .p2align 4, 0x90
2546 ; CHECK-NEXT: .LBB73_1: # %bb1
2547 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2548 ; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %ymm0, %ymm1
2549 ; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
2550 ; CHECK-NEXT: addq $32, %rax
2551 ; CHECK-NEXT: jne .LBB73_1
2552 ; CHECK-NEXT: # %bb.2: # %bb10
2553 ; CHECK-NEXT: vzeroupper
2558 bb1: ; preds = %bb1, %bb
2559 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2560 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2561 %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
2562 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
2563 %tmp5 = icmp slt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2564 %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2565 %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
2566 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
2567 %tmp8 = add i64 %tmp, 8
2568 %tmp9 = icmp eq i64 %tmp8, 1024
2569 br i1 %tmp9, label %bb10, label %bb1
2571 bb10: ; preds = %bb1
2575 define void @bcast_unfold_smin_v16i32(i32* %arg) {
2576 ; CHECK-LABEL: bcast_unfold_smin_v16i32:
2577 ; CHECK: # %bb.0: # %bb
2578 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2579 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2580 ; CHECK-NEXT: .p2align 4, 0x90
2581 ; CHECK-NEXT: .LBB74_1: # %bb1
2582 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2583 ; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %zmm0, %zmm1
2584 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
2585 ; CHECK-NEXT: addq $64, %rax
2586 ; CHECK-NEXT: jne .LBB74_1
2587 ; CHECK-NEXT: # %bb.2: # %bb10
2588 ; CHECK-NEXT: vzeroupper
2593 bb1: ; preds = %bb1, %bb
2594 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2595 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2596 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
2597 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
2598 %tmp5 = icmp slt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2599 %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2600 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
2601 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
2602 %tmp8 = add i64 %tmp, 16
2603 %tmp9 = icmp eq i64 %tmp8, 1024
2604 br i1 %tmp9, label %bb10, label %bb1
2606 bb10: ; preds = %bb1
2610 define void @bcast_unfold_smin_v2i64(i64* %arg) {
2611 ; CHECK-LABEL: bcast_unfold_smin_v2i64:
2612 ; CHECK: # %bb.0: # %bb
2613 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2614 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
2615 ; CHECK-NEXT: .p2align 4, 0x90
2616 ; CHECK-NEXT: .LBB75_1: # %bb1
2617 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2618 ; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %xmm0, %xmm1
2619 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
2620 ; CHECK-NEXT: addq $16, %rax
2621 ; CHECK-NEXT: jne .LBB75_1
2622 ; CHECK-NEXT: # %bb.2: # %bb10
2627 bb1: ; preds = %bb1, %bb
2628 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2629 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2630 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
2631 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
2632 %tmp5 = icmp slt <2 x i64> %tmp4, <i64 2, i64 2>
2633 %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
2634 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
2635 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
2636 %tmp8 = add i64 %tmp, 2
2637 %tmp9 = icmp eq i64 %tmp8, 1024
2638 br i1 %tmp9, label %bb10, label %bb1
2640 bb10: ; preds = %bb1
2644 define void @bcast_unfold_smin_v4i64(i64* %arg) {
2645 ; CHECK-LABEL: bcast_unfold_smin_v4i64:
2646 ; CHECK: # %bb.0: # %bb
2647 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2648 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
2649 ; CHECK-NEXT: .p2align 4, 0x90
2650 ; CHECK-NEXT: .LBB76_1: # %bb1
2651 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2652 ; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %ymm0, %ymm1
2653 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
2654 ; CHECK-NEXT: addq $32, %rax
2655 ; CHECK-NEXT: jne .LBB76_1
2656 ; CHECK-NEXT: # %bb.2: # %bb10
2657 ; CHECK-NEXT: vzeroupper
2662 bb1: ; preds = %bb1, %bb
2663 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2664 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2665 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
2666 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
2667 %tmp5 = icmp slt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
2668 %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
2669 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
2670 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
2671 %tmp8 = add i64 %tmp, 4
2672 %tmp9 = icmp eq i64 %tmp8, 1024
2673 br i1 %tmp9, label %bb10, label %bb1
2675 bb10: ; preds = %bb1
2679 define void @bcast_unfold_smin_v8i64(i64* %arg) {
2680 ; CHECK-LABEL: bcast_unfold_smin_v8i64:
2681 ; CHECK: # %bb.0: # %bb
2682 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2683 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
2684 ; CHECK-NEXT: .p2align 4, 0x90
2685 ; CHECK-NEXT: .LBB77_1: # %bb1
2686 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2687 ; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %zmm0, %zmm1
2688 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
2689 ; CHECK-NEXT: addq $64, %rax
2690 ; CHECK-NEXT: jne .LBB77_1
2691 ; CHECK-NEXT: # %bb.2: # %bb10
2692 ; CHECK-NEXT: vzeroupper
2697 bb1: ; preds = %bb1, %bb
2698 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2699 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2700 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
2701 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
2702 %tmp5 = icmp slt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
2703 %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
2704 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
2705 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
2706 %tmp8 = add i64 %tmp, 8
2707 %tmp9 = icmp eq i64 %tmp8, 1024
2708 br i1 %tmp9, label %bb10, label %bb1
2710 bb10: ; preds = %bb1
; Signed-max tests: icmp sgt + select against a splat of 2 should lower to
; vpmaxsd/vpmaxsq with a folded memory operand and the broadcast hoisted.
; <4 x i32>: xmm vpmaxsd.
2714 define void @bcast_unfold_smax_v4i32(i32* %arg) {
2715 ; CHECK-LABEL: bcast_unfold_smax_v4i32:
2716 ; CHECK: # %bb.0: # %bb
2717 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2718 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
2719 ; CHECK-NEXT: .p2align 4, 0x90
2720 ; CHECK-NEXT: .LBB78_1: # %bb1
2721 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2722 ; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %xmm0, %xmm1
2723 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
2724 ; CHECK-NEXT: addq $16, %rax
2725 ; CHECK-NEXT: jne .LBB78_1
2726 ; CHECK-NEXT: # %bb.2: # %bb10
2731 bb1: ; preds = %bb1, %bb
2732 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2733 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2734 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
2735 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
2736 %tmp5 = icmp sgt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
2737 %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
2738 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
2739 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
2740 %tmp8 = add i64 %tmp, 4
2741 %tmp9 = icmp eq i64 %tmp8, 1024
2742 br i1 %tmp9, label %bb10, label %bb1
2744 bb10: ; preds = %bb1
; <8 x i32>: ymm vpmaxsd; vzeroupper before return.
2748 define void @bcast_unfold_smax_v8i32(i32* %arg) {
2749 ; CHECK-LABEL: bcast_unfold_smax_v8i32:
2750 ; CHECK: # %bb.0: # %bb
2751 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2752 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
2753 ; CHECK-NEXT: .p2align 4, 0x90
2754 ; CHECK-NEXT: .LBB79_1: # %bb1
2755 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2756 ; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %ymm0, %ymm1
2757 ; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
2758 ; CHECK-NEXT: addq $32, %rax
2759 ; CHECK-NEXT: jne .LBB79_1
2760 ; CHECK-NEXT: # %bb.2: # %bb10
2761 ; CHECK-NEXT: vzeroupper
2766 bb1: ; preds = %bb1, %bb
2767 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2768 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2769 %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
2770 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
2771 %tmp5 = icmp sgt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2772 %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2773 %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
2774 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
2775 %tmp8 = add i64 %tmp, 8
2776 %tmp9 = icmp eq i64 %tmp8, 1024
2777 br i1 %tmp9, label %bb10, label %bb1
2779 bb10: ; preds = %bb1
; <16 x i32>: zmm vpmaxsd; store via vmovdqu64.
2783 define void @bcast_unfold_smax_v16i32(i32* %arg) {
2784 ; CHECK-LABEL: bcast_unfold_smax_v16i32:
2785 ; CHECK: # %bb.0: # %bb
2786 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2787 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2788 ; CHECK-NEXT: .p2align 4, 0x90
2789 ; CHECK-NEXT: .LBB80_1: # %bb1
2790 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2791 ; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %zmm0, %zmm1
2792 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
2793 ; CHECK-NEXT: addq $64, %rax
2794 ; CHECK-NEXT: jne .LBB80_1
2795 ; CHECK-NEXT: # %bb.2: # %bb10
2796 ; CHECK-NEXT: vzeroupper
2801 bb1: ; preds = %bb1, %bb
2802 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2803 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2804 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
2805 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
2806 %tmp5 = icmp sgt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2807 %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2808 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
2809 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
2810 %tmp8 = add i64 %tmp, 16
2811 %tmp9 = icmp eq i64 %tmp8, 1024
2812 br i1 %tmp9, label %bb10, label %bb1
2814 bb10: ; preds = %bb1
; <2 x i64>: xmm vpmaxsq (AVX-512VL); 128-bit splat loaded with vmovdqa [2,2].
2818 define void @bcast_unfold_smax_v2i64(i64* %arg) {
2819 ; CHECK-LABEL: bcast_unfold_smax_v2i64:
2820 ; CHECK: # %bb.0: # %bb
2821 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2822 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
2823 ; CHECK-NEXT: .p2align 4, 0x90
2824 ; CHECK-NEXT: .LBB81_1: # %bb1
2825 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2826 ; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %xmm0, %xmm1
2827 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
2828 ; CHECK-NEXT: addq $16, %rax
2829 ; CHECK-NEXT: jne .LBB81_1
2830 ; CHECK-NEXT: # %bb.2: # %bb10
2835 bb1: ; preds = %bb1, %bb
2836 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2837 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2838 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
2839 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
2840 %tmp5 = icmp sgt <2 x i64> %tmp4, <i64 2, i64 2>
2841 %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
2842 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
2843 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
2844 %tmp8 = add i64 %tmp, 2
2845 %tmp9 = icmp eq i64 %tmp8, 1024
2846 br i1 %tmp9, label %bb10, label %bb1
2848 bb10: ; preds = %bb1
; <4 x i64>: ymm vpmaxsq.
2852 define void @bcast_unfold_smax_v4i64(i64* %arg) {
2853 ; CHECK-LABEL: bcast_unfold_smax_v4i64:
2854 ; CHECK: # %bb.0: # %bb
2855 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2856 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
2857 ; CHECK-NEXT: .p2align 4, 0x90
2858 ; CHECK-NEXT: .LBB82_1: # %bb1
2859 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2860 ; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %ymm0, %ymm1
2861 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
2862 ; CHECK-NEXT: addq $32, %rax
2863 ; CHECK-NEXT: jne .LBB82_1
2864 ; CHECK-NEXT: # %bb.2: # %bb10
2865 ; CHECK-NEXT: vzeroupper
2870 bb1: ; preds = %bb1, %bb
2871 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2872 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2873 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
2874 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
2875 %tmp5 = icmp sgt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
2876 %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
2877 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
2878 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
2879 %tmp8 = add i64 %tmp, 4
2880 %tmp9 = icmp eq i64 %tmp8, 1024
2881 br i1 %tmp9, label %bb10, label %bb1
2883 bb10: ; preds = %bb1
; <8 x i64>: zmm vpmaxsq.
2887 define void @bcast_unfold_smax_v8i64(i64* %arg) {
2888 ; CHECK-LABEL: bcast_unfold_smax_v8i64:
2889 ; CHECK: # %bb.0: # %bb
2890 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2891 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
2892 ; CHECK-NEXT: .p2align 4, 0x90
2893 ; CHECK-NEXT: .LBB83_1: # %bb1
2894 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2895 ; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %zmm0, %zmm1
2896 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
2897 ; CHECK-NEXT: addq $64, %rax
2898 ; CHECK-NEXT: jne .LBB83_1
2899 ; CHECK-NEXT: # %bb.2: # %bb10
2900 ; CHECK-NEXT: vzeroupper
2905 bb1: ; preds = %bb1, %bb
2906 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2907 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2908 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
2909 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
2910 %tmp5 = icmp sgt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
2911 %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
2912 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
2913 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
2914 %tmp8 = add i64 %tmp, 8
2915 %tmp9 = icmp eq i64 %tmp8, 1024
2916 br i1 %tmp9, label %bb10, label %bb1
2918 bb10: ; preds = %bb1
; Unsigned-min tests: icmp ult + select against a splat of 2 should lower to
; vpminud/vpminuq with a folded memory operand and the broadcast hoisted.
; <4 x i32>: xmm vpminud.
2922 define void @bcast_unfold_umin_v4i32(i32* %arg) {
2923 ; CHECK-LABEL: bcast_unfold_umin_v4i32:
2924 ; CHECK: # %bb.0: # %bb
2925 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2926 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
2927 ; CHECK-NEXT: .p2align 4, 0x90
2928 ; CHECK-NEXT: .LBB84_1: # %bb1
2929 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2930 ; CHECK-NEXT: vpminud 4096(%rdi,%rax), %xmm0, %xmm1
2931 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
2932 ; CHECK-NEXT: addq $16, %rax
2933 ; CHECK-NEXT: jne .LBB84_1
2934 ; CHECK-NEXT: # %bb.2: # %bb10
2939 bb1: ; preds = %bb1, %bb
2940 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2941 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2942 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
2943 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
2944 %tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
2945 %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
2946 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
2947 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
2948 %tmp8 = add i64 %tmp, 4
2949 %tmp9 = icmp eq i64 %tmp8, 1024
2950 br i1 %tmp9, label %bb10, label %bb1
2952 bb10: ; preds = %bb1
; <8 x i32>: ymm vpminud; vzeroupper before return.
2956 define void @bcast_unfold_umin_v8i32(i32* %arg) {
2957 ; CHECK-LABEL: bcast_unfold_umin_v8i32:
2958 ; CHECK: # %bb.0: # %bb
2959 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2960 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
2961 ; CHECK-NEXT: .p2align 4, 0x90
2962 ; CHECK-NEXT: .LBB85_1: # %bb1
2963 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2964 ; CHECK-NEXT: vpminud 4096(%rdi,%rax), %ymm0, %ymm1
2965 ; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
2966 ; CHECK-NEXT: addq $32, %rax
2967 ; CHECK-NEXT: jne .LBB85_1
2968 ; CHECK-NEXT: # %bb.2: # %bb10
2969 ; CHECK-NEXT: vzeroupper
2974 bb1: ; preds = %bb1, %bb
2975 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2976 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2977 %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
2978 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
2979 %tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2980 %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2981 %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
2982 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
2983 %tmp8 = add i64 %tmp, 8
2984 %tmp9 = icmp eq i64 %tmp8, 1024
2985 br i1 %tmp9, label %bb10, label %bb1
2987 bb10: ; preds = %bb1
; <16 x i32>: zmm vpminud; store via vmovdqu64.
2991 define void @bcast_unfold_umin_v16i32(i32* %arg) {
2992 ; CHECK-LABEL: bcast_unfold_umin_v16i32:
2993 ; CHECK: # %bb.0: # %bb
2994 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2995 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2996 ; CHECK-NEXT: .p2align 4, 0x90
2997 ; CHECK-NEXT: .LBB86_1: # %bb1
2998 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2999 ; CHECK-NEXT: vpminud 4096(%rdi,%rax), %zmm0, %zmm1
3000 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
3001 ; CHECK-NEXT: addq $64, %rax
3002 ; CHECK-NEXT: jne .LBB86_1
3003 ; CHECK-NEXT: # %bb.2: # %bb10
3004 ; CHECK-NEXT: vzeroupper
3009 bb1: ; preds = %bb1, %bb
3010 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3011 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3012 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3013 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3014 %tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3015 %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3016 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3017 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3018 %tmp8 = add i64 %tmp, 16
3019 %tmp9 = icmp eq i64 %tmp8, 1024
3020 br i1 %tmp9, label %bb10, label %bb1
3022 bb10: ; preds = %bb1
; <2 x i64>: xmm vpminuq (AVX-512VL); 128-bit splat loaded with vmovdqa [2,2].
3026 define void @bcast_unfold_umin_v2i64(i64* %arg) {
3027 ; CHECK-LABEL: bcast_unfold_umin_v2i64:
3028 ; CHECK: # %bb.0: # %bb
3029 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3030 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
3031 ; CHECK-NEXT: .p2align 4, 0x90
3032 ; CHECK-NEXT: .LBB87_1: # %bb1
3033 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3034 ; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %xmm0, %xmm1
3035 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
3036 ; CHECK-NEXT: addq $16, %rax
3037 ; CHECK-NEXT: jne .LBB87_1
3038 ; CHECK-NEXT: # %bb.2: # %bb10
3043 bb1: ; preds = %bb1, %bb
3044 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3045 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3046 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3047 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
3048 %tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
3049 %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
3050 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3051 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
3052 %tmp8 = add i64 %tmp, 2
3053 %tmp9 = icmp eq i64 %tmp8, 1024
3054 br i1 %tmp9, label %bb10, label %bb1
3056 bb10: ; preds = %bb1
; <4 x i64>: ymm vpminuq.
3060 define void @bcast_unfold_umin_v4i64(i64* %arg) {
3061 ; CHECK-LABEL: bcast_unfold_umin_v4i64:
3062 ; CHECK: # %bb.0: # %bb
3063 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3064 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
3065 ; CHECK-NEXT: .p2align 4, 0x90
3066 ; CHECK-NEXT: .LBB88_1: # %bb1
3067 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3068 ; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %ymm0, %ymm1
3069 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
3070 ; CHECK-NEXT: addq $32, %rax
3071 ; CHECK-NEXT: jne .LBB88_1
3072 ; CHECK-NEXT: # %bb.2: # %bb10
3073 ; CHECK-NEXT: vzeroupper
3078 bb1: ; preds = %bb1, %bb
3079 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3080 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3081 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3082 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
3083 %tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
3084 %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
3085 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3086 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
3087 %tmp8 = add i64 %tmp, 4
3088 %tmp9 = icmp eq i64 %tmp8, 1024
3089 br i1 %tmp9, label %bb10, label %bb1
3091 bb10: ; preds = %bb1
; <8 x i64>: zmm vpminuq.
3095 define void @bcast_unfold_umin_v8i64(i64* %arg) {
3096 ; CHECK-LABEL: bcast_unfold_umin_v8i64:
3097 ; CHECK: # %bb.0: # %bb
3098 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3099 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
3100 ; CHECK-NEXT: .p2align 4, 0x90
3101 ; CHECK-NEXT: .LBB89_1: # %bb1
3102 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3103 ; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %zmm0, %zmm1
3104 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
3105 ; CHECK-NEXT: addq $64, %rax
3106 ; CHECK-NEXT: jne .LBB89_1
3107 ; CHECK-NEXT: # %bb.2: # %bb10
3108 ; CHECK-NEXT: vzeroupper
3113 bb1: ; preds = %bb1, %bb
3114 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3115 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3116 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3117 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
3118 %tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
3119 %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
3120 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3121 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
3122 %tmp8 = add i64 %tmp, 8
3123 %tmp9 = icmp eq i64 %tmp8, 1024
3124 br i1 %tmp9, label %bb10, label %bb1
3126 bb10: ; preds = %bb1
; Unsigned-max tests: icmp ugt + select against a splat of 2 should lower to
; vpmaxud/vpmaxuq with a folded memory operand and the broadcast hoisted.
; <4 x i32>: xmm vpmaxud.
3130 define void @bcast_unfold_umax_v4i32(i32* %arg) {
3131 ; CHECK-LABEL: bcast_unfold_umax_v4i32:
3132 ; CHECK: # %bb.0: # %bb
3133 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3134 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
3135 ; CHECK-NEXT: .p2align 4, 0x90
3136 ; CHECK-NEXT: .LBB90_1: # %bb1
3137 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3138 ; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %xmm0, %xmm1
3139 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
3140 ; CHECK-NEXT: addq $16, %rax
3141 ; CHECK-NEXT: jne .LBB90_1
3142 ; CHECK-NEXT: # %bb.2: # %bb10
3147 bb1: ; preds = %bb1, %bb
3148 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3149 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3150 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
3151 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
3152 %tmp5 = icmp ugt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
3153 %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
3154 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
3155 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
3156 %tmp8 = add i64 %tmp, 4
3157 %tmp9 = icmp eq i64 %tmp8, 1024
3158 br i1 %tmp9, label %bb10, label %bb1
3160 bb10: ; preds = %bb1
; <8 x i32>: ymm vpmaxud; vzeroupper before return.
3164 define void @bcast_unfold_umax_v8i32(i32* %arg) {
3165 ; CHECK-LABEL: bcast_unfold_umax_v8i32:
3166 ; CHECK: # %bb.0: # %bb
3167 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3168 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
3169 ; CHECK-NEXT: .p2align 4, 0x90
3170 ; CHECK-NEXT: .LBB91_1: # %bb1
3171 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3172 ; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %ymm0, %ymm1
3173 ; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
3174 ; CHECK-NEXT: addq $32, %rax
3175 ; CHECK-NEXT: jne .LBB91_1
3176 ; CHECK-NEXT: # %bb.2: # %bb10
3177 ; CHECK-NEXT: vzeroupper
3182 bb1: ; preds = %bb1, %bb
3183 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3184 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3185 %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
3186 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
3187 %tmp5 = icmp ugt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3188 %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3189 %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
3190 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
3191 %tmp8 = add i64 %tmp, 8
3192 %tmp9 = icmp eq i64 %tmp8, 1024
3193 br i1 %tmp9, label %bb10, label %bb1
3195 bb10: ; preds = %bb1
; <16 x i32>: zmm vpmaxud; store via vmovdqu64.
3199 define void @bcast_unfold_umax_v16i32(i32* %arg) {
3200 ; CHECK-LABEL: bcast_unfold_umax_v16i32:
3201 ; CHECK: # %bb.0: # %bb
3202 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3203 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
3204 ; CHECK-NEXT: .p2align 4, 0x90
3205 ; CHECK-NEXT: .LBB92_1: # %bb1
3206 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3207 ; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %zmm0, %zmm1
3208 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
3209 ; CHECK-NEXT: addq $64, %rax
3210 ; CHECK-NEXT: jne .LBB92_1
3211 ; CHECK-NEXT: # %bb.2: # %bb10
3212 ; CHECK-NEXT: vzeroupper
3217 bb1: ; preds = %bb1, %bb
3218 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3219 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3220 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3221 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3222 %tmp5 = icmp ugt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3223 %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3224 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3225 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3226 %tmp8 = add i64 %tmp, 16
3227 %tmp9 = icmp eq i64 %tmp8, 1024
3228 br i1 %tmp9, label %bb10, label %bb1
3230 bb10: ; preds = %bb1
; <2 x i64>: xmm vpmaxuq (AVX-512VL); 128-bit splat loaded with vmovdqa [2,2].
3234 define void @bcast_unfold_umax_v2i64(i64* %arg) {
3235 ; CHECK-LABEL: bcast_unfold_umax_v2i64:
3236 ; CHECK: # %bb.0: # %bb
3237 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3238 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
3239 ; CHECK-NEXT: .p2align 4, 0x90
3240 ; CHECK-NEXT: .LBB93_1: # %bb1
3241 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3242 ; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %xmm0, %xmm1
3243 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
3244 ; CHECK-NEXT: addq $16, %rax
3245 ; CHECK-NEXT: jne .LBB93_1
3246 ; CHECK-NEXT: # %bb.2: # %bb10
3251 bb1: ; preds = %bb1, %bb
3252 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3253 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3254 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3255 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
3256 %tmp5 = icmp ugt <2 x i64> %tmp4, <i64 2, i64 2>
3257 %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
3258 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3259 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
3260 %tmp8 = add i64 %tmp, 2
3261 %tmp9 = icmp eq i64 %tmp8, 1024
3262 br i1 %tmp9, label %bb10, label %bb1
3264 bb10: ; preds = %bb1
; <4 x i64>: ymm vpmaxuq.
3268 define void @bcast_unfold_umax_v4i64(i64* %arg) {
3269 ; CHECK-LABEL: bcast_unfold_umax_v4i64:
3270 ; CHECK: # %bb.0: # %bb
3271 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3272 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
3273 ; CHECK-NEXT: .p2align 4, 0x90
3274 ; CHECK-NEXT: .LBB94_1: # %bb1
3275 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3276 ; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %ymm0, %ymm1
3277 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
3278 ; CHECK-NEXT: addq $32, %rax
3279 ; CHECK-NEXT: jne .LBB94_1
3280 ; CHECK-NEXT: # %bb.2: # %bb10
3281 ; CHECK-NEXT: vzeroupper
3286 bb1: ; preds = %bb1, %bb
3287 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3288 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3289 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3290 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
3291 %tmp5 = icmp ugt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
3292 %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
3293 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3294 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
3295 %tmp8 = add i64 %tmp, 4
3296 %tmp9 = icmp eq i64 %tmp8, 1024
3297 br i1 %tmp9, label %bb10, label %bb1
3299 bb10: ; preds = %bb1
; <8 x i64>: zmm vpmaxuq.
3303 define void @bcast_unfold_umax_v8i64(i64* %arg) {
3304 ; CHECK-LABEL: bcast_unfold_umax_v8i64:
3305 ; CHECK: # %bb.0: # %bb
3306 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3307 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
3308 ; CHECK-NEXT: .p2align 4, 0x90
3309 ; CHECK-NEXT: .LBB95_1: # %bb1
3310 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3311 ; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %zmm0, %zmm1
3312 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
3313 ; CHECK-NEXT: addq $64, %rax
3314 ; CHECK-NEXT: jne .LBB95_1
3315 ; CHECK-NEXT: # %bb.2: # %bb10
3316 ; CHECK-NEXT: vzeroupper
3321 bb1: ; preds = %bb1, %bb
3322 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3323 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3324 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3325 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
3326 %tmp5 = icmp ugt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
3327 %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
3328 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3329 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
3330 %tmp8 = add i64 %tmp, 8
3331 %tmp9 = icmp eq i64 %tmp8, 1024
3332 br i1 %tmp9, label %bb10, label %bb1
3334 bb10: ; preds = %bb1
; pcmpgt tests: here the select arms are (constant 3, loaded value) rather
; than a min/max pattern, so the expected lowering is a compare-into-mask
; (vpcmpgtd -> %k1) followed by a masked broadcast of the constant 3 merged
; into the loaded vector, i.e. the broadcast stays folded into a masked op.
; <4 x i32>: xmm compare-to-mask + masked vpbroadcastd.
3338 define void @bcast_unfold_pcmpgt_v4i32(i32* %arg) {
3339 ; CHECK-LABEL: bcast_unfold_pcmpgt_v4i32:
3340 ; CHECK: # %bb.0: # %bb
3341 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3342 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
3343 ; CHECK-NEXT: .p2align 4, 0x90
3344 ; CHECK-NEXT: .LBB96_1: # %bb1
3345 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3346 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
3347 ; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
3348 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
3349 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
3350 ; CHECK-NEXT: addq $16, %rax
3351 ; CHECK-NEXT: jne .LBB96_1
3352 ; CHECK-NEXT: # %bb.2: # %bb10
3357 bb1: ; preds = %bb1, %bb
3358 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3359 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3360 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
3361 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
3362 %tmp5 = icmp sgt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
3363 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
3364 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
3365 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
3366 %tmp8 = add i64 %tmp, 4
3367 %tmp9 = icmp eq i64 %tmp8, 1024
3368 br i1 %tmp9, label %bb10, label %bb1
3370 bb10: ; preds = %bb1
; <8 x i32>: ymm compare-to-mask + masked vpbroadcastd.
3374 define void @bcast_unfold_pcmpgt_v8i32(i32* %arg) {
3375 ; CHECK-LABEL: bcast_unfold_pcmpgt_v8i32:
3376 ; CHECK: # %bb.0: # %bb
3377 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3378 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
3379 ; CHECK-NEXT: .p2align 4, 0x90
3380 ; CHECK-NEXT: .LBB97_1: # %bb1
3381 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3382 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1
3383 ; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
3384 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k1}
3385 ; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
3386 ; CHECK-NEXT: addq $32, %rax
3387 ; CHECK-NEXT: jne .LBB97_1
3388 ; CHECK-NEXT: # %bb.2: # %bb10
3389 ; CHECK-NEXT: vzeroupper
3394 bb1: ; preds = %bb1, %bb
3395 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3396 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3397 %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
3398 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
3399 %tmp5 = icmp sgt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3400 %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
3401 %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
3402 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
3403 %tmp8 = add i64 %tmp, 8
3404 %tmp9 = icmp eq i64 %tmp8, 1024
3405 br i1 %tmp9, label %bb10, label %bb1
3407 bb10: ; preds = %bb1
; <16 x i32>: zmm compare-to-mask + masked vpbroadcastd; loads/stores via vmovdqu64.
3411 define void @bcast_unfold_pcmpgt_v16i32(i32* %arg) {
3412 ; CHECK-LABEL: bcast_unfold_pcmpgt_v16i32:
3413 ; CHECK: # %bb.0: # %bb
3414 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3415 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3416 ; CHECK-NEXT: .p2align 4, 0x90
3417 ; CHECK-NEXT: .LBB98_1: # %bb1
3418 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3419 ; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1
3420 ; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
3421 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1}
3422 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
3423 ; CHECK-NEXT: addq $64, %rax
3424 ; CHECK-NEXT: jne .LBB98_1
3425 ; CHECK-NEXT: # %bb.2: # %bb10
3426 ; CHECK-NEXT: vzeroupper
3431 bb1: ; preds = %bb1, %bb
3432 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3433 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3434 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3435 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3436 %tmp5 = icmp sgt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3437 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
3438 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3439 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3440 %tmp8 = add i64 %tmp, 16
3441 %tmp9 = icmp eq i64 %tmp8, 1024
3442 br i1 %tmp9, label %bb10, label %bb1
3444 bb10: ; preds = %bb1
3448 define void @bcast_unfold_pcmpgt_v2i64(i64* %arg) {
3449 ; CHECK-LABEL: bcast_unfold_pcmpgt_v2i64:
3450 ; CHECK: # %bb.0: # %bb
3451 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3452 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1]
3453 ; CHECK-NEXT: .p2align 4, 0x90
3454 ; CHECK-NEXT: .LBB99_1: # %bb1
3455 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3456 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1
3457 ; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
3458 ; CHECK-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k1}
3459 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
3460 ; CHECK-NEXT: addq $16, %rax
3461 ; CHECK-NEXT: jne .LBB99_1
3462 ; CHECK-NEXT: # %bb.2: # %bb10
3467 bb1: ; preds = %bb1, %bb
3468 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3469 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3470 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3471 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
3472 %tmp5 = icmp sgt <2 x i64> %tmp4, <i64 1, i64 1>
3473 %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
3474 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3475 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
3476 %tmp8 = add i64 %tmp, 2
3477 %tmp9 = icmp eq i64 %tmp8, 1024
3478 br i1 %tmp9, label %bb10, label %bb1
3480 bb10: ; preds = %bb1
3483 define void @bcast_unfold_pcmpgt_v4i64(i64* %arg) {
3484 ; CHECK-LABEL: bcast_unfold_pcmpgt_v4i64:
3485 ; CHECK: # %bb.0: # %bb
3486 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3487 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
3488 ; CHECK-NEXT: .p2align 4, 0x90
3489 ; CHECK-NEXT: .LBB100_1: # %bb1
3490 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3491 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
3492 ; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1
3493 ; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
3494 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
3495 ; CHECK-NEXT: addq $32, %rax
3496 ; CHECK-NEXT: jne .LBB100_1
3497 ; CHECK-NEXT: # %bb.2: # %bb10
3498 ; CHECK-NEXT: vzeroupper
3503 bb1: ; preds = %bb1, %bb
3504 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3505 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3506 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3507 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
3508 %tmp5 = icmp sgt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
3509 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
3510 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3511 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
3512 %tmp8 = add i64 %tmp, 4
3513 %tmp9 = icmp eq i64 %tmp8, 1024
3514 br i1 %tmp9, label %bb10, label %bb1
3516 bb10: ; preds = %bb1
3520 define void @bcast_unfold_pcmpgt_v8i64(i64* %arg) {
3521 ; CHECK-LABEL: bcast_unfold_pcmpgt_v8i64:
3522 ; CHECK: # %bb.0: # %bb
3523 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3524 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
3525 ; CHECK-NEXT: .p2align 4, 0x90
3526 ; CHECK-NEXT: .LBB101_1: # %bb1
3527 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3528 ; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1
3529 ; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
3530 ; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k1}
3531 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
3532 ; CHECK-NEXT: addq $64, %rax
3533 ; CHECK-NEXT: jne .LBB101_1
3534 ; CHECK-NEXT: # %bb.2: # %bb10
3535 ; CHECK-NEXT: vzeroupper
3540 bb1: ; preds = %bb1, %bb
3541 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3542 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3543 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3544 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
3545 %tmp5 = icmp sgt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
3546 %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
3547 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3548 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
3549 %tmp8 = add i64 %tmp, 8
3550 %tmp9 = icmp eq i64 %tmp8, 1024
3551 br i1 %tmp9, label %bb10, label %bb1
3553 bb10: ; preds = %bb1
3557 define void @bcast_unfold_pcmpeq_v4i32(i32* %arg) {
3558 ; CHECK-LABEL: bcast_unfold_pcmpeq_v4i32:
3559 ; CHECK: # %bb.0: # %bb
3560 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3561 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
3562 ; CHECK-NEXT: .p2align 4, 0x90
3563 ; CHECK-NEXT: .LBB102_1: # %bb1
3564 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3565 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
3566 ; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
3567 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
3568 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
3569 ; CHECK-NEXT: addq $16, %rax
3570 ; CHECK-NEXT: jne .LBB102_1
3571 ; CHECK-NEXT: # %bb.2: # %bb10
3576 bb1: ; preds = %bb1, %bb
3577 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3578 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3579 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
3580 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
3581 %tmp5 = icmp eq <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
3582 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
3583 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
3584 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
3585 %tmp8 = add i64 %tmp, 4
3586 %tmp9 = icmp eq i64 %tmp8, 1024
3587 br i1 %tmp9, label %bb10, label %bb1
3589 bb10: ; preds = %bb1
3593 define void @bcast_unfold_pcmpeq_v8i32(i32* %arg) {
3594 ; CHECK-LABEL: bcast_unfold_pcmpeq_v8i32:
3595 ; CHECK: # %bb.0: # %bb
3596 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3597 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
3598 ; CHECK-NEXT: .p2align 4, 0x90
3599 ; CHECK-NEXT: .LBB103_1: # %bb1
3600 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3601 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1
3602 ; CHECK-NEXT: vpcmpeqd %ymm0, %ymm1, %k1
3603 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k1}
3604 ; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
3605 ; CHECK-NEXT: addq $32, %rax
3606 ; CHECK-NEXT: jne .LBB103_1
3607 ; CHECK-NEXT: # %bb.2: # %bb10
3608 ; CHECK-NEXT: vzeroupper
3613 bb1: ; preds = %bb1, %bb
3614 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3615 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3616 %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
3617 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
3618 %tmp5 = icmp eq <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3619 %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
3620 %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
3621 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
3622 %tmp8 = add i64 %tmp, 8
3623 %tmp9 = icmp eq i64 %tmp8, 1024
3624 br i1 %tmp9, label %bb10, label %bb1
3626 bb10: ; preds = %bb1
3630 define void @bcast_unfold_pcmpeq_v16i32(i32* %arg) {
3631 ; CHECK-LABEL: bcast_unfold_pcmpeq_v16i32:
3632 ; CHECK: # %bb.0: # %bb
3633 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3634 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3635 ; CHECK-NEXT: .p2align 4, 0x90
3636 ; CHECK-NEXT: .LBB104_1: # %bb1
3637 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3638 ; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1
3639 ; CHECK-NEXT: vpcmpeqd %zmm0, %zmm1, %k1
3640 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1}
3641 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
3642 ; CHECK-NEXT: addq $64, %rax
3643 ; CHECK-NEXT: jne .LBB104_1
3644 ; CHECK-NEXT: # %bb.2: # %bb10
3645 ; CHECK-NEXT: vzeroupper
3650 bb1: ; preds = %bb1, %bb
3651 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3652 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3653 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3654 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3655 %tmp5 = icmp eq <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3656 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
3657 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3658 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3659 %tmp8 = add i64 %tmp, 16
3660 %tmp9 = icmp eq i64 %tmp8, 1024
3661 br i1 %tmp9, label %bb10, label %bb1
3663 bb10: ; preds = %bb1
3667 define void @bcast_unfold_pcmpeq_v2i64(i64* %arg) {
3668 ; CHECK-LABEL: bcast_unfold_pcmpeq_v2i64:
3669 ; CHECK: # %bb.0: # %bb
3670 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3671 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1]
3672 ; CHECK-NEXT: .p2align 4, 0x90
3673 ; CHECK-NEXT: .LBB105_1: # %bb1
3674 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3675 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1
3676 ; CHECK-NEXT: vpcmpeqq %xmm0, %xmm1, %k1
3677 ; CHECK-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k1}
3678 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
3679 ; CHECK-NEXT: addq $16, %rax
3680 ; CHECK-NEXT: jne .LBB105_1
3681 ; CHECK-NEXT: # %bb.2: # %bb10
3686 bb1: ; preds = %bb1, %bb
3687 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3688 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3689 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3690 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
3691 %tmp5 = icmp eq <2 x i64> %tmp4, <i64 1, i64 1>
3692 %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
3693 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3694 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
3695 %tmp8 = add i64 %tmp, 2
3696 %tmp9 = icmp eq i64 %tmp8, 1024
3697 br i1 %tmp9, label %bb10, label %bb1
3699 bb10: ; preds = %bb1
3702 define void @bcast_unfold_pcmpeq_v4i64(i64* %arg) {
3703 ; CHECK-LABEL: bcast_unfold_pcmpeq_v4i64:
3704 ; CHECK: # %bb.0: # %bb
3705 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3706 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
3707 ; CHECK-NEXT: .p2align 4, 0x90
3708 ; CHECK-NEXT: .LBB106_1: # %bb1
3709 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3710 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
3711 ; CHECK-NEXT: vpcmpeqq %ymm0, %ymm1, %k1
3712 ; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
3713 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
3714 ; CHECK-NEXT: addq $32, %rax
3715 ; CHECK-NEXT: jne .LBB106_1
3716 ; CHECK-NEXT: # %bb.2: # %bb10
3717 ; CHECK-NEXT: vzeroupper
3722 bb1: ; preds = %bb1, %bb
3723 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3724 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3725 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3726 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
3727 %tmp5 = icmp eq <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
3728 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
3729 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3730 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
3731 %tmp8 = add i64 %tmp, 4
3732 %tmp9 = icmp eq i64 %tmp8, 1024
3733 br i1 %tmp9, label %bb10, label %bb1
3735 bb10: ; preds = %bb1
3739 define void @bcast_unfold_pcmpeq_v8i64(i64* %arg) {
3740 ; CHECK-LABEL: bcast_unfold_pcmpeq_v8i64:
3741 ; CHECK: # %bb.0: # %bb
3742 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3743 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
3744 ; CHECK-NEXT: .p2align 4, 0x90
3745 ; CHECK-NEXT: .LBB107_1: # %bb1
3746 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3747 ; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1
3748 ; CHECK-NEXT: vpcmpeqq %zmm0, %zmm1, %k1
3749 ; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k1}
3750 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
3751 ; CHECK-NEXT: addq $64, %rax
3752 ; CHECK-NEXT: jne .LBB107_1
3753 ; CHECK-NEXT: # %bb.2: # %bb10
3754 ; CHECK-NEXT: vzeroupper
3759 bb1: ; preds = %bb1, %bb
3760 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3761 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3762 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3763 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
3764 %tmp5 = icmp eq <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
3765 %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
3766 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3767 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
3768 %tmp8 = add i64 %tmp, 8
3769 %tmp9 = icmp eq i64 %tmp8, 1024
3770 br i1 %tmp9, label %bb10, label %bb1
3772 bb10: ; preds = %bb1
3776 define void @bcast_unfold_pcmp_v4i32(i32* %arg) {
3777 ; CHECK-LABEL: bcast_unfold_pcmp_v4i32:
3778 ; CHECK: # %bb.0: # %bb
3779 ; CHECK-NEXT: xorl %eax, %eax
3780 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
3781 ; CHECK-NEXT: .p2align 4, 0x90
3782 ; CHECK-NEXT: .LBB108_1: # %bb1
3783 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3784 ; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1
3785 ; CHECK-NEXT: vpcmpltd %xmm0, %xmm1, %k1
3786 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
3787 ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4)
3788 ; CHECK-NEXT: addq $4, %rax
3789 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3790 ; CHECK-NEXT: jg .LBB108_1
3791 ; CHECK-NEXT: # %bb.2: # %bb10
3796 bb1: ; preds = %bb1, %bb
3797 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3798 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3799 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
3800 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
3801 %tmp5 = icmp slt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
3802 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
3803 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
3804 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
3805 %tmp8 = add i64 %tmp, 4
3806 %tmp9 = icmp slt i64 %tmp8, 1024
3807 br i1 %tmp9, label %bb10, label %bb1
3809 bb10: ; preds = %bb1
3813 define void @bcast_unfold_pcmp_v8i32(i32* %arg) {
3814 ; CHECK-LABEL: bcast_unfold_pcmp_v8i32:
3815 ; CHECK: # %bb.0: # %bb
3816 ; CHECK-NEXT: xorl %eax, %eax
3817 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
3818 ; CHECK-NEXT: .p2align 4, 0x90
3819 ; CHECK-NEXT: .LBB109_1: # %bb1
3820 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3821 ; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1
3822 ; CHECK-NEXT: vpcmpltd %ymm0, %ymm1, %k1
3823 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k1}
3824 ; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4)
3825 ; CHECK-NEXT: addq $8, %rax
3826 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3827 ; CHECK-NEXT: jg .LBB109_1
3828 ; CHECK-NEXT: # %bb.2: # %bb10
3829 ; CHECK-NEXT: vzeroupper
3834 bb1: ; preds = %bb1, %bb
3835 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3836 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3837 %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
3838 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
3839 %tmp5 = icmp slt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3840 %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
3841 %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
3842 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
3843 %tmp8 = add i64 %tmp, 8
3844 %tmp9 = icmp slt i64 %tmp8, 1024
3845 br i1 %tmp9, label %bb10, label %bb1
3847 bb10: ; preds = %bb1
3851 define void @bcast_unfold_pcmp_v16i32(i32* %arg) {
3852 ; CHECK-LABEL: bcast_unfold_pcmp_v16i32:
3853 ; CHECK: # %bb.0: # %bb
3854 ; CHECK-NEXT: xorl %eax, %eax
3855 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3856 ; CHECK-NEXT: .p2align 4, 0x90
3857 ; CHECK-NEXT: .LBB110_1: # %bb1
3858 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3859 ; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1
3860 ; CHECK-NEXT: vpcmpltd %zmm0, %zmm1, %k1
3861 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1}
3862 ; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4)
3863 ; CHECK-NEXT: addq $16, %rax
3864 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3865 ; CHECK-NEXT: jg .LBB110_1
3866 ; CHECK-NEXT: # %bb.2: # %bb10
3867 ; CHECK-NEXT: vzeroupper
3872 bb1: ; preds = %bb1, %bb
3873 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3874 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3875 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3876 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3877 %tmp5 = icmp slt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3878 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
3879 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3880 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3881 %tmp8 = add i64 %tmp, 16
3882 %tmp9 = icmp slt i64 %tmp8, 1024
3883 br i1 %tmp9, label %bb10, label %bb1
3885 bb10: ; preds = %bb1
3889 define void @bcast_unfold_pcmp_v2i64(i64* %arg) {
3890 ; CHECK-LABEL: bcast_unfold_pcmp_v2i64:
3891 ; CHECK: # %bb.0: # %bb
3892 ; CHECK-NEXT: xorl %eax, %eax
3893 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1]
3894 ; CHECK-NEXT: .p2align 4, 0x90
3895 ; CHECK-NEXT: .LBB111_1: # %bb1
3896 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3897 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1
3898 ; CHECK-NEXT: vpcmpltq %xmm0, %xmm1, %k1
3899 ; CHECK-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k1}
3900 ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8)
3901 ; CHECK-NEXT: addq $2, %rax
3902 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3903 ; CHECK-NEXT: jg .LBB111_1
3904 ; CHECK-NEXT: # %bb.2: # %bb10
3909 bb1: ; preds = %bb1, %bb
3910 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3911 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3912 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3913 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
3914 %tmp5 = icmp slt <2 x i64> %tmp4, <i64 1, i64 1>
3915 %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
3916 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3917 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
3918 %tmp8 = add i64 %tmp, 2
3919 %tmp9 = icmp slt i64 %tmp8, 1024
3920 br i1 %tmp9, label %bb10, label %bb1
3922 bb10: ; preds = %bb1
3925 define void @bcast_unfold_pcmp_v4i64(i64* %arg) {
3926 ; CHECK-LABEL: bcast_unfold_pcmp_v4i64:
3927 ; CHECK: # %bb.0: # %bb
3928 ; CHECK-NEXT: xorl %eax, %eax
3929 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
3930 ; CHECK-NEXT: .p2align 4, 0x90
3931 ; CHECK-NEXT: .LBB112_1: # %bb1
3932 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3933 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1
3934 ; CHECK-NEXT: vpcmpltq %ymm0, %ymm1, %k1
3935 ; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
3936 ; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8)
3937 ; CHECK-NEXT: addq $4, %rax
3938 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3939 ; CHECK-NEXT: jg .LBB112_1
3940 ; CHECK-NEXT: # %bb.2: # %bb10
3941 ; CHECK-NEXT: vzeroupper
3946 bb1: ; preds = %bb1, %bb
3947 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3948 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3949 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3950 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
3951 %tmp5 = icmp slt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
3952 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
3953 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3954 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
3955 %tmp8 = add i64 %tmp, 4
3956 %tmp9 = icmp slt i64 %tmp8, 1024
3957 br i1 %tmp9, label %bb10, label %bb1
3959 bb10: ; preds = %bb1
3963 define void @bcast_unfold_pcmp_v8i64(i64* %arg) {
3964 ; CHECK-LABEL: bcast_unfold_pcmp_v8i64:
3965 ; CHECK: # %bb.0: # %bb
3966 ; CHECK-NEXT: xorl %eax, %eax
3967 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
3968 ; CHECK-NEXT: .p2align 4, 0x90
3969 ; CHECK-NEXT: .LBB113_1: # %bb1
3970 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3971 ; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1
3972 ; CHECK-NEXT: vpcmpltq %zmm0, %zmm1, %k1
3973 ; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k1}
3974 ; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8)
3975 ; CHECK-NEXT: addq $8, %rax
3976 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3977 ; CHECK-NEXT: jg .LBB113_1
3978 ; CHECK-NEXT: # %bb.2: # %bb10
3979 ; CHECK-NEXT: vzeroupper
3984 bb1: ; preds = %bb1, %bb
3985 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3986 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3987 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3988 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
3989 %tmp5 = icmp slt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
3990 %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
3991 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3992 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
3993 %tmp8 = add i64 %tmp, 8
3994 %tmp9 = icmp slt i64 %tmp8, 1024
3995 br i1 %tmp9, label %bb10, label %bb1
3997 bb10: ; preds = %bb1
4001 define void @bcast_unfold_pcmpu_v4i32(i32* %arg) {
4002 ; CHECK-LABEL: bcast_unfold_pcmpu_v4i32:
4003 ; CHECK: # %bb.0: # %bb
4004 ; CHECK-NEXT: xorl %eax, %eax
4005 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
4006 ; CHECK-NEXT: .p2align 4, 0x90
4007 ; CHECK-NEXT: .LBB114_1: # %bb1
4008 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4009 ; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1
4010 ; CHECK-NEXT: vpcmpltud %xmm0, %xmm1, %k1
4011 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
4012 ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4)
4013 ; CHECK-NEXT: addq $4, %rax
4014 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4015 ; CHECK-NEXT: ja .LBB114_1
4016 ; CHECK-NEXT: # %bb.2: # %bb10
4021 bb1: ; preds = %bb1, %bb
4022 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4023 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4024 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
4025 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
4026 %tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
4027 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
4028 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
4029 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
4030 %tmp8 = add i64 %tmp, 4
4031 %tmp9 = icmp ult i64 %tmp8, 1024
4032 br i1 %tmp9, label %bb10, label %bb1
4034 bb10: ; preds = %bb1
4038 define void @bcast_unfold_pcmpu_v8i32(i32* %arg) {
4039 ; CHECK-LABEL: bcast_unfold_pcmpu_v8i32:
4040 ; CHECK: # %bb.0: # %bb
4041 ; CHECK-NEXT: xorl %eax, %eax
4042 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
4043 ; CHECK-NEXT: .p2align 4, 0x90
4044 ; CHECK-NEXT: .LBB115_1: # %bb1
4045 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4046 ; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1
4047 ; CHECK-NEXT: vpcmpltud %ymm0, %ymm1, %k1
4048 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k1}
4049 ; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4)
4050 ; CHECK-NEXT: addq $8, %rax
4051 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4052 ; CHECK-NEXT: ja .LBB115_1
4053 ; CHECK-NEXT: # %bb.2: # %bb10
4054 ; CHECK-NEXT: vzeroupper
4059 bb1: ; preds = %bb1, %bb
4060 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4061 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4062 %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
4063 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
4064 %tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
4065 %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
4066 %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
4067 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
4068 %tmp8 = add i64 %tmp, 8
4069 %tmp9 = icmp ult i64 %tmp8, 1024
4070 br i1 %tmp9, label %bb10, label %bb1
4072 bb10: ; preds = %bb1
4076 define void @bcast_unfold_pcmpu_v16i32(i32* %arg) {
4077 ; CHECK-LABEL: bcast_unfold_pcmpu_v16i32:
4078 ; CHECK: # %bb.0: # %bb
4079 ; CHECK-NEXT: xorl %eax, %eax
4080 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
4081 ; CHECK-NEXT: .p2align 4, 0x90
4082 ; CHECK-NEXT: .LBB116_1: # %bb1
4083 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4084 ; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1
4085 ; CHECK-NEXT: vpcmpltud %zmm0, %zmm1, %k1
4086 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1}
4087 ; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4)
4088 ; CHECK-NEXT: addq $16, %rax
4089 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4090 ; CHECK-NEXT: ja .LBB116_1
4091 ; CHECK-NEXT: # %bb.2: # %bb10
4092 ; CHECK-NEXT: vzeroupper
4097 bb1: ; preds = %bb1, %bb
4098 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4099 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4100 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
4101 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
4102 %tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
4103 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
4104 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
4105 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
4106 %tmp8 = add i64 %tmp, 16
4107 %tmp9 = icmp ult i64 %tmp8, 1024
4108 br i1 %tmp9, label %bb10, label %bb1
4110 bb10: ; preds = %bb1
4114 define void @bcast_unfold_pcmpu_v2i64(i64* %arg) {
4115 ; CHECK-LABEL: bcast_unfold_pcmpu_v2i64:
4116 ; CHECK: # %bb.0: # %bb
4117 ; CHECK-NEXT: xorl %eax, %eax
4118 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
4119 ; CHECK-NEXT: .p2align 4, 0x90
4120 ; CHECK-NEXT: .LBB117_1: # %bb1
4121 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4122 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1
4123 ; CHECK-NEXT: vpcmpltuq %xmm0, %xmm1, %k1
4124 ; CHECK-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k1}
4125 ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8)
4126 ; CHECK-NEXT: addq $2, %rax
4127 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4128 ; CHECK-NEXT: ja .LBB117_1
4129 ; CHECK-NEXT: # %bb.2: # %bb10
4134 bb1: ; preds = %bb1, %bb
4135 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4136 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4137 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
4138 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
4139 %tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
4140 %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
4141 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
4142 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
4143 %tmp8 = add i64 %tmp, 2
4144 %tmp9 = icmp ult i64 %tmp8, 1024
4145 br i1 %tmp9, label %bb10, label %bb1
4147 bb10: ; preds = %bb1
4150 define void @bcast_unfold_pcmpu_v4i64(i64* %arg) {
4151 ; CHECK-LABEL: bcast_unfold_pcmpu_v4i64:
4152 ; CHECK: # %bb.0: # %bb
4153 ; CHECK-NEXT: xorl %eax, %eax
4154 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
4155 ; CHECK-NEXT: .p2align 4, 0x90
4156 ; CHECK-NEXT: .LBB118_1: # %bb1
4157 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4158 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1
4159 ; CHECK-NEXT: vpcmpltuq %ymm0, %ymm1, %k1
4160 ; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
4161 ; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8)
4162 ; CHECK-NEXT: addq $4, %rax
4163 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4164 ; CHECK-NEXT: ja .LBB118_1
4165 ; CHECK-NEXT: # %bb.2: # %bb10
4166 ; CHECK-NEXT: vzeroupper
4171 bb1: ; preds = %bb1, %bb
4172 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4173 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4174 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
4175 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
4176 %tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
4177 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
4178 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
4179 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
4180 %tmp8 = add i64 %tmp, 4
4181 %tmp9 = icmp ult i64 %tmp8, 1024
4182 br i1 %tmp9, label %bb10, label %bb1
4184 bb10: ; preds = %bb1
4188 define void @bcast_unfold_pcmpu_v8i64(i64* %arg) {
4189 ; CHECK-LABEL: bcast_unfold_pcmpu_v8i64:
4190 ; CHECK: # %bb.0: # %bb
4191 ; CHECK-NEXT: xorl %eax, %eax
4192 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
4193 ; CHECK-NEXT: .p2align 4, 0x90
4194 ; CHECK-NEXT: .LBB119_1: # %bb1
4195 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4196 ; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1
4197 ; CHECK-NEXT: vpcmpltuq %zmm0, %zmm1, %k1
4198 ; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k1}
4199 ; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8)
4200 ; CHECK-NEXT: addq $8, %rax
4201 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4202 ; CHECK-NEXT: ja .LBB119_1
4203 ; CHECK-NEXT: # %bb.2: # %bb10
4204 ; CHECK-NEXT: vzeroupper
4209 bb1: ; preds = %bb1, %bb
4210 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4211 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4212 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
4213 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
4214 %tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
4215 %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
4216 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
4217 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
4218 %tmp8 = add i64 %tmp, 8
4219 %tmp9 = icmp ult i64 %tmp8, 1024
4220 br i1 %tmp9, label %bb10, label %bb1
4222 bb10: ; preds = %bb1
4226 define void @bcast_unfold_cmp_v4f32(float* %arg) {
4227 ; CHECK-LABEL: bcast_unfold_cmp_v4f32:
4228 ; CHECK: # %bb.0: # %bb
4229 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4230 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4231 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4232 ; CHECK-NEXT: .p2align 4, 0x90
4233 ; CHECK-NEXT: .LBB120_1: # %bb1
4234 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4235 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm2
4236 ; CHECK-NEXT: vcmpltps %xmm0, %xmm2, %k1
4237 ; CHECK-NEXT: vblendmps %xmm2, %xmm1, %xmm2 {%k1}
4238 ; CHECK-NEXT: vmovups %xmm2, 4096(%rdi,%rax)
4239 ; CHECK-NEXT: addq $16, %rax
4240 ; CHECK-NEXT: jne .LBB120_1
4241 ; CHECK-NEXT: # %bb.2: # %bb10
4246 bb1: ; preds = %bb1, %bb
4247 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4248 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
4249 %tmp3 = bitcast float* %tmp2 to <4 x float>*
4250 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
4251 %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4252 %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4253 %tmp7 = bitcast float* %tmp2 to <4 x float>*
4254 store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
4255 %tmp8 = add i64 %tmp, 4
4256 %tmp9 = icmp eq i64 %tmp8, 1024
4257 br i1 %tmp9, label %bb10, label %bb1
4259 bb10: ; preds = %bb1
; bcast_unfold_cmp_v8f32: 256-bit (ymm) variant of the float clamp loop.
; Same pattern as the v4f32 test but 8 lanes per iteration; expects both splat
; constants unfolded into vbroadcastss registers outside the loop, and
; vzeroupper before returning since ymm registers were used.
; NOTE(review): entry block / ret lines appear elided in this excerpt.
4263 define void @bcast_unfold_cmp_v8f32(float* %arg) {
4264 ; CHECK-LABEL: bcast_unfold_cmp_v8f32:
4265 ; CHECK: # %bb.0: # %bb
4266 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4267 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4268 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4269 ; CHECK-NEXT: .p2align 4, 0x90
4270 ; CHECK-NEXT: .LBB121_1: # %bb1
4271 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4272 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm2
4273 ; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %k1
4274 ; CHECK-NEXT: vblendmps %ymm2, %ymm1, %ymm2 {%k1}
4275 ; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax)
4276 ; CHECK-NEXT: addq $32, %rax
4277 ; CHECK-NEXT: jne .LBB121_1
4278 ; CHECK-NEXT: # %bb.2: # %bb10
4279 ; CHECK-NEXT: vzeroupper
; Loop body: load 8 floats, x = (x < 2.0) ? x : 3.0, store back.
4284 bb1: ; preds = %bb1, %bb
4285 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4286 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
4287 %tmp3 = bitcast float* %tmp2 to <8 x float>*
4288 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
4289 %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4290 %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4291 %tmp7 = bitcast float* %tmp2 to <8 x float>*
4292 store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
4293 %tmp8 = add i64 %tmp, 8
; Exit after 1024 elements (128 iterations of 8).
4294 %tmp9 = icmp eq i64 %tmp8, 1024
4295 br i1 %tmp9, label %bb10, label %bb1
4297 bb10: ; preds = %bb1
; bcast_unfold_cmp_v16f32: 512-bit (zmm) variant of the float clamp loop.
; 16 lanes per iteration; expects both splats as zmm register broadcasts
; outside the loop (vbroadcastss), vcmpltps + masked vblendmps inside, and
; vzeroupper on exit.
; NOTE(review): entry block / ret lines appear elided in this excerpt.
4301 define void @bcast_unfold_cmp_v16f32(float* %arg) {
4302 ; CHECK-LABEL: bcast_unfold_cmp_v16f32:
4303 ; CHECK: # %bb.0: # %bb
4304 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4305 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4306 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4307 ; CHECK-NEXT: .p2align 4, 0x90
4308 ; CHECK-NEXT: .LBB122_1: # %bb1
4309 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4310 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm2
4311 ; CHECK-NEXT: vcmpltps %zmm0, %zmm2, %k1
4312 ; CHECK-NEXT: vblendmps %zmm2, %zmm1, %zmm2 {%k1}
4313 ; CHECK-NEXT: vmovups %zmm2, 4096(%rdi,%rax)
4314 ; CHECK-NEXT: addq $64, %rax
4315 ; CHECK-NEXT: jne .LBB122_1
4316 ; CHECK-NEXT: # %bb.2: # %bb10
4317 ; CHECK-NEXT: vzeroupper
; Loop body: load 16 floats, x = (x < 2.0) ? x : 3.0, store back.
4322 bb1: ; preds = %bb1, %bb
4323 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4324 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
4325 %tmp3 = bitcast float* %tmp2 to <16 x float>*
4326 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
4327 %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4328 %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4329 %tmp7 = bitcast float* %tmp2 to <16 x float>*
4330 store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
4331 %tmp8 = add i64 %tmp, 16
; Exit after 1024 elements (64 iterations of 16).
4332 %tmp9 = icmp eq i64 %tmp8, 1024
4333 br i1 %tmp9, label %bb10, label %bb1
4335 bb10: ; preds = %bb1
; bcast_unfold_cmp_v2f64: 128-bit double clamp loop.
; Only 2 lanes, so the CHECK lines expect plain vmovapd constant-pool loads
; (no vbroadcastsd of a 64-bit element at xmm width here) feeding
; vcmpltpd + masked vblendmpd.
; NOTE(review): entry block / ret lines appear elided in this excerpt.
4339 define void @bcast_unfold_cmp_v2f64(double* %arg) {
4340 ; CHECK-LABEL: bcast_unfold_cmp_v2f64:
4341 ; CHECK: # %bb.0: # %bb
4342 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
4343 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
4344 ; CHECK-NEXT: vmovapd {{.*#+}} xmm1 = [3.0E+0,3.0E+0]
4345 ; CHECK-NEXT: .p2align 4, 0x90
4346 ; CHECK-NEXT: .LBB123_1: # %bb1
4347 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4348 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm2
4349 ; CHECK-NEXT: vcmpltpd %xmm0, %xmm2, %k1
4350 ; CHECK-NEXT: vblendmpd %xmm2, %xmm1, %xmm2 {%k1}
4351 ; CHECK-NEXT: vmovupd %xmm2, 8192(%rdi,%rax)
4352 ; CHECK-NEXT: addq $16, %rax
4353 ; CHECK-NEXT: jne .LBB123_1
4354 ; CHECK-NEXT: # %bb.2: # %bb10
; Loop body: load 2 doubles, x = (x < 2.0) ? x : 3.0, store back.
4359 bb1: ; preds = %bb1, %bb
4360 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4361 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
4362 %tmp3 = bitcast double* %tmp2 to <2 x double>*
4363 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
4364 %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
4365 %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 3.000000e+00, double 3.000000e+00>
4366 %tmp7 = bitcast double* %tmp2 to <2 x double>*
4367 store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
4368 %tmp8 = add i64 %tmp, 2
; Exit after 1024 elements (512 iterations of 2).
4369 %tmp9 = icmp eq i64 %tmp8, 1024
4370 br i1 %tmp9, label %bb10, label %bb1
4372 bb10: ; preds = %bb1
; bcast_unfold_cmp_v4f64: 256-bit (ymm) double clamp loop.
; Expects the 2.0/3.0 splats unfolded to vbroadcastsd registers outside the
; loop, vcmpltpd + masked vblendmpd inside, and vzeroupper on exit.
; NOTE(review): entry block / ret lines appear elided in this excerpt.
4376 define void @bcast_unfold_cmp_v4f64(double* %arg) {
4377 ; CHECK-LABEL: bcast_unfold_cmp_v4f64:
4378 ; CHECK: # %bb.0: # %bb
4379 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
4380 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4381 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4382 ; CHECK-NEXT: .p2align 4, 0x90
4383 ; CHECK-NEXT: .LBB124_1: # %bb1
4384 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4385 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm2
4386 ; CHECK-NEXT: vcmpltpd %ymm0, %ymm2, %k1
4387 ; CHECK-NEXT: vblendmpd %ymm2, %ymm1, %ymm2 {%k1}
4388 ; CHECK-NEXT: vmovupd %ymm2, 8192(%rdi,%rax)
4389 ; CHECK-NEXT: addq $32, %rax
4390 ; CHECK-NEXT: jne .LBB124_1
4391 ; CHECK-NEXT: # %bb.2: # %bb10
4392 ; CHECK-NEXT: vzeroupper
; Loop body: load 4 doubles, x = (x < 2.0) ? x : 3.0, store back.
4397 bb1: ; preds = %bb1, %bb
4398 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4399 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
4400 %tmp3 = bitcast double* %tmp2 to <4 x double>*
4401 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
4402 %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
4403 %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
4404 %tmp7 = bitcast double* %tmp2 to <4 x double>*
4405 store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
4406 %tmp8 = add i64 %tmp, 4
; Exit after 1024 elements (256 iterations of 4).
4407 %tmp9 = icmp eq i64 %tmp8, 1024
4408 br i1 %tmp9, label %bb10, label %bb1
4410 bb10: ; preds = %bb1
; bcast_unfold_cmp_v8f64: 512-bit (zmm) double clamp loop.
; Expects the 2.0/3.0 splats unfolded to vbroadcastsd zmm registers outside
; the loop, vcmpltpd + masked vblendmpd inside, and vzeroupper on exit.
; NOTE(review): entry block / ret lines appear elided in this excerpt.
4414 define void @bcast_unfold_cmp_v8f64(double* %arg) {
4415 ; CHECK-LABEL: bcast_unfold_cmp_v8f64:
4416 ; CHECK: # %bb.0: # %bb
4417 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
4418 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4419 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4420 ; CHECK-NEXT: .p2align 4, 0x90
4421 ; CHECK-NEXT: .LBB125_1: # %bb1
4422 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4423 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm2
4424 ; CHECK-NEXT: vcmpltpd %zmm0, %zmm2, %k1
4425 ; CHECK-NEXT: vblendmpd %zmm2, %zmm1, %zmm2 {%k1}
4426 ; CHECK-NEXT: vmovupd %zmm2, 8192(%rdi,%rax)
4427 ; CHECK-NEXT: addq $64, %rax
4428 ; CHECK-NEXT: jne .LBB125_1
4429 ; CHECK-NEXT: # %bb.2: # %bb10
4430 ; CHECK-NEXT: vzeroupper
; Loop body: load 8 doubles, x = (x < 2.0) ? x : 3.0, store back.
4435 bb1: ; preds = %bb1, %bb
4436 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4437 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
4438 %tmp3 = bitcast double* %tmp2 to <8 x double>*
4439 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
4440 %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
4441 %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
4442 %tmp7 = bitcast double* %tmp2 to <8 x double>*
4443 store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
4444 %tmp8 = add i64 %tmp, 8
; Exit after 1024 elements (128 iterations of 8).
4445 %tmp9 = icmp eq i64 %tmp8, 1024
4446 br i1 %tmp9, label %bb10, label %bb1
4448 bb10: ; preds = %bb1
; bcast_unfold_cmp_v8f32_refold: both select arms are constant splats (4.0 and
; 3.0), so after unfolding, one broadcast can be "re-folded" back into the
; masked blend as a {1to8} broadcast memory operand — the CHECK lines expect
; vcmpgtps with a folded memory compare and vblendmps {{.*}}(%rip){1to8}.
; Uses unnamed (numbered) values/blocks rather than the bb/tmp names above.
; NOTE(review): the "%10 = add" induction-step line (and entry/exit blocks)
; appear elided in this excerpt of the autogenerated test.
4452 define void @bcast_unfold_cmp_v8f32_refold(float* nocapture %0) {
4453 ; CHECK-LABEL: bcast_unfold_cmp_v8f32_refold:
4455 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4456 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4457 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4458 ; CHECK-NEXT: .p2align 4, 0x90
4459 ; CHECK-NEXT: .LBB126_1: # =>This Inner Loop Header: Depth=1
4460 ; CHECK-NEXT: vcmpgtps 4096(%rdi,%rax), %ymm0, %k1
4461 ; CHECK-NEXT: vblendmps {{.*}}(%rip){1to8}, %ymm1, %ymm2 {%k1}
4462 ; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax)
4463 ; CHECK-NEXT: addq $32, %rax
4464 ; CHECK-NEXT: jne .LBB126_1
4465 ; CHECK-NEXT: # %bb.2:
4466 ; CHECK-NEXT: vzeroupper
; Loop body: load 8 floats, store (x < 2.0) ? splat(4.0) : splat(3.0).
4471 %3 = phi i64 [ 0, %1 ], [ %10, %2 ]
4472 %4 = getelementptr inbounds float, float* %0, i64 %3
4473 %5 = bitcast float* %4 to <8 x float>*
4474 %6 = load <8 x float>, <8 x float>* %5, align 4
4475 %7 = fcmp olt <8 x float> %6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4476 %8 = select <8 x i1> %7, <8 x float> <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4477 %9 = bitcast float* %4 to <8 x float>*
4478 store <8 x float> %8, <8 x float>* %9, align 4
4480 %11 = icmp eq i64 %10, 1024
4481 br i1 %11, label %12, label %2
; bcast_unfold_ptestm_v4i32: (x & splat(2)) != 0 test-mask pattern, i32 lanes.
; The and+icmp ne selects as a vptestmd against an unfolded vpbroadcastd
; register; lanes passing the test are overwritten with 3 via a masked
; vpbroadcastd from the constant pool.
; NOTE(review): entry block / ret lines appear elided in this excerpt.
4487 define void @bcast_unfold_ptestm_v4i32(i32* %arg) {
4488 ; CHECK-LABEL: bcast_unfold_ptestm_v4i32:
4489 ; CHECK: # %bb.0: # %bb
4490 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4491 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
4492 ; CHECK-NEXT: .p2align 4, 0x90
4493 ; CHECK-NEXT: .LBB127_1: # %bb1
4494 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4495 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
4496 ; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k1
4497 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
4498 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
4499 ; CHECK-NEXT: addq $16, %rax
4500 ; CHECK-NEXT: jne .LBB127_1
4501 ; CHECK-NEXT: # %bb.2: # %bb10
; Loop body: x = ((x & 2) != 0) ? 3 : x, 4 lanes at a time.
4506 bb1: ; preds = %bb1, %bb
4507 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4508 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4509 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
4510 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
4511 %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
4512 %tmp5 = icmp ne <4 x i32> %tmp4b, zeroinitializer
4513 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
4514 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
4515 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
4516 %tmp8 = add i64 %tmp, 4
; Exit after 1024 elements.
4517 %tmp9 = icmp eq i64 %tmp8, 1024
4518 br i1 %tmp9, label %bb10, label %bb1
4520 bb10: ; preds = %bb1
; bcast_unfold_ptestnm_v4i32: same as the ptestm_v4i32 test but with
; icmp eq (test-for-zero), which selects the complementary vptestnmd
; instruction against the unfolded vpbroadcastd register.
; NOTE(review): entry block / ret lines appear elided in this excerpt.
4524 define void @bcast_unfold_ptestnm_v4i32(i32* %arg) {
4525 ; CHECK-LABEL: bcast_unfold_ptestnm_v4i32:
4526 ; CHECK: # %bb.0: # %bb
4527 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4528 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
4529 ; CHECK-NEXT: .p2align 4, 0x90
4530 ; CHECK-NEXT: .LBB128_1: # %bb1
4531 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4532 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
4533 ; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k1
4534 ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
4535 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
4536 ; CHECK-NEXT: addq $16, %rax
4537 ; CHECK-NEXT: jne .LBB128_1
4538 ; CHECK-NEXT: # %bb.2: # %bb10
; Loop body: x = ((x & 2) == 0) ? 3 : x, 4 lanes at a time.
4543 bb1: ; preds = %bb1, %bb
4544 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4545 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4546 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
4547 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
4548 %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
4549 %tmp5 = icmp eq <4 x i32> %tmp4b, zeroinitializer
4550 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
4551 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
4552 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
4553 %tmp8 = add i64 %tmp, 4
; Exit after 1024 elements.
4554 %tmp9 = icmp eq i64 %tmp8, 1024
4555 br i1 %tmp9, label %bb10, label %bb1
4557 bb10: ; preds = %bb1
; bcast_unfold_ptestm_v4i64: 64-bit-lane (ymm) variant of the test-mask
; pattern: (x & splat(2)) != 0 becomes vptestmq against an unfolded
; vpbroadcastq register; matching lanes get 3 via masked vpbroadcastq.
; vzeroupper expected on exit since ymm registers were used.
; NOTE(review): entry block / ret lines appear elided in this excerpt.
4561 define void @bcast_unfold_ptestm_v4i64(i64* %arg) {
4562 ; CHECK-LABEL: bcast_unfold_ptestm_v4i64:
4563 ; CHECK: # %bb.0: # %bb
4564 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
4565 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
4566 ; CHECK-NEXT: .p2align 4, 0x90
4567 ; CHECK-NEXT: .LBB129_1: # %bb1
4568 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4569 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
4570 ; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k1
4571 ; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
4572 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
4573 ; CHECK-NEXT: addq $32, %rax
4574 ; CHECK-NEXT: jne .LBB129_1
4575 ; CHECK-NEXT: # %bb.2: # %bb10
4576 ; CHECK-NEXT: vzeroupper
; Loop body: x = ((x & 2) != 0) ? 3 : x, 4 i64 lanes at a time.
4581 bb1: ; preds = %bb1, %bb
4582 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4583 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4584 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
4585 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
4586 %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
4587 %tmp5 = icmp ne <4 x i64> %tmp4b, zeroinitializer
4588 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
4589 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
4590 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
4591 %tmp8 = add i64 %tmp, 4
; Exit after 1024 elements.
4592 %tmp9 = icmp eq i64 %tmp8, 1024
4593 br i1 %tmp9, label %bb10, label %bb1
4595 bb10: ; preds = %bb1
; bcast_unfold_ptestnm_v4i64: complement of the ptestm_v4i64 test — the
; icmp eq (test-for-zero) form selects vptestnmq against the unfolded
; vpbroadcastq register; zero-test lanes get 3 via masked vpbroadcastq.
; NOTE(review): entry block / trailing ret lines are outside/elided from this
; excerpt.
4599 define void @bcast_unfold_ptestnm_v4i64(i64* %arg) {
4600 ; CHECK-LABEL: bcast_unfold_ptestnm_v4i64:
4601 ; CHECK: # %bb.0: # %bb
4602 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
4603 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
4604 ; CHECK-NEXT: .p2align 4, 0x90
4605 ; CHECK-NEXT: .LBB130_1: # %bb1
4606 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4607 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
4608 ; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k1
4609 ; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
4610 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
4611 ; CHECK-NEXT: addq $32, %rax
4612 ; CHECK-NEXT: jne .LBB130_1
4613 ; CHECK-NEXT: # %bb.2: # %bb10
4614 ; CHECK-NEXT: vzeroupper
; Loop body: x = ((x & 2) == 0) ? 3 : x, 4 i64 lanes at a time.
4619 bb1: ; preds = %bb1, %bb
4620 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4621 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4622 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
4623 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
4624 %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
4625 %tmp5 = icmp eq <4 x i64> %tmp4b, zeroinitializer
4626 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
4627 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
4628 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
4629 %tmp8 = add i64 %tmp, 4
; Exit after 1024 elements.
4630 %tmp9 = icmp eq i64 %tmp8, 1024
4631 br i1 %tmp9, label %bb10, label %bb1
4633 bb10: ; preds = %bb1