; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=avx512vl | FileCheck %s

; Test that we can unfold constant pool loads when we're using avx512's
; ability to fold a broadcast load into an operation.
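;
; Every loop below counts a negative index register up to zero, so the vector
; access is addressed as offset(%rdi,%rax); when a broadcasted constant is
; needed, it is expected to be hoisted above the loop.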

define void @bcast_unfold_add_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB0_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB0_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <16 x i32>, ptr %tmp3, align 4
  %tmp6 = add nsw <16 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB1_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB1_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i32>, ptr %tmp3, align 4
  %tmp6 = add nsw <8 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB2_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB2_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i32>, ptr %tmp3, align 4
  %tmp6 = add nsw <4 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB3_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB3_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i64>, ptr %tmp3, align 8
  %tmp6 = add nsw <8 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB4_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB4_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i64>, ptr %tmp3, align 8
  %tmp6 = add nsw <4 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB5_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB5_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <2 x i64>, ptr %tmp3, align 8
  %tmp6 = add nsw <2 x i64> %tmp5, <i64 2, i64 2>
  store <2 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}
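
; The multiply-by-3 loops need no broadcast at all: the multiply is lowered
; to x + 2*x with two vector adds, as the CHECK lines below show.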

define void @bcast_unfold_mul_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB6_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm0, %zmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB6_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <16 x i32>, ptr %tmp3, align 4
  %tmp6 = mul nsw <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <16 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB7_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm0, %ymm1
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmovdqu %ymm0, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB7_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i32>, ptr %tmp3, align 4
  %tmp6 = mul nsw <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <8 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB8_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovdqu %xmm0, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB8_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i32>, ptr %tmp3, align 4
  %tmp6 = mul nsw <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
  store <4 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB9_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm0, %zmm1
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB9_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i64>, ptr %tmp3, align 8
  %tmp6 = mul nsw <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  store <8 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB10_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm0
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm1
; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vmovdqu %ymm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB10_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i64>, ptr %tmp3, align 8
  %tmp6 = mul nsw <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
  store <4 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB11_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %xmm0
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm1
; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vmovdqu %xmm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB11_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <2 x i64>, ptr %tmp3, align 8
  %tmp6 = mul nsw <2 x i64> %tmp5, <i64 3, i64 3>
  store <2 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}
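
; OR with a splat of 3. The 512-bit cases use the EVEX vpord/vporq forms; the
; 128/256-bit cases are emitted as vorps on a vbroadcastss/vbroadcastsd (or
; vmovddup) constant, which presumably wins on encoding size.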

define void @bcast_unfold_or_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB12_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB12_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <16 x i32>, ptr %tmp3, align 4
  %tmp6 = or <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <16 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB13_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB13_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i32>, ptr %tmp3, align 4
  %tmp6 = or <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <8 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB14_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB14_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i32>, ptr %tmp3, align 4
  %tmp6 = or <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
  store <4 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB15_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vporq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB15_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i64>, ptr %tmp3, align 8
  %tmp6 = or <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  store <8 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB16_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB16_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i64>, ptr %tmp3, align 8
  %tmp6 = or <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
  store <4 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [3,3]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB17_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB17_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <2 x i64>, ptr %tmp3, align 8
  %tmp6 = or <2 x i64> %tmp5, <i64 3, i64 3>
  store <2 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}
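
; fneg lowers to an XOR with a broadcast sign-bit mask; the mask's bit
; pattern is printed as -0.0E+0 in the constant comments.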

define void @bcast_unfold_fneg_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB18_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpxord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB18_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fneg <16 x float> %tmp4
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB19_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB19_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fneg <8 x float> %tmp4
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB20_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB20_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fneg <4 x float> %tmp4
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB21_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpxorq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB21_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fneg <8 x double> %tmp4
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB22_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB22_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fneg <4 x double> %tmp4
  store <4 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB23_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB23_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fneg <2 x double> %tmp4
  store <2 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}
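
; fabs lowers to an AND with a broadcast mask that clears the sign bit; that
; bit pattern (exponent and mantissa all ones) is printed as NaN in the
; constant comments.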

define void @bcast_unfold_fabs_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB24_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpandd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB24_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = call <16 x float> @llvm.fabs.v16f32(<16 x float> %tmp4)
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <16 x float> @llvm.fabs.v16f32(<16 x float>) #0

define void @bcast_unfold_fabs_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB25_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB25_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = call <8 x float> @llvm.fabs.v8f32(<8 x float> %tmp4)
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #0

define void @bcast_unfold_fabs_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB26_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB26_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %tmp4)
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #0

define void @bcast_unfold_fabs_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB27_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpandq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB27_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = call <8 x double> @llvm.fabs.v8f64(<8 x double> %tmp4)
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <8 x double> @llvm.fabs.v8f64(<8 x double>) #0

define void @bcast_unfold_fabs_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB28_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB28_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = call <4 x double> @llvm.fabs.v4f64(<4 x double> %tmp4)
  store <4 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #0

define void @bcast_unfold_fabs_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [NaN,NaN]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB29_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB29_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = call <2 x double> @llvm.fabs.v2f64(<2 x double> %tmp4)
  store <2 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #0
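
; fadd with a splatted 2.0: the broadcast is hoisted and the load folds
; straight into vaddps/vaddpd.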

define void @bcast_unfold_fadd_v16f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB30_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB30_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fadd <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v8f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB31_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB31_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fadd <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v4f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB32_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB32_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fadd <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v8f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB33_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddpd 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB33_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fadd <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v4f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB34_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddpd 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB34_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fadd <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v2f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB35_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddpd 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB35_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fadd <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}
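
; fmul by a splatted 3.0 has the same shape as the fadd loops above: hoisted
; broadcast, load folded into vmulps/vmulpd.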

define void @bcast_unfold_fmul_v16f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB36_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB36_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fmul <16 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v8f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB37_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB37_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fmul <8 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v4f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB38_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB38_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fmul <4 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v8f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB39_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulpd 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB39_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fmul <8 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v4f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB40_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulpd 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB40_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fmul <4 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <4 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v2f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB41_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulpd 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB41_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fmul <2 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00>
  store <2 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}
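
; For fdiv the loaded value is the dividend, and division is not commutative,
; so the load cannot be folded into the divide; each iteration keeps a
; separate vmovups/vmovupd load in front of vdivps/vdivpd.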

define void @bcast_unfold_fdiv_v16f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB42_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vdivps %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB42_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fdiv <16 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v8f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB43_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vdivps %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB43_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fdiv <8 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v4f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB44_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vdivps %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB44_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fdiv <4 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v8f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB45_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vdivpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB45_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fdiv <8 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v4f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB46_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vdivpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB46_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fdiv <4 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <4 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v2f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB47_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB47_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fdiv <2 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00>
  store <2 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}
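
; The contract fast-math flag lets the fmul+fadd pairs fuse into
; vfmadd213ps/vfmadd231ps, with the broadcasted 2.0 as the addend or the
; multiplicand.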

define void @bcast_unfold_fma213_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB48_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd213ps {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB48_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp5 = load <4 x float>, ptr %tmp3, align 4
  %tmp6 = fmul contract <4 x float> %tmp5, %tmp5
  %tmp7 = fadd contract <4 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %tmp7, ptr %tmp3, align 4
  %tmp9 = add i64 %tmp, 4
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB49_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB49_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fmul contract <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = fadd contract <4 x float> %tmp4, %tmp5
  store <4 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB50_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd213ps {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB50_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp5 = load <8 x float>, ptr %tmp3, align 4
  %tmp6 = fmul contract <8 x float> %tmp5, %tmp5
  %tmp7 = fadd contract <8 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %tmp7, ptr %tmp3, align 4
  %tmp9 = add i64 %tmp, 8
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  .LBB51_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB51_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fmul contract <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = fadd contract <8 x float> %tmp4, %tmp5
  store <8 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
1697 define void @bcast_unfold_fma213_v16f32(ptr %arg) {
1698 ; CHECK-LABEL: bcast_unfold_fma213_v16f32:
1699 ; CHECK: # %bb.0: # %bb
1700 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1701 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1702 ; CHECK-NEXT: .p2align 4
1703 ; CHECK-NEXT: .LBB52_1: # %bb2
1704 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1705 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
1706 ; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
1707 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1708 ; CHECK-NEXT: addq $64, %rax
1709 ; CHECK-NEXT: jne .LBB52_1
1710 ; CHECK-NEXT: # %bb.2: # %bb11
1711 ; CHECK-NEXT: vzeroupper
1716 bb2: ; preds = %bb2, %bb
1717 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
1718 %tmp3 = getelementptr inbounds float, ptr %arg, i64 %tmp
1719 %tmp5 = load <16 x float>, ptr %tmp3, align 4
1720 %tmp6 = fmul contract <16 x float> %tmp5, %tmp5
1721 %tmp7 = fadd contract <16 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1722 store <16 x float> %tmp7, ptr %tmp3, align 4
1723 %tmp9 = add i64 %tmp, 16
1724 %tmp10 = icmp eq i64 %tmp9, 1024
1725 br i1 %tmp10, label %bb11, label %bb2
1727 bb11: ; preds = %bb2
1731 define void @bcast_unfold_fma231_v16f32(ptr %arg) {
1732 ; CHECK-LABEL: bcast_unfold_fma231_v16f32:
1733 ; CHECK: # %bb.0: # %bb
1734 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1735 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1736 ; CHECK-NEXT: .p2align 4
1737 ; CHECK-NEXT: .LBB53_1: # %bb1
1738 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1739 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
1740 ; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
1741 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1742 ; CHECK-NEXT: addq $64, %rax
1743 ; CHECK-NEXT: jne .LBB53_1
1744 ; CHECK-NEXT: # %bb.2: # %bb10
1745 ; CHECK-NEXT: vzeroupper
1750 bb1: ; preds = %bb1, %bb
1751 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
1752 %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
1753 %tmp4 = load <16 x float>, ptr %tmp2, align 4
1754 %tmp5 = fmul contract <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1755 %tmp6 = fadd contract <16 x float> %tmp4, %tmp5
1756 store <16 x float> %tmp6, ptr %tmp2, align 4
1757 %tmp8 = add i64 %tmp, 16
1758 %tmp9 = icmp eq i64 %tmp8, 1024
1759 br i1 %tmp9, label %bb10, label %bb1
1761 bb10: ; preds = %bb1
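
; There is no 128-bit vbroadcastsd, so for the v2f64 cases below the splat
; 2.0 is expected to come from a vmovddup of a 64-bit constant-pool load
; (note the "xmm0 = mem[0,0]" annotation in the CHECK lines).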
define void @bcast_unfold_fma213_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB54_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT: vfmadd213pd {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB54_1
; CHECK-NEXT: # %bb.2: # %bb11
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2: ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp5 = load <2 x double>, ptr %tmp3, align 8
  %tmp6 = fmul contract <2 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <2 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %tmp7, ptr %tmp3, align 8
  %tmp9 = add i64 %tmp, 2
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11: ; preds = %bb2
  ret void
}
define void @bcast_unfold_fma231_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB55_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT: vfmadd231pd {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB55_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fmul contract <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <2 x double> %tmp4, %tmp5
  store <2 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB56_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB56_1
; CHECK-NEXT: # %bb.2: # %bb11
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2: ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp5 = load <4 x double>, ptr %tmp3, align 8
  %tmp6 = fmul contract <4 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <4 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %tmp7, ptr %tmp3, align 8
  %tmp9 = add i64 %tmp, 4
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11: ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB57_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB57_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fmul contract <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <4 x double> %tmp4, %tmp5
  store <4 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB58_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB58_1
; CHECK-NEXT: # %bb.2: # %bb11
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2: ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp5 = load <8 x double>, ptr %tmp3, align 8
  %tmp6 = fmul contract <8 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <8 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %tmp7, ptr %tmp3, align 8
  %tmp9 = add i64 %tmp, 8
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11: ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB59_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB59_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fmul contract <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <8 x double> %tmp4, %tmp5
  store <8 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}
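
; Floating-point max tests: the fcmp ogt + select idiom should lower to
; vmaxps/vmaxpd against the splat register that was broadcast once in the
; preheader.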
define void @bcast_unfold_fmax_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB60_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vmaxps %xmm0, %xmm1, %xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB60_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp ogt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB61_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT: vmaxps %ymm0, %ymm1, %ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB61_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp ogt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB62_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT: vmaxps %zmm0, %zmm1, %zmm1
; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB62_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp ogt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB63_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT: vmaxpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB63_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp ogt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB64_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vmaxpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB64_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp ogt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB65_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT: vmaxpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB65_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp ogt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}
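
; Floating-point min tests: same pattern as the fmax block above, but with
; fcmp olt + select lowering to vminps/vminpd.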
define void @bcast_unfold_fmin_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB66_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vminps %xmm0, %xmm1, %xmm1
; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB66_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB67_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT: vminps %ymm0, %ymm1, %ymm1
; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB67_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB68_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT: vminps %zmm0, %zmm1, %zmm1
; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB68_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB69_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT: vminpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB69_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB70_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB70_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB71_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT: vminpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB71_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}
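
; Signed-min tests: icmp slt + select should become vpminsd/vpminsq, with the
; loop load folded into the min instruction itself rather than a broadcast.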
define void @bcast_unfold_smin_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB72_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB72_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB73_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB73_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB74_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB74_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}
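
; The i64 variants use vpminsq, which has no SSE/AVX encoding; AVX512VL
; provides the 128-bit and 256-bit forms used below.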
define void @bcast_unfold_smin_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB75_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB75_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp slt <2 x i64> %tmp4, <i64 2, i64 2>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB76_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB76_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp slt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB77_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB77_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp slt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}
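
; Signed-max tests: icmp sgt + select should map to vpmaxsd/vpmaxsq in the
; same fashion.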
define void @bcast_unfold_smax_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB78_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB78_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_smax_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB79_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB79_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_smax_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB80_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB80_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_smax_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB81_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB81_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp sgt <2 x i64> %tmp4, <i64 2, i64 2>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_smax_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB82_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB82_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp sgt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_smax_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB83_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB83_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp sgt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}
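
; Unsigned-min tests: icmp ult + select should map to vpminud/vpminuq.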
define void @bcast_unfold_umin_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB84_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminud 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB84_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_umin_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB85_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminud 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB85_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_umin_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB86_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminud 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB86_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_umin_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB87_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB87_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_umin_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB88_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB88_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_umin_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB89_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB89_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}
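
; Unsigned-max tests: icmp ugt + select should map to vpmaxud/vpmaxuq.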
define void @bcast_unfold_umax_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB90_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB90_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ugt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_umax_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB91_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB91_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ugt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_umax_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB92_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB92_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ugt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_umax_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB93_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB93_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ugt <2 x i64> %tmp4, <i64 2, i64 2>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_umax_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB94_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB94_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ugt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_umax_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB95_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB95_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ugt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}
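
; Compare-and-blend tests: the select's true operand is a second splat
; constant, so the expected lowering is vpcmpgtd into a %k mask followed by a
; merge-masked vpbroadcastd of the constant 3 over the loaded vector.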
define void @bcast_unfold_pcmpgt_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB96_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB96_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpgt_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB97_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1
; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB97_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpgt_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB98_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1
; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB98_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpgt_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB99_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1
; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3]
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB99_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <2 x i64> %tmp4, <i64 1, i64 1>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
  store <2 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpgt_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB100_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB100_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpgt_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB101_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1
; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB101_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
  store <8 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

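; Same select-of-constant pattern with equality compares (vpcmpeq[dq]).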
define void @bcast_unfold_pcmpeq_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB102_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB102_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp eq <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpeq_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB103_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1
; CHECK-NEXT: vpcmpeqd %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB103_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp eq <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpeq_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB104_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1
; CHECK-NEXT: vpcmpeqd %zmm0, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB104_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp eq <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpeq_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB105_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1
; CHECK-NEXT: vpcmpeqq %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3]
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB105_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp eq <2 x i64> %tmp4, <i64 1, i64 1>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
  store <2 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpeq_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB106_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vpcmpeqq %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB106_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp eq <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpeq_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB107_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1
; CHECK-NEXT: vpcmpeqq %zmm0, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB107_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp eq <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
  store <8 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

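; Signed less-than compares, using the AVX-512 immediate compare form
; (vpcmplt[dq]). Note the IR here branches to %bb10 when the trip-count
; compare is true, so llc emits a cmpq $1023/jg loop instead of the usual
; jne against a biased index.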
define void @bcast_unfold_pcmp_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB108_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1
; CHECK-NEXT: vpcmpltd %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4)
; CHECK-NEXT: addq $4, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: jg .LBB108_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmp_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB109_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1
; CHECK-NEXT: vpcmpltd %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4)
; CHECK-NEXT: addq $8, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: jg .LBB109_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmp_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB110_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1
; CHECK-NEXT: vpcmpltd %zmm0, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: jg .LBB110_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmp_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB111_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1
; CHECK-NEXT: vpcmpltq %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3]
; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8)
; CHECK-NEXT: addq $2, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: jg .LBB111_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp slt <2 x i64> %tmp4, <i64 1, i64 1>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
  store <2 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmp_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB112_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1
; CHECK-NEXT: vpcmpltq %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8)
; CHECK-NEXT: addq $4, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: jg .LBB112_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp slt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmp_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB113_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1
; CHECK-NEXT: vpcmpltq %zmm0, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8)
; CHECK-NEXT: addq $8, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: jg .LBB113_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp slt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
  store <8 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

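; Unsigned less-than compares (vpcmpltu[dq]); the trip-count compare is also
; unsigned, so the backedge uses ja rather than jg.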
define void @bcast_unfold_pcmpu_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB114_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1
; CHECK-NEXT: vpcmpltud %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4)
; CHECK-NEXT: addq $4, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: ja .LBB114_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpu_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB115_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1
; CHECK-NEXT: vpcmpltud %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4)
; CHECK-NEXT: addq $8, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: ja .LBB115_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpu_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB116_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1
; CHECK-NEXT: vpcmpltud %zmm0, %zmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: ja .LBB116_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpu_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB117_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1
; CHECK-NEXT: vpcmpltuq %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3]
; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8)
; CHECK-NEXT: addq $2, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: ja .LBB117_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
  store <2 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpu_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB118_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1
; CHECK-NEXT: vpcmpltuq %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8)
; CHECK-NEXT: addq $4, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: ja .LBB118_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpu_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB119_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1
; CHECK-NEXT: vpcmpltuq %zmm0, %zmm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8)
; CHECK-NEXT: addq $8, %rax
; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT: ja .LBB119_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
  store <8 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

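; Floating-point compares: vcmpltps/vcmpltpd against one broadcast constant,
; with the select lowered to vblendmps/vblendmpd from a second broadcast
; register (the loaded value is kept when it is below 2.0, else 3.0 is used).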
define void @bcast_unfold_cmp_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v4f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB120_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm2
; CHECK-NEXT: vcmpltps %xmm0, %xmm2, %k1
; CHECK-NEXT: vblendmps %xmm2, %xmm1, %xmm2 {%k1}
; CHECK-NEXT: vmovups %xmm2, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB120_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <4 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v8f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB121_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm2
; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %k1
; CHECK-NEXT: vblendmps %ymm2, %ymm1, %ymm2 {%k1}
; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB121_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <8 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v16f32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: vbroadcastss {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB122_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm2
; CHECK-NEXT: vcmpltps %zmm0, %zmm2, %k1
; CHECK-NEXT: vblendmps %zmm2, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vmovups %zmm2, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB122_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <16 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v2f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT: # xmm0 = mem[0,0]
; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [3.0E+0,3.0E+0]
; CHECK-NEXT: # xmm1 = mem[0,0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB123_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm2
; CHECK-NEXT: vcmpltpd %xmm0, %xmm2, %k1
; CHECK-NEXT: vblendmpd %xmm2, %xmm1, %xmm2 {%k1}
; CHECK-NEXT: vmovupd %xmm2, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB123_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 3.000000e+00, double 3.000000e+00>
  store <2 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v4f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB124_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm2
; CHECK-NEXT: vcmpltpd %ymm0, %ymm2, %k1
; CHECK-NEXT: vblendmpd %ymm2, %ymm1, %ymm2 {%k1}
; CHECK-NEXT: vmovupd %ymm2, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB124_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <4 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v8f64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB125_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm2
; CHECK-NEXT: vcmpltpd %zmm0, %zmm2, %k1
; CHECK-NEXT: vblendmpd %zmm2, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vmovupd %zmm2, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB125_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <8 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

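; Here both select operands are constants (4.0 and 3.0), so after unfolding
; the compare operand, one remaining broadcast can be refolded into vblendmps
; as a {1to8} memory operand.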
define void @bcast_unfold_cmp_v8f32_refold(ptr nocapture %0) {
; CHECK-LABEL: bcast_unfold_cmp_v8f32_refold:
; CHECK: # %bb.0:
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB126_1: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vcmpgtps 4096(%rdi,%rax), %ymm0, %k1
; CHECK-NEXT: vblendmps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 {%k1}
; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB126_1
; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  br label %2

2: ; preds = %2, %1
  %3 = phi i64 [ 0, %1 ], [ %8, %2 ]
  %4 = getelementptr inbounds float, ptr %0, i64 %3
  %5 = load <8 x float>, ptr %4, align 4
  %6 = fcmp olt <8 x float> %5, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %7 = select <8 x i1> %6, <8 x float> <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <8 x float> %7, ptr %4, align 4
  %8 = add i64 %3, 8
  %9 = icmp eq i64 %8, 1024
  br i1 %9, label %10, label %2

10: ; preds = %2
  ret void
}

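; and + icmp-against-zero patterns should select vptestm[dq] for icmp ne and
; vptestnm[dq] for icmp eq, with the broadcast mask constant kept in a register.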
define void @bcast_unfold_ptestm_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_ptestm_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB127_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB127_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp5 = icmp ne <4 x i32> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_ptestnm_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_ptestnm_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB128_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB128_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp5 = icmp eq <4 x i32> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_ptestm_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_ptestm_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB129_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB129_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp5 = icmp ne <4 x i64> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

define void @bcast_unfold_ptestnm_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_ptestnm_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB130_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB130_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb1

bb1: ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp5 = icmp eq <4 x i64> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10: ; preds = %bb1
  ret void
}

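; The bitwise select (x & m) | (y & ~m) should fold to a single
; vpternlogd $216, with the mask splat (32767) kept broadcast in a register.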
define void @bcast_unfold_vpternlog_v16i32(ptr %arg, ptr %arg1) {
; CHECK-LABEL: bcast_unfold_vpternlog_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB131_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1
; CHECK-NEXT: vmovdqu64 4096(%rsi,%rax), %zmm2
; CHECK-NEXT: vpmulld %zmm2, %zmm1, %zmm3
; CHECK-NEXT: vpternlogd $216, %zmm0, %zmm1, %zmm2
; CHECK-NEXT: vpmulld %zmm3, %zmm2, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB131_1
; CHECK-NEXT: # %bb.2: # %bb20
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
  br label %bb2

bb2: ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp18, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <16 x i32>, ptr %tmp3, align 4
  %tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp
  %tmp11 = load <16 x i32>, ptr %tmp6, align 4
  %tmp12 = and <16 x i32> %tmp5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %tmp13 = and <16 x i32> %tmp11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %tmp14 = or <16 x i32> %tmp12, %tmp13
  %tmp15 = mul <16 x i32> %tmp14, %tmp5
  %tmp16 = mul <16 x i32> %tmp15, %tmp11
  store <16 x i32> %tmp16, ptr %tmp3, align 4
  %tmp18 = add i64 %tmp, 16
  %tmp19 = icmp eq i64 %tmp18, 1024
  br i1 %tmp19, label %bb20, label %bb2

bb20: ; preds = %bb2
  ret void
}

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }