1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=avx512vl | FileCheck %s
4 ; Test that we can unfold constant pool loads when we're using avx512's
5 ; ability to fold a broadcast load into an operation.
; Loop adds splat(2) to <16 x i32> data; the check lines below assert the
; splat is hoisted out of the loop as vpbroadcastd zmm0 and the memory
; operand is folded into vpaddd. (Entry/exit blocks elided in this excerpt.)
7 define void @bcast_unfold_add_v16i32(i32* %arg) {
8 ; CHECK-LABEL: bcast_unfold_add_v16i32:
9 ; CHECK: # %bb.0: # %bb
10 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
11 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
12 ; CHECK-NEXT: .p2align 4, 0x90
13 ; CHECK-NEXT: .LBB0_1: # %bb2
14 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
15 ; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %zmm0, %zmm1
16 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
17 ; CHECK-NEXT: addq $64, %rax
18 ; CHECK-NEXT: jne .LBB0_1
19 ; CHECK-NEXT: # %bb.2: # %bb10
20 ; CHECK-NEXT: vzeroupper
25 bb2: ; preds = %bb2, %bb
26 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
27 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
28 %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
29 %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
30 %tmp6 = add nsw <16 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
31 %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
32 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
33 %tmp8 = add i64 %tmp, 16
34 %tmp9 = icmp eq i64 %tmp8, 1024
35 br i1 %tmp9, label %bb10, label %bb2
; Same add-splat(2) pattern at 256 bits: expects vpbroadcastd ymm0 hoisted
; and the load folded into vpaddd. (Entry/exit blocks elided in this excerpt.)
41 define void @bcast_unfold_add_v8i32(i32* %arg) {
42 ; CHECK-LABEL: bcast_unfold_add_v8i32:
43 ; CHECK: # %bb.0: # %bb
44 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
45 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
46 ; CHECK-NEXT: .p2align 4, 0x90
47 ; CHECK-NEXT: .LBB1_1: # %bb2
48 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
49 ; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %ymm0, %ymm1
50 ; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
51 ; CHECK-NEXT: addq $32, %rax
52 ; CHECK-NEXT: jne .LBB1_1
53 ; CHECK-NEXT: # %bb.2: # %bb10
54 ; CHECK-NEXT: vzeroupper
59 bb2: ; preds = %bb2, %bb
60 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
61 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
62 %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
63 %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
64 %tmp6 = add nsw <8 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
65 %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
66 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
67 %tmp8 = add i64 %tmp, 8
68 %tmp9 = icmp eq i64 %tmp8, 1024
69 br i1 %tmp9, label %bb10, label %bb2
; Same add-splat(2) pattern at 128 bits: vpbroadcastd xmm0 hoisted, load
; folded into vpaddd; no vzeroupper expected (xmm-only loop).
75 define void @bcast_unfold_add_v4i32(i32* %arg) {
76 ; CHECK-LABEL: bcast_unfold_add_v4i32:
77 ; CHECK: # %bb.0: # %bb
78 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
79 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
80 ; CHECK-NEXT: .p2align 4, 0x90
81 ; CHECK-NEXT: .LBB2_1: # %bb2
82 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
83 ; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %xmm0, %xmm1
84 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
85 ; CHECK-NEXT: addq $16, %rax
86 ; CHECK-NEXT: jne .LBB2_1
87 ; CHECK-NEXT: # %bb.2: # %bb10
92 bb2: ; preds = %bb2, %bb
93 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
94 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
95 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
96 %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
97 %tmp6 = add nsw <4 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2>
98 %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
99 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
100 %tmp8 = add i64 %tmp, 4
101 %tmp9 = icmp eq i64 %tmp8, 1024
102 br i1 %tmp9, label %bb10, label %bb2
; i64 variant of the add-splat(2) loop: vpbroadcastq zmm0 hoisted, load
; folded into vpaddq. (Entry/exit blocks elided in this excerpt.)
108 define void @bcast_unfold_add_v8i64(i64* %arg) {
109 ; CHECK-LABEL: bcast_unfold_add_v8i64:
110 ; CHECK: # %bb.0: # %bb
111 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
112 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
113 ; CHECK-NEXT: .p2align 4, 0x90
114 ; CHECK-NEXT: .LBB3_1: # %bb2
115 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
116 ; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %zmm0, %zmm1
117 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
118 ; CHECK-NEXT: addq $64, %rax
119 ; CHECK-NEXT: jne .LBB3_1
120 ; CHECK-NEXT: # %bb.2: # %bb10
121 ; CHECK-NEXT: vzeroupper
126 bb2: ; preds = %bb2, %bb
127 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
128 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
129 %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
130 %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
131 %tmp6 = add nsw <8 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
132 %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
133 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
134 %tmp8 = add i64 %tmp, 8
135 %tmp9 = icmp eq i64 %tmp8, 1024
136 br i1 %tmp9, label %bb10, label %bb2
; 256-bit i64 add-splat(2): vpbroadcastq ymm0 hoisted, load folded into
; vpaddq. (Entry/exit blocks elided in this excerpt.)
142 define void @bcast_unfold_add_v4i64(i64* %arg) {
143 ; CHECK-LABEL: bcast_unfold_add_v4i64:
144 ; CHECK: # %bb.0: # %bb
145 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
146 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
147 ; CHECK-NEXT: .p2align 4, 0x90
148 ; CHECK-NEXT: .LBB4_1: # %bb2
149 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
150 ; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %ymm0, %ymm1
151 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
152 ; CHECK-NEXT: addq $32, %rax
153 ; CHECK-NEXT: jne .LBB4_1
154 ; CHECK-NEXT: # %bb.2: # %bb10
155 ; CHECK-NEXT: vzeroupper
160 bb2: ; preds = %bb2, %bb
161 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
162 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
163 %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
164 %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
165 %tmp6 = add nsw <4 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2>
166 %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
167 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
168 %tmp8 = add i64 %tmp, 4
169 %tmp9 = icmp eq i64 %tmp8, 1024
170 br i1 %tmp9, label %bb10, label %bb2
; 128-bit i64 add-splat(2): vpbroadcastq xmm0 hoisted, load folded into
; vpaddq; no vzeroupper expected (xmm-only loop).
176 define void @bcast_unfold_add_v2i64(i64* %arg) {
177 ; CHECK-LABEL: bcast_unfold_add_v2i64:
178 ; CHECK: # %bb.0: # %bb
179 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
180 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
181 ; CHECK-NEXT: .p2align 4, 0x90
182 ; CHECK-NEXT: .LBB5_1: # %bb2
183 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
184 ; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %xmm0, %xmm1
185 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
186 ; CHECK-NEXT: addq $16, %rax
187 ; CHECK-NEXT: jne .LBB5_1
188 ; CHECK-NEXT: # %bb.2: # %bb10
193 bb2: ; preds = %bb2, %bb
194 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
195 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
196 %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
197 %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
198 %tmp6 = add nsw <2 x i64> %tmp5, <i64 2, i64 2>
199 %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
200 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
201 %tmp8 = add i64 %tmp, 2
202 %tmp9 = icmp eq i64 %tmp8, 1024
203 br i1 %tmp9, label %bb10, label %bb2
; mul by splat(3): per the check lines, no broadcast is materialized —
; the multiply is expanded as x + 2x using two vpaddd instructions.
209 define void @bcast_unfold_mul_v16i32(i32* %arg) {
210 ; CHECK-LABEL: bcast_unfold_mul_v16i32:
211 ; CHECK: # %bb.0: # %bb
212 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
213 ; CHECK-NEXT: .p2align 4, 0x90
214 ; CHECK-NEXT: .LBB6_1: # %bb2
215 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
216 ; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
217 ; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm1
218 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
219 ; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
220 ; CHECK-NEXT: addq $64, %rax
221 ; CHECK-NEXT: jne .LBB6_1
222 ; CHECK-NEXT: # %bb.2: # %bb10
223 ; CHECK-NEXT: vzeroupper
228 bb2: ; preds = %bb2, %bb
229 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
230 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
231 %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
232 %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
233 %tmp6 = mul nsw <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
234 %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
235 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
236 %tmp8 = add i64 %tmp, 16
237 %tmp9 = icmp eq i64 %tmp8, 1024
238 br i1 %tmp9, label %bb10, label %bb2
; 256-bit mul-by-3: expanded as x + 2x with two vpaddd, no broadcast.
244 define void @bcast_unfold_mul_v8i32(i32* %arg) {
245 ; CHECK-LABEL: bcast_unfold_mul_v8i32:
246 ; CHECK: # %bb.0: # %bb
247 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
248 ; CHECK-NEXT: .p2align 4, 0x90
249 ; CHECK-NEXT: .LBB7_1: # %bb2
250 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
251 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
252 ; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm1
253 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
254 ; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
255 ; CHECK-NEXT: addq $32, %rax
256 ; CHECK-NEXT: jne .LBB7_1
257 ; CHECK-NEXT: # %bb.2: # %bb10
258 ; CHECK-NEXT: vzeroupper
263 bb2: ; preds = %bb2, %bb
264 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
265 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
266 %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
267 %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
268 %tmp6 = mul nsw <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
269 %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
270 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
271 %tmp8 = add i64 %tmp, 8
272 %tmp9 = icmp eq i64 %tmp8, 1024
273 br i1 %tmp9, label %bb10, label %bb2
; 128-bit mul-by-3: expanded as x + 2x with two vpaddd, no broadcast and
; no vzeroupper (xmm-only loop).
279 define void @bcast_unfold_mul_v4i32(i32* %arg) {
280 ; CHECK-LABEL: bcast_unfold_mul_v4i32:
281 ; CHECK: # %bb.0: # %bb
282 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
283 ; CHECK-NEXT: .p2align 4, 0x90
284 ; CHECK-NEXT: .LBB8_1: # %bb2
285 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
286 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
287 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm1
288 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
289 ; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
290 ; CHECK-NEXT: addq $16, %rax
291 ; CHECK-NEXT: jne .LBB8_1
292 ; CHECK-NEXT: # %bb.2: # %bb10
297 bb2: ; preds = %bb2, %bb
298 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
299 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
300 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
301 %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
302 %tmp6 = mul nsw <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
303 %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
304 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
305 %tmp8 = add i64 %tmp, 4
306 %tmp9 = icmp eq i64 %tmp8, 1024
307 br i1 %tmp9, label %bb10, label %bb2
; i64 mul-by-3 at 512 bits: expanded as x + 2x with two vpaddq, no broadcast.
313 define void @bcast_unfold_mul_v8i64(i64* %arg) {
314 ; CHECK-LABEL: bcast_unfold_mul_v8i64:
315 ; CHECK: # %bb.0: # %bb
316 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
317 ; CHECK-NEXT: .p2align 4, 0x90
318 ; CHECK-NEXT: .LBB9_1: # %bb2
319 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
320 ; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0
321 ; CHECK-NEXT: vpaddq %zmm0, %zmm0, %zmm1
322 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
323 ; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax)
324 ; CHECK-NEXT: addq $64, %rax
325 ; CHECK-NEXT: jne .LBB9_1
326 ; CHECK-NEXT: # %bb.2: # %bb10
327 ; CHECK-NEXT: vzeroupper
332 bb2: ; preds = %bb2, %bb
333 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
334 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
335 %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
336 %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
337 %tmp6 = mul nsw <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
338 %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
339 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
340 %tmp8 = add i64 %tmp, 8
341 %tmp9 = icmp eq i64 %tmp8, 1024
342 br i1 %tmp9, label %bb10, label %bb2
; i64 mul-by-3 at 256 bits: expanded as x + 2x with two vpaddq, no broadcast.
348 define void @bcast_unfold_mul_v4i64(i64* %arg) {
349 ; CHECK-LABEL: bcast_unfold_mul_v4i64:
350 ; CHECK: # %bb.0: # %bb
351 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
352 ; CHECK-NEXT: .p2align 4, 0x90
353 ; CHECK-NEXT: .LBB10_1: # %bb2
354 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
355 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0
356 ; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm1
357 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
358 ; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax)
359 ; CHECK-NEXT: addq $32, %rax
360 ; CHECK-NEXT: jne .LBB10_1
361 ; CHECK-NEXT: # %bb.2: # %bb10
362 ; CHECK-NEXT: vzeroupper
367 bb2: ; preds = %bb2, %bb
368 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
369 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
370 %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
371 %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
372 %tmp6 = mul nsw <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
373 %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
374 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
375 %tmp8 = add i64 %tmp, 4
376 %tmp9 = icmp eq i64 %tmp8, 1024
377 br i1 %tmp9, label %bb10, label %bb2
; i64 mul-by-3 at 128 bits: expanded as x + 2x with two vpaddq; no broadcast,
; no vzeroupper (xmm-only loop).
383 define void @bcast_unfold_mul_v2i64(i64* %arg) {
384 ; CHECK-LABEL: bcast_unfold_mul_v2i64:
385 ; CHECK: # %bb.0: # %bb
386 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
387 ; CHECK-NEXT: .p2align 4, 0x90
388 ; CHECK-NEXT: .LBB11_1: # %bb2
389 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
390 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm0
391 ; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm1
392 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
393 ; CHECK-NEXT: vmovdqu %xmm0, 8192(%rdi,%rax)
394 ; CHECK-NEXT: addq $16, %rax
395 ; CHECK-NEXT: jne .LBB11_1
396 ; CHECK-NEXT: # %bb.2: # %bb10
401 bb2: ; preds = %bb2, %bb
402 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
403 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
404 %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
405 %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
406 %tmp6 = mul nsw <2 x i64> %tmp5, <i64 3, i64 3>
407 %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
408 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
409 %tmp8 = add i64 %tmp, 2
410 %tmp9 = icmp eq i64 %tmp8, 1024
411 br i1 %tmp9, label %bb10, label %bb2
; or with splat(3) at 512 bits: vpbroadcastd zmm0 hoisted, load folded
; into vpord. (Entry/exit blocks elided in this excerpt.)
417 define void @bcast_unfold_or_v16i32(i32* %arg) {
418 ; CHECK-LABEL: bcast_unfold_or_v16i32:
419 ; CHECK: # %bb.0: # %bb
420 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
421 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
422 ; CHECK-NEXT: .p2align 4, 0x90
423 ; CHECK-NEXT: .LBB12_1: # %bb2
424 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
425 ; CHECK-NEXT: vpord 4096(%rdi,%rax), %zmm0, %zmm1
426 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
427 ; CHECK-NEXT: addq $64, %rax
428 ; CHECK-NEXT: jne .LBB12_1
429 ; CHECK-NEXT: # %bb.2: # %bb10
430 ; CHECK-NEXT: vzeroupper
435 bb2: ; preds = %bb2, %bb
436 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
437 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
438 %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
439 %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
440 %tmp6 = or <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
441 %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
442 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
443 %tmp8 = add i64 %tmp, 16
444 %tmp9 = icmp eq i64 %tmp8, 1024
445 br i1 %tmp9, label %bb10, label %bb2
; 256-bit or-splat(3): per the check lines this selects the FP-domain forms
; (vbroadcastss + vorps / vmovups) rather than the integer ones.
451 define void @bcast_unfold_or_v8i32(i32* %arg) {
452 ; CHECK-LABEL: bcast_unfold_or_v8i32:
453 ; CHECK: # %bb.0: # %bb
454 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
455 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
456 ; CHECK-NEXT: .p2align 4, 0x90
457 ; CHECK-NEXT: .LBB13_1: # %bb2
458 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
459 ; CHECK-NEXT: vorps 4096(%rdi,%rax), %ymm0, %ymm1
460 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
461 ; CHECK-NEXT: addq $32, %rax
462 ; CHECK-NEXT: jne .LBB13_1
463 ; CHECK-NEXT: # %bb.2: # %bb10
464 ; CHECK-NEXT: vzeroupper
469 bb2: ; preds = %bb2, %bb
470 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
471 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
472 %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
473 %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
474 %tmp6 = or <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
475 %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
476 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
477 %tmp8 = add i64 %tmp, 8
478 %tmp9 = icmp eq i64 %tmp8, 1024
479 br i1 %tmp9, label %bb10, label %bb2
; 128-bit or-splat(3): FP-domain vbroadcastss + vorps; no vzeroupper
; expected (xmm-only loop).
485 define void @bcast_unfold_or_v4i32(i32* %arg) {
486 ; CHECK-LABEL: bcast_unfold_or_v4i32:
487 ; CHECK: # %bb.0: # %bb
488 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
489 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
490 ; CHECK-NEXT: .p2align 4, 0x90
491 ; CHECK-NEXT: .LBB14_1: # %bb2
492 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
493 ; CHECK-NEXT: vorps 4096(%rdi,%rax), %xmm0, %xmm1
494 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
495 ; CHECK-NEXT: addq $16, %rax
496 ; CHECK-NEXT: jne .LBB14_1
497 ; CHECK-NEXT: # %bb.2: # %bb10
502 bb2: ; preds = %bb2, %bb
503 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
504 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
505 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
506 %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
507 %tmp6 = or <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
508 %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
509 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
510 %tmp8 = add i64 %tmp, 4
511 %tmp9 = icmp eq i64 %tmp8, 1024
512 br i1 %tmp9, label %bb10, label %bb2
; i64 or-splat(3) at 512 bits: vpbroadcastq zmm0 hoisted, load folded
; into vporq (integer domain for zmm).
518 define void @bcast_unfold_or_v8i64(i64* %arg) {
519 ; CHECK-LABEL: bcast_unfold_or_v8i64:
520 ; CHECK: # %bb.0: # %bb
521 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
522 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3]
523 ; CHECK-NEXT: .p2align 4, 0x90
524 ; CHECK-NEXT: .LBB15_1: # %bb2
525 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
526 ; CHECK-NEXT: vporq 8192(%rdi,%rax), %zmm0, %zmm1
527 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
528 ; CHECK-NEXT: addq $64, %rax
529 ; CHECK-NEXT: jne .LBB15_1
530 ; CHECK-NEXT: # %bb.2: # %bb10
531 ; CHECK-NEXT: vzeroupper
536 bb2: ; preds = %bb2, %bb
537 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
538 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
539 %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
540 %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
541 %tmp6 = or <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
542 %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
543 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
544 %tmp8 = add i64 %tmp, 8
545 %tmp9 = icmp eq i64 %tmp8, 1024
546 br i1 %tmp9, label %bb10, label %bb2
; i64 or-splat(3) at 256 bits: FP-domain vbroadcastsd + vorps per the
; check lines.
552 define void @bcast_unfold_or_v4i64(i64* %arg) {
553 ; CHECK-LABEL: bcast_unfold_or_v4i64:
554 ; CHECK: # %bb.0: # %bb
555 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
556 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3]
557 ; CHECK-NEXT: .p2align 4, 0x90
558 ; CHECK-NEXT: .LBB16_1: # %bb2
559 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
560 ; CHECK-NEXT: vorps 8192(%rdi,%rax), %ymm0, %ymm1
561 ; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
562 ; CHECK-NEXT: addq $32, %rax
563 ; CHECK-NEXT: jne .LBB16_1
564 ; CHECK-NEXT: # %bb.2: # %bb10
565 ; CHECK-NEXT: vzeroupper
570 bb2: ; preds = %bb2, %bb
571 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
572 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
573 %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
574 %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
575 %tmp6 = or <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
576 %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
577 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
578 %tmp8 = add i64 %tmp, 4
579 %tmp9 = icmp eq i64 %tmp8, 1024
580 br i1 %tmp9, label %bb10, label %bb2
; i64 or-splat(3) at 128 bits: the splat is materialized with vmovddup
; (duplicating one 64-bit load) and combined via vorps; xmm-only loop.
586 define void @bcast_unfold_or_v2i64(i64* %arg) {
587 ; CHECK-LABEL: bcast_unfold_or_v2i64:
588 ; CHECK: # %bb.0: # %bb
589 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
590 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [3,3]
591 ; CHECK-NEXT: # xmm0 = mem[0,0]
592 ; CHECK-NEXT: .p2align 4, 0x90
593 ; CHECK-NEXT: .LBB17_1: # %bb2
594 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
595 ; CHECK-NEXT: vorps 8192(%rdi,%rax), %xmm0, %xmm1
596 ; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
597 ; CHECK-NEXT: addq $16, %rax
598 ; CHECK-NEXT: jne .LBB17_1
599 ; CHECK-NEXT: # %bb.2: # %bb10
604 bb2: ; preds = %bb2, %bb
605 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
606 %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
607 %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
608 %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
609 %tmp6 = or <2 x i64> %tmp5, <i64 3, i64 3>
610 %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
611 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
612 %tmp8 = add i64 %tmp, 2
613 %tmp9 = icmp eq i64 %tmp8, 1024
614 br i1 %tmp9, label %bb10, label %bb2
; fneg of <16 x float>: lowered as xor with a hoisted splat of -0.0
; (sign-bit mask) — vpbroadcastd + vpxord per the check lines.
620 define void @bcast_unfold_fneg_v16f32(float* %arg) {
621 ; CHECK-LABEL: bcast_unfold_fneg_v16f32:
622 ; CHECK: # %bb.0: # %bb
623 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
624 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
625 ; CHECK-NEXT: .p2align 4, 0x90
626 ; CHECK-NEXT: .LBB18_1: # %bb1
627 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
628 ; CHECK-NEXT: vpxord 4096(%rdi,%rax), %zmm0, %zmm1
629 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
630 ; CHECK-NEXT: addq $64, %rax
631 ; CHECK-NEXT: jne .LBB18_1
632 ; CHECK-NEXT: # %bb.2: # %bb9
633 ; CHECK-NEXT: vzeroupper
638 bb1: ; preds = %bb1, %bb
639 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
640 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
641 %tmp3 = bitcast float* %tmp2 to <16 x float>*
642 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
643 %tmp5 = fneg <16 x float> %tmp4
644 %tmp6 = bitcast float* %tmp2 to <16 x float>*
645 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
646 %tmp7 = add i64 %tmp, 16
647 %tmp8 = icmp eq i64 %tmp7, 1024
648 br i1 %tmp8, label %bb9, label %bb1
; 256-bit fneg: xor with hoisted -0.0 splat via vbroadcastss + vxorps.
654 define void @bcast_unfold_fneg_v8f32(float* %arg) {
655 ; CHECK-LABEL: bcast_unfold_fneg_v8f32:
656 ; CHECK: # %bb.0: # %bb
657 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
658 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
659 ; CHECK-NEXT: .p2align 4, 0x90
660 ; CHECK-NEXT: .LBB19_1: # %bb1
661 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
662 ; CHECK-NEXT: vxorps 4096(%rdi,%rax), %ymm0, %ymm1
663 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
664 ; CHECK-NEXT: addq $32, %rax
665 ; CHECK-NEXT: jne .LBB19_1
666 ; CHECK-NEXT: # %bb.2: # %bb9
667 ; CHECK-NEXT: vzeroupper
672 bb1: ; preds = %bb1, %bb
673 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
674 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
675 %tmp3 = bitcast float* %tmp2 to <8 x float>*
676 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
677 %tmp5 = fneg <8 x float> %tmp4
678 %tmp6 = bitcast float* %tmp2 to <8 x float>*
679 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
680 %tmp7 = add i64 %tmp, 8
681 %tmp8 = icmp eq i64 %tmp7, 1024
682 br i1 %tmp8, label %bb9, label %bb1
; 128-bit fneg: xor with hoisted -0.0 splat; no vzeroupper (xmm-only loop).
688 define void @bcast_unfold_fneg_v4f32(float* %arg) {
689 ; CHECK-LABEL: bcast_unfold_fneg_v4f32:
690 ; CHECK: # %bb.0: # %bb
691 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
692 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
693 ; CHECK-NEXT: .p2align 4, 0x90
694 ; CHECK-NEXT: .LBB20_1: # %bb1
695 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
696 ; CHECK-NEXT: vxorps 4096(%rdi,%rax), %xmm0, %xmm1
697 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
698 ; CHECK-NEXT: addq $16, %rax
699 ; CHECK-NEXT: jne .LBB20_1
700 ; CHECK-NEXT: # %bb.2: # %bb9
705 bb1: ; preds = %bb1, %bb
706 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
707 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
708 %tmp3 = bitcast float* %tmp2 to <4 x float>*
709 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
710 %tmp5 = fneg <4 x float> %tmp4
711 %tmp6 = bitcast float* %tmp2 to <4 x float>*
712 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
713 %tmp7 = add i64 %tmp, 4
714 %tmp8 = icmp eq i64 %tmp7, 1024
715 br i1 %tmp8, label %bb9, label %bb1
; double fneg at 512 bits: xor with hoisted -0.0 splat via vpbroadcastq +
; vpxorq.
721 define void @bcast_unfold_fneg_v8f64(double* %arg) {
722 ; CHECK-LABEL: bcast_unfold_fneg_v8f64:
723 ; CHECK: # %bb.0: # %bb
724 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
725 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
726 ; CHECK-NEXT: .p2align 4, 0x90
727 ; CHECK-NEXT: .LBB21_1: # %bb1
728 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
729 ; CHECK-NEXT: vpxorq 8192(%rdi,%rax), %zmm0, %zmm1
730 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
731 ; CHECK-NEXT: addq $64, %rax
732 ; CHECK-NEXT: jne .LBB21_1
733 ; CHECK-NEXT: # %bb.2: # %bb9
734 ; CHECK-NEXT: vzeroupper
739 bb1: ; preds = %bb1, %bb
740 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
741 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
742 %tmp3 = bitcast double* %tmp2 to <8 x double>*
743 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
744 %tmp5 = fneg <8 x double> %tmp4
745 %tmp6 = bitcast double* %tmp2 to <8 x double>*
746 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
747 %tmp7 = add i64 %tmp, 8
748 %tmp8 = icmp eq i64 %tmp7, 1024
749 br i1 %tmp8, label %bb9, label %bb1
; double fneg at 256 bits: xor with hoisted -0.0 splat via vbroadcastsd +
; vxorps.
755 define void @bcast_unfold_fneg_v4f64(double* %arg) {
756 ; CHECK-LABEL: bcast_unfold_fneg_v4f64:
757 ; CHECK: # %bb.0: # %bb
758 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
759 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
760 ; CHECK-NEXT: .p2align 4, 0x90
761 ; CHECK-NEXT: .LBB22_1: # %bb1
762 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
763 ; CHECK-NEXT: vxorps 8192(%rdi,%rax), %ymm0, %ymm1
764 ; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
765 ; CHECK-NEXT: addq $32, %rax
766 ; CHECK-NEXT: jne .LBB22_1
767 ; CHECK-NEXT: # %bb.2: # %bb9
768 ; CHECK-NEXT: vzeroupper
773 bb1: ; preds = %bb1, %bb
774 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
775 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
776 %tmp3 = bitcast double* %tmp2 to <4 x double>*
777 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
778 %tmp5 = fneg <4 x double> %tmp4
779 %tmp6 = bitcast double* %tmp2 to <4 x double>*
780 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
781 %tmp7 = add i64 %tmp, 4
782 %tmp8 = icmp eq i64 %tmp7, 1024
783 br i1 %tmp8, label %bb9, label %bb1
; double fneg at 128 bits: -0.0 splat built with vmovddup, combined via
; vxorps; xmm-only loop (no vzeroupper).
789 define void @bcast_unfold_fneg_v2f64(double* %arg) {
790 ; CHECK-LABEL: bcast_unfold_fneg_v2f64:
791 ; CHECK: # %bb.0: # %bb
792 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
793 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0]
794 ; CHECK-NEXT: # xmm0 = mem[0,0]
795 ; CHECK-NEXT: .p2align 4, 0x90
796 ; CHECK-NEXT: .LBB23_1: # %bb1
797 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
798 ; CHECK-NEXT: vxorps 8192(%rdi,%rax), %xmm0, %xmm1
799 ; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
800 ; CHECK-NEXT: addq $16, %rax
801 ; CHECK-NEXT: jne .LBB23_1
802 ; CHECK-NEXT: # %bb.2: # %bb9
807 bb1: ; preds = %bb1, %bb
808 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
809 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
810 %tmp3 = bitcast double* %tmp2 to <2 x double>*
811 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
812 %tmp5 = fneg <2 x double> %tmp4
813 %tmp6 = bitcast double* %tmp2 to <2 x double>*
814 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
815 %tmp7 = add i64 %tmp, 2
816 %tmp8 = icmp eq i64 %tmp7, 1024
817 br i1 %tmp8, label %bb9, label %bb1
; fabs of <16 x float>: lowered as AND with a hoisted sign-bit-clear mask
; (printed as NaN by the check-line constant dumper) — vpbroadcastd + vpandd.
823 define void @bcast_unfold_fabs_v16f32(float* %arg) {
824 ; CHECK-LABEL: bcast_unfold_fabs_v16f32:
825 ; CHECK: # %bb.0: # %bb
826 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
827 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
828 ; CHECK-NEXT: .p2align 4, 0x90
829 ; CHECK-NEXT: .LBB24_1: # %bb1
830 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
831 ; CHECK-NEXT: vpandd 4096(%rdi,%rax), %zmm0, %zmm1
832 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
833 ; CHECK-NEXT: addq $64, %rax
834 ; CHECK-NEXT: jne .LBB24_1
835 ; CHECK-NEXT: # %bb.2: # %bb9
836 ; CHECK-NEXT: vzeroupper
841 bb1: ; preds = %bb1, %bb
842 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
843 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
844 %tmp3 = bitcast float* %tmp2 to <16 x float>*
845 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
846 %tmp5 = call <16 x float> @llvm.fabs.v16f32(<16 x float> %tmp4)
847 %tmp6 = bitcast float* %tmp2 to <16 x float>*
848 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
849 %tmp7 = add i64 %tmp, 16
850 %tmp8 = icmp eq i64 %tmp7, 1024
851 br i1 %tmp8, label %bb9, label %bb1
857 ; Function Attrs: nounwind readnone speculatable willreturn
858 declare <16 x float> @llvm.fabs.v16f32(<16 x float>) #0
; 256-bit fabs: AND with hoisted mask splat via vbroadcastss + vandps.
860 define void @bcast_unfold_fabs_v8f32(float* %arg) {
861 ; CHECK-LABEL: bcast_unfold_fabs_v8f32:
862 ; CHECK: # %bb.0: # %bb
863 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
864 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
865 ; CHECK-NEXT: .p2align 4, 0x90
866 ; CHECK-NEXT: .LBB25_1: # %bb1
867 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
868 ; CHECK-NEXT: vandps 4096(%rdi,%rax), %ymm0, %ymm1
869 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
870 ; CHECK-NEXT: addq $32, %rax
871 ; CHECK-NEXT: jne .LBB25_1
872 ; CHECK-NEXT: # %bb.2: # %bb9
873 ; CHECK-NEXT: vzeroupper
878 bb1: ; preds = %bb1, %bb
879 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
880 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
881 %tmp3 = bitcast float* %tmp2 to <8 x float>*
882 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
883 %tmp5 = call <8 x float> @llvm.fabs.v8f32(<8 x float> %tmp4)
884 %tmp6 = bitcast float* %tmp2 to <8 x float>*
885 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
886 %tmp7 = add i64 %tmp, 8
887 %tmp8 = icmp eq i64 %tmp7, 1024
888 br i1 %tmp8, label %bb9, label %bb1
894 ; Function Attrs: nounwind readnone speculatable willreturn
895 declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #0
; 128-bit fabs: AND with hoisted mask splat; no vzeroupper (xmm-only loop).
897 define void @bcast_unfold_fabs_v4f32(float* %arg) {
898 ; CHECK-LABEL: bcast_unfold_fabs_v4f32:
899 ; CHECK: # %bb.0: # %bb
900 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
901 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
902 ; CHECK-NEXT: .p2align 4, 0x90
903 ; CHECK-NEXT: .LBB26_1: # %bb1
904 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
905 ; CHECK-NEXT: vandps 4096(%rdi,%rax), %xmm0, %xmm1
906 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
907 ; CHECK-NEXT: addq $16, %rax
908 ; CHECK-NEXT: jne .LBB26_1
909 ; CHECK-NEXT: # %bb.2: # %bb9
914 bb1: ; preds = %bb1, %bb
915 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
916 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
917 %tmp3 = bitcast float* %tmp2 to <4 x float>*
918 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
919 %tmp5 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %tmp4)
920 %tmp6 = bitcast float* %tmp2 to <4 x float>*
921 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
922 %tmp7 = add i64 %tmp, 4
923 %tmp8 = icmp eq i64 %tmp7, 1024
924 br i1 %tmp8, label %bb9, label %bb1
930 ; Function Attrs: nounwind readnone speculatable willreturn
931 declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #0
; double fabs at 512 bits: AND with hoisted mask splat via vpbroadcastq +
; vpandq.
933 define void @bcast_unfold_fabs_v8f64(double* %arg) {
934 ; CHECK-LABEL: bcast_unfold_fabs_v8f64:
935 ; CHECK: # %bb.0: # %bb
936 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
937 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
938 ; CHECK-NEXT: .p2align 4, 0x90
939 ; CHECK-NEXT: .LBB27_1: # %bb1
940 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
941 ; CHECK-NEXT: vpandq 8192(%rdi,%rax), %zmm0, %zmm1
942 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
943 ; CHECK-NEXT: addq $64, %rax
944 ; CHECK-NEXT: jne .LBB27_1
945 ; CHECK-NEXT: # %bb.2: # %bb9
946 ; CHECK-NEXT: vzeroupper
951 bb1: ; preds = %bb1, %bb
952 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
953 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
954 %tmp3 = bitcast double* %tmp2 to <8 x double>*
955 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
956 %tmp5 = call <8 x double> @llvm.fabs.v8f64(<8 x double> %tmp4)
957 %tmp6 = bitcast double* %tmp2 to <8 x double>*
958 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
959 %tmp7 = add i64 %tmp, 8
960 %tmp8 = icmp eq i64 %tmp7, 1024
961 br i1 %tmp8, label %bb9, label %bb1
967 ; Function Attrs: nounwind readnone speculatable willreturn
968 declare <8 x double> @llvm.fabs.v8f64(<8 x double>) #0
970 define void @bcast_unfold_fabs_v4f64(double* %arg) {
971 ; CHECK-LABEL: bcast_unfold_fabs_v4f64:
972 ; CHECK: # %bb.0: # %bb
973 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
974 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
975 ; CHECK-NEXT: .p2align 4, 0x90
976 ; CHECK-NEXT: .LBB28_1: # %bb1
977 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
978 ; CHECK-NEXT: vandps 8192(%rdi,%rax), %ymm0, %ymm1
979 ; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax)
980 ; CHECK-NEXT: addq $32, %rax
981 ; CHECK-NEXT: jne .LBB28_1
982 ; CHECK-NEXT: # %bb.2: # %bb9
983 ; CHECK-NEXT: vzeroupper
988 bb1: ; preds = %bb1, %bb
989 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
990 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
991 %tmp3 = bitcast double* %tmp2 to <4 x double>*
992 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
993 %tmp5 = call <4 x double> @llvm.fabs.v4f64(<4 x double> %tmp4)
994 %tmp6 = bitcast double* %tmp2 to <4 x double>*
995 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
996 %tmp7 = add i64 %tmp, 4
997 %tmp8 = icmp eq i64 %tmp7, 1024
998 br i1 %tmp8, label %bb9, label %bb1
1004 ; Function Attrs: nounwind readnone speculatable willreturn
1005 declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #0
1007 define void @bcast_unfold_fabs_v2f64(double* %arg) {
1008 ; CHECK-LABEL: bcast_unfold_fabs_v2f64:
1009 ; CHECK: # %bb.0: # %bb
1010 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1011 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [NaN,NaN]
1012 ; CHECK-NEXT: # xmm0 = mem[0,0]
1013 ; CHECK-NEXT: .p2align 4, 0x90
1014 ; CHECK-NEXT: .LBB29_1: # %bb1
1015 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1016 ; CHECK-NEXT: vandps 8192(%rdi,%rax), %xmm0, %xmm1
1017 ; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax)
1018 ; CHECK-NEXT: addq $16, %rax
1019 ; CHECK-NEXT: jne .LBB29_1
1020 ; CHECK-NEXT: # %bb.2: # %bb9
1025 bb1: ; preds = %bb1, %bb
1026 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1027 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1028 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1029 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1030 %tmp5 = call <2 x double> @llvm.fabs.v2f64(<2 x double> %tmp4)
1031 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1032 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1033 %tmp7 = add i64 %tmp, 2
1034 %tmp8 = icmp eq i64 %tmp7, 1024
1035 br i1 %tmp8, label %bb9, label %bb1
1041 ; Function Attrs: nounwind readnone speculatable willreturn
1042 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #0
1044 define void @bcast_unfold_fadd_v16f32(float* nocapture %arg) {
1045 ; CHECK-LABEL: bcast_unfold_fadd_v16f32:
1046 ; CHECK: # %bb.0: # %bb
1047 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1048 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1049 ; CHECK-NEXT: .p2align 4, 0x90
1050 ; CHECK-NEXT: .LBB30_1: # %bb1
1051 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1052 ; CHECK-NEXT: vaddps 4096(%rdi,%rax), %zmm0, %zmm1
1053 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1054 ; CHECK-NEXT: addq $64, %rax
1055 ; CHECK-NEXT: jne .LBB30_1
1056 ; CHECK-NEXT: # %bb.2: # %bb9
1057 ; CHECK-NEXT: vzeroupper
1062 bb1: ; preds = %bb1, %bb
1063 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1064 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1065 %tmp3 = bitcast float* %tmp2 to <16 x float>*
1066 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
1067 %tmp5 = fadd <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1068 %tmp6 = bitcast float* %tmp2 to <16 x float>*
1069 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
1070 %tmp7 = add i64 %tmp, 16
1071 %tmp8 = icmp eq i64 %tmp7, 1024
1072 br i1 %tmp8, label %bb9, label %bb1
1078 define void @bcast_unfold_fadd_v8f32(float* nocapture %arg) {
1079 ; CHECK-LABEL: bcast_unfold_fadd_v8f32:
1080 ; CHECK: # %bb.0: # %bb
1081 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1082 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1083 ; CHECK-NEXT: .p2align 4, 0x90
1084 ; CHECK-NEXT: .LBB31_1: # %bb1
1085 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1086 ; CHECK-NEXT: vaddps 4096(%rdi,%rax), %ymm0, %ymm1
1087 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1088 ; CHECK-NEXT: addq $32, %rax
1089 ; CHECK-NEXT: jne .LBB31_1
1090 ; CHECK-NEXT: # %bb.2: # %bb9
1091 ; CHECK-NEXT: vzeroupper
1096 bb1: ; preds = %bb1, %bb
1097 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1098 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1099 %tmp3 = bitcast float* %tmp2 to <8 x float>*
1100 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
1101 %tmp5 = fadd <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1102 %tmp6 = bitcast float* %tmp2 to <8 x float>*
1103 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
1104 %tmp7 = add i64 %tmp, 8
1105 %tmp8 = icmp eq i64 %tmp7, 1024
1106 br i1 %tmp8, label %bb9, label %bb1
1112 define void @bcast_unfold_fadd_v4f32(float* nocapture %arg) {
1113 ; CHECK-LABEL: bcast_unfold_fadd_v4f32:
1114 ; CHECK: # %bb.0: # %bb
1115 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1116 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1117 ; CHECK-NEXT: .p2align 4, 0x90
1118 ; CHECK-NEXT: .LBB32_1: # %bb1
1119 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1120 ; CHECK-NEXT: vaddps 4096(%rdi,%rax), %xmm0, %xmm1
1121 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1122 ; CHECK-NEXT: addq $16, %rax
1123 ; CHECK-NEXT: jne .LBB32_1
1124 ; CHECK-NEXT: # %bb.2: # %bb9
1129 bb1: ; preds = %bb1, %bb
1130 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1131 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1132 %tmp3 = bitcast float* %tmp2 to <4 x float>*
1133 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1134 %tmp5 = fadd <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1135 %tmp6 = bitcast float* %tmp2 to <4 x float>*
1136 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
1137 %tmp7 = add i64 %tmp, 4
1138 %tmp8 = icmp eq i64 %tmp7, 1024
1139 br i1 %tmp8, label %bb9, label %bb1
1145 define void @bcast_unfold_fadd_v8f64(double* nocapture %arg) {
1146 ; CHECK-LABEL: bcast_unfold_fadd_v8f64:
1147 ; CHECK: # %bb.0: # %bb
1148 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1149 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1150 ; CHECK-NEXT: .p2align 4, 0x90
1151 ; CHECK-NEXT: .LBB33_1: # %bb1
1152 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1153 ; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %zmm0, %zmm1
1154 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
1155 ; CHECK-NEXT: addq $64, %rax
1156 ; CHECK-NEXT: jne .LBB33_1
1157 ; CHECK-NEXT: # %bb.2: # %bb9
1158 ; CHECK-NEXT: vzeroupper
1163 bb1: ; preds = %bb1, %bb
1164 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1165 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1166 %tmp3 = bitcast double* %tmp2 to <8 x double>*
1167 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
1168 %tmp5 = fadd <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1169 %tmp6 = bitcast double* %tmp2 to <8 x double>*
1170 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
1171 %tmp7 = add i64 %tmp, 8
1172 %tmp8 = icmp eq i64 %tmp7, 1024
1173 br i1 %tmp8, label %bb9, label %bb1
1179 define void @bcast_unfold_fadd_v4f64(double* nocapture %arg) {
1180 ; CHECK-LABEL: bcast_unfold_fadd_v4f64:
1181 ; CHECK: # %bb.0: # %bb
1182 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1183 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1184 ; CHECK-NEXT: .p2align 4, 0x90
1185 ; CHECK-NEXT: .LBB34_1: # %bb1
1186 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1187 ; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %ymm0, %ymm1
1188 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1189 ; CHECK-NEXT: addq $32, %rax
1190 ; CHECK-NEXT: jne .LBB34_1
1191 ; CHECK-NEXT: # %bb.2: # %bb9
1192 ; CHECK-NEXT: vzeroupper
1197 bb1: ; preds = %bb1, %bb
1198 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1199 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1200 %tmp3 = bitcast double* %tmp2 to <4 x double>*
1201 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
1202 %tmp5 = fadd <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1203 %tmp6 = bitcast double* %tmp2 to <4 x double>*
1204 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
1205 %tmp7 = add i64 %tmp, 4
1206 %tmp8 = icmp eq i64 %tmp7, 1024
1207 br i1 %tmp8, label %bb9, label %bb1
1213 define void @bcast_unfold_fadd_v2f64(double* nocapture %arg) {
1214 ; CHECK-LABEL: bcast_unfold_fadd_v2f64:
1215 ; CHECK: # %bb.0: # %bb
1216 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1217 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
1218 ; CHECK-NEXT: # xmm0 = mem[0,0]
1219 ; CHECK-NEXT: .p2align 4, 0x90
1220 ; CHECK-NEXT: .LBB35_1: # %bb1
1221 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1222 ; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %xmm0, %xmm1
1223 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1224 ; CHECK-NEXT: addq $16, %rax
1225 ; CHECK-NEXT: jne .LBB35_1
1226 ; CHECK-NEXT: # %bb.2: # %bb9
1231 bb1: ; preds = %bb1, %bb
1232 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1233 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1234 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1235 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1236 %tmp5 = fadd <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
1237 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1238 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1239 %tmp7 = add i64 %tmp, 2
1240 %tmp8 = icmp eq i64 %tmp7, 1024
1241 br i1 %tmp8, label %bb9, label %bb1
1247 define void @bcast_unfold_fmul_v16f32(float* nocapture %arg) {
1248 ; CHECK-LABEL: bcast_unfold_fmul_v16f32:
1249 ; CHECK: # %bb.0: # %bb
1250 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1251 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1252 ; CHECK-NEXT: .p2align 4, 0x90
1253 ; CHECK-NEXT: .LBB36_1: # %bb1
1254 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1255 ; CHECK-NEXT: vmulps 4096(%rdi,%rax), %zmm0, %zmm1
1256 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1257 ; CHECK-NEXT: addq $64, %rax
1258 ; CHECK-NEXT: jne .LBB36_1
1259 ; CHECK-NEXT: # %bb.2: # %bb9
1260 ; CHECK-NEXT: vzeroupper
1265 bb1: ; preds = %bb1, %bb
1266 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1267 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1268 %tmp3 = bitcast float* %tmp2 to <16 x float>*
1269 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
1270 %tmp5 = fmul <16 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
1271 %tmp6 = bitcast float* %tmp2 to <16 x float>*
1272 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
1273 %tmp7 = add i64 %tmp, 16
1274 %tmp8 = icmp eq i64 %tmp7, 1024
1275 br i1 %tmp8, label %bb9, label %bb1
1281 define void @bcast_unfold_fmul_v8f32(float* nocapture %arg) {
1282 ; CHECK-LABEL: bcast_unfold_fmul_v8f32:
1283 ; CHECK: # %bb.0: # %bb
1284 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1285 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1286 ; CHECK-NEXT: .p2align 4, 0x90
1287 ; CHECK-NEXT: .LBB37_1: # %bb1
1288 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1289 ; CHECK-NEXT: vmulps 4096(%rdi,%rax), %ymm0, %ymm1
1290 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1291 ; CHECK-NEXT: addq $32, %rax
1292 ; CHECK-NEXT: jne .LBB37_1
1293 ; CHECK-NEXT: # %bb.2: # %bb9
1294 ; CHECK-NEXT: vzeroupper
1299 bb1: ; preds = %bb1, %bb
1300 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1301 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1302 %tmp3 = bitcast float* %tmp2 to <8 x float>*
1303 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
1304 %tmp5 = fmul <8 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
1305 %tmp6 = bitcast float* %tmp2 to <8 x float>*
1306 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
1307 %tmp7 = add i64 %tmp, 8
1308 %tmp8 = icmp eq i64 %tmp7, 1024
1309 br i1 %tmp8, label %bb9, label %bb1
1315 define void @bcast_unfold_fmul_v4f32(float* nocapture %arg) {
1316 ; CHECK-LABEL: bcast_unfold_fmul_v4f32:
1317 ; CHECK: # %bb.0: # %bb
1318 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1319 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1320 ; CHECK-NEXT: .p2align 4, 0x90
1321 ; CHECK-NEXT: .LBB38_1: # %bb1
1322 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1323 ; CHECK-NEXT: vmulps 4096(%rdi,%rax), %xmm0, %xmm1
1324 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1325 ; CHECK-NEXT: addq $16, %rax
1326 ; CHECK-NEXT: jne .LBB38_1
1327 ; CHECK-NEXT: # %bb.2: # %bb9
1332 bb1: ; preds = %bb1, %bb
1333 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1334 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1335 %tmp3 = bitcast float* %tmp2 to <4 x float>*
1336 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1337 %tmp5 = fmul <4 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
1338 %tmp6 = bitcast float* %tmp2 to <4 x float>*
1339 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
1340 %tmp7 = add i64 %tmp, 4
1341 %tmp8 = icmp eq i64 %tmp7, 1024
1342 br i1 %tmp8, label %bb9, label %bb1
1348 define void @bcast_unfold_fmul_v8f64(double* nocapture %arg) {
1349 ; CHECK-LABEL: bcast_unfold_fmul_v8f64:
1350 ; CHECK: # %bb.0: # %bb
1351 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1352 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1353 ; CHECK-NEXT: .p2align 4, 0x90
1354 ; CHECK-NEXT: .LBB39_1: # %bb1
1355 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1356 ; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %zmm0, %zmm1
1357 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
1358 ; CHECK-NEXT: addq $64, %rax
1359 ; CHECK-NEXT: jne .LBB39_1
1360 ; CHECK-NEXT: # %bb.2: # %bb9
1361 ; CHECK-NEXT: vzeroupper
1366 bb1: ; preds = %bb1, %bb
1367 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1368 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1369 %tmp3 = bitcast double* %tmp2 to <8 x double>*
1370 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
1371 %tmp5 = fmul <8 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
1372 %tmp6 = bitcast double* %tmp2 to <8 x double>*
1373 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
1374 %tmp7 = add i64 %tmp, 8
1375 %tmp8 = icmp eq i64 %tmp7, 1024
1376 br i1 %tmp8, label %bb9, label %bb1
1382 define void @bcast_unfold_fmul_v4f64(double* nocapture %arg) {
1383 ; CHECK-LABEL: bcast_unfold_fmul_v4f64:
1384 ; CHECK: # %bb.0: # %bb
1385 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1386 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1387 ; CHECK-NEXT: .p2align 4, 0x90
1388 ; CHECK-NEXT: .LBB40_1: # %bb1
1389 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1390 ; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %ymm0, %ymm1
1391 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1392 ; CHECK-NEXT: addq $32, %rax
1393 ; CHECK-NEXT: jne .LBB40_1
1394 ; CHECK-NEXT: # %bb.2: # %bb9
1395 ; CHECK-NEXT: vzeroupper
1400 bb1: ; preds = %bb1, %bb
1401 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1402 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1403 %tmp3 = bitcast double* %tmp2 to <4 x double>*
1404 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
1405 %tmp5 = fmul <4 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
1406 %tmp6 = bitcast double* %tmp2 to <4 x double>*
1407 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
1408 %tmp7 = add i64 %tmp, 4
1409 %tmp8 = icmp eq i64 %tmp7, 1024
1410 br i1 %tmp8, label %bb9, label %bb1
1416 define void @bcast_unfold_fmul_v2f64(double* nocapture %arg) {
1417 ; CHECK-LABEL: bcast_unfold_fmul_v2f64:
1418 ; CHECK: # %bb.0: # %bb
1419 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1420 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
1421 ; CHECK-NEXT: # xmm0 = mem[0,0]
1422 ; CHECK-NEXT: .p2align 4, 0x90
1423 ; CHECK-NEXT: .LBB41_1: # %bb1
1424 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1425 ; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %xmm0, %xmm1
1426 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1427 ; CHECK-NEXT: addq $16, %rax
1428 ; CHECK-NEXT: jne .LBB41_1
1429 ; CHECK-NEXT: # %bb.2: # %bb9
1434 bb1: ; preds = %bb1, %bb
1435 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1436 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1437 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1438 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1439 %tmp5 = fmul <2 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00>
1440 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1441 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1442 %tmp7 = add i64 %tmp, 2
1443 %tmp8 = icmp eq i64 %tmp7, 1024
1444 br i1 %tmp8, label %bb9, label %bb1
1450 define void @bcast_unfold_fdiv_v16f32(float* nocapture %arg) {
1451 ; CHECK-LABEL: bcast_unfold_fdiv_v16f32:
1452 ; CHECK: # %bb.0: # %bb
1453 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1454 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1455 ; CHECK-NEXT: .p2align 4, 0x90
1456 ; CHECK-NEXT: .LBB42_1: # %bb1
1457 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1458 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
1459 ; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm1
1460 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1461 ; CHECK-NEXT: addq $64, %rax
1462 ; CHECK-NEXT: jne .LBB42_1
1463 ; CHECK-NEXT: # %bb.2: # %bb9
1464 ; CHECK-NEXT: vzeroupper
1469 bb1: ; preds = %bb1, %bb
1470 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1471 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1472 %tmp3 = bitcast float* %tmp2 to <16 x float>*
1473 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
1474 %tmp5 = fdiv <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1475 %tmp6 = bitcast float* %tmp2 to <16 x float>*
1476 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
1477 %tmp7 = add i64 %tmp, 16
1478 %tmp8 = icmp eq i64 %tmp7, 1024
1479 br i1 %tmp8, label %bb9, label %bb1
1485 define void @bcast_unfold_fdiv_v8f32(float* nocapture %arg) {
1486 ; CHECK-LABEL: bcast_unfold_fdiv_v8f32:
1487 ; CHECK: # %bb.0: # %bb
1488 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1489 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1490 ; CHECK-NEXT: .p2align 4, 0x90
1491 ; CHECK-NEXT: .LBB43_1: # %bb1
1492 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1493 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
1494 ; CHECK-NEXT: vdivps %ymm0, %ymm1, %ymm1
1495 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1496 ; CHECK-NEXT: addq $32, %rax
1497 ; CHECK-NEXT: jne .LBB43_1
1498 ; CHECK-NEXT: # %bb.2: # %bb9
1499 ; CHECK-NEXT: vzeroupper
1504 bb1: ; preds = %bb1, %bb
1505 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1506 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1507 %tmp3 = bitcast float* %tmp2 to <8 x float>*
1508 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
1509 %tmp5 = fdiv <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1510 %tmp6 = bitcast float* %tmp2 to <8 x float>*
1511 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
1512 %tmp7 = add i64 %tmp, 8
1513 %tmp8 = icmp eq i64 %tmp7, 1024
1514 br i1 %tmp8, label %bb9, label %bb1
1520 define void @bcast_unfold_fdiv_v4f32(float* nocapture %arg) {
1521 ; CHECK-LABEL: bcast_unfold_fdiv_v4f32:
1522 ; CHECK: # %bb.0: # %bb
1523 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1524 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1525 ; CHECK-NEXT: .p2align 4, 0x90
1526 ; CHECK-NEXT: .LBB44_1: # %bb1
1527 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1528 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
1529 ; CHECK-NEXT: vdivps %xmm0, %xmm1, %xmm1
1530 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1531 ; CHECK-NEXT: addq $16, %rax
1532 ; CHECK-NEXT: jne .LBB44_1
1533 ; CHECK-NEXT: # %bb.2: # %bb9
1538 bb1: ; preds = %bb1, %bb
1539 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1540 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1541 %tmp3 = bitcast float* %tmp2 to <4 x float>*
1542 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1543 %tmp5 = fdiv <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1544 %tmp6 = bitcast float* %tmp2 to <4 x float>*
1545 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
1546 %tmp7 = add i64 %tmp, 4
1547 %tmp8 = icmp eq i64 %tmp7, 1024
1548 br i1 %tmp8, label %bb9, label %bb1
1554 define void @bcast_unfold_fdiv_v8f64(double* nocapture %arg) {
1555 ; CHECK-LABEL: bcast_unfold_fdiv_v8f64:
1556 ; CHECK: # %bb.0: # %bb
1557 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1558 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1559 ; CHECK-NEXT: .p2align 4, 0x90
1560 ; CHECK-NEXT: .LBB45_1: # %bb1
1561 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1562 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
1563 ; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm1
1564 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
1565 ; CHECK-NEXT: addq $64, %rax
1566 ; CHECK-NEXT: jne .LBB45_1
1567 ; CHECK-NEXT: # %bb.2: # %bb9
1568 ; CHECK-NEXT: vzeroupper
1573 bb1: ; preds = %bb1, %bb
1574 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1575 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1576 %tmp3 = bitcast double* %tmp2 to <8 x double>*
1577 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
1578 %tmp5 = fdiv <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1579 %tmp6 = bitcast double* %tmp2 to <8 x double>*
1580 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
1581 %tmp7 = add i64 %tmp, 8
1582 %tmp8 = icmp eq i64 %tmp7, 1024
1583 br i1 %tmp8, label %bb9, label %bb1
1589 define void @bcast_unfold_fdiv_v4f64(double* nocapture %arg) {
1590 ; CHECK-LABEL: bcast_unfold_fdiv_v4f64:
1591 ; CHECK: # %bb.0: # %bb
1592 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1593 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1594 ; CHECK-NEXT: .p2align 4, 0x90
1595 ; CHECK-NEXT: .LBB46_1: # %bb1
1596 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1597 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
1598 ; CHECK-NEXT: vdivpd %ymm0, %ymm1, %ymm1
1599 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1600 ; CHECK-NEXT: addq $32, %rax
1601 ; CHECK-NEXT: jne .LBB46_1
1602 ; CHECK-NEXT: # %bb.2: # %bb9
1603 ; CHECK-NEXT: vzeroupper
1608 bb1: ; preds = %bb1, %bb
1609 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1610 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1611 %tmp3 = bitcast double* %tmp2 to <4 x double>*
1612 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
1613 %tmp5 = fdiv <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1614 %tmp6 = bitcast double* %tmp2 to <4 x double>*
1615 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
1616 %tmp7 = add i64 %tmp, 4
1617 %tmp8 = icmp eq i64 %tmp7, 1024
1618 br i1 %tmp8, label %bb9, label %bb1
1624 define void @bcast_unfold_fdiv_v2f64(double* nocapture %arg) {
1625 ; CHECK-LABEL: bcast_unfold_fdiv_v2f64:
1626 ; CHECK: # %bb.0: # %bb
1627 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1628 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
1629 ; CHECK-NEXT: # xmm0 = mem[0,0]
1630 ; CHECK-NEXT: .p2align 4, 0x90
1631 ; CHECK-NEXT: .LBB47_1: # %bb1
1632 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1633 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
1634 ; CHECK-NEXT: vdivpd %xmm0, %xmm1, %xmm1
1635 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1636 ; CHECK-NEXT: addq $16, %rax
1637 ; CHECK-NEXT: jne .LBB47_1
1638 ; CHECK-NEXT: # %bb.2: # %bb9
1643 bb1: ; preds = %bb1, %bb
1644 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1645 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1646 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1647 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1648 %tmp5 = fdiv <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
1649 %tmp6 = bitcast double* %tmp2 to <2 x double>*
1650 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1651 %tmp7 = add i64 %tmp, 2
1652 %tmp8 = icmp eq i64 %tmp7, 1024
1653 br i1 %tmp8, label %bb9, label %bb1
1659 define void @bcast_unfold_fma213_v4f32(float* %arg) {
1660 ; CHECK-LABEL: bcast_unfold_fma213_v4f32:
1661 ; CHECK: # %bb.0: # %bb
1662 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1663 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1664 ; CHECK-NEXT: .p2align 4, 0x90
1665 ; CHECK-NEXT: .LBB48_1: # %bb2
1666 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1667 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
1668 ; CHECK-NEXT: vfmadd213ps {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
1669 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1670 ; CHECK-NEXT: addq $16, %rax
1671 ; CHECK-NEXT: jne .LBB48_1
1672 ; CHECK-NEXT: # %bb.2: # %bb11
1677 bb2: ; preds = %bb2, %bb
1678 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
1679 %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
1680 %tmp4 = bitcast float* %tmp3 to <4 x float>*
1681 %tmp5 = load <4 x float>, <4 x float>* %tmp4, align 4
1682 %tmp6 = fmul contract <4 x float> %tmp5, %tmp5
1683 %tmp7 = fadd contract <4 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1684 %tmp8 = bitcast float* %tmp3 to <4 x float>*
1685 store <4 x float> %tmp7, <4 x float>* %tmp8, align 4
1686 %tmp9 = add i64 %tmp, 4
1687 %tmp10 = icmp eq i64 %tmp9, 1024
1688 br i1 %tmp10, label %bb11, label %bb2
1690 bb11: ; preds = %bb2
1694 define void @bcast_unfold_fma231_v4f32(float* %arg) {
1695 ; CHECK-LABEL: bcast_unfold_fma231_v4f32:
1696 ; CHECK: # %bb.0: # %bb
1697 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1698 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1699 ; CHECK-NEXT: .p2align 4, 0x90
1700 ; CHECK-NEXT: .LBB49_1: # %bb1
1701 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1702 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
1703 ; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
1704 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
1705 ; CHECK-NEXT: addq $16, %rax
1706 ; CHECK-NEXT: jne .LBB49_1
1707 ; CHECK-NEXT: # %bb.2: # %bb10
1712 bb1: ; preds = %bb1, %bb
1713 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
1714 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1715 %tmp3 = bitcast float* %tmp2 to <4 x float>*
1716 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1717 %tmp5 = fmul contract <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1718 %tmp6 = fadd contract <4 x float> %tmp4, %tmp5
1719 %tmp7 = bitcast float* %tmp2 to <4 x float>*
1720 store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
1721 %tmp8 = add i64 %tmp, 4
1722 %tmp9 = icmp eq i64 %tmp8, 1024
1723 br i1 %tmp9, label %bb10, label %bb1
1725 bb10: ; preds = %bb1
1729 define void @bcast_unfold_fma213_v8f32(float* %arg) {
1730 ; CHECK-LABEL: bcast_unfold_fma213_v8f32:
1731 ; CHECK: # %bb.0: # %bb
1732 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1733 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1734 ; CHECK-NEXT: .p2align 4, 0x90
1735 ; CHECK-NEXT: .LBB50_1: # %bb2
1736 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1737 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
1738 ; CHECK-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
1739 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1740 ; CHECK-NEXT: addq $32, %rax
1741 ; CHECK-NEXT: jne .LBB50_1
1742 ; CHECK-NEXT: # %bb.2: # %bb11
1743 ; CHECK-NEXT: vzeroupper
1748 bb2: ; preds = %bb2, %bb
1749 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
1750 %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
1751 %tmp4 = bitcast float* %tmp3 to <8 x float>*
1752 %tmp5 = load <8 x float>, <8 x float>* %tmp4, align 4
1753 %tmp6 = fmul contract <8 x float> %tmp5, %tmp5
1754 %tmp7 = fadd contract <8 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1755 %tmp8 = bitcast float* %tmp3 to <8 x float>*
1756 store <8 x float> %tmp7, <8 x float>* %tmp8, align 4
1757 %tmp9 = add i64 %tmp, 8
1758 %tmp10 = icmp eq i64 %tmp9, 1024
1759 br i1 %tmp10, label %bb11, label %bb2
1761 bb11: ; preds = %bb2
; FMA-231 pattern on <8 x float>: IR computes x + (x * splat(2.0)) with
; 'contract', so the backend may form vfmadd231ps. CHECK lines verify the 2.0
; splat is materialized once via vbroadcastss into ymm0 outside the loop
; (unfolded from a broadcast memory operand) and reused each iteration.
1765 define void @bcast_unfold_fma231_v8f32(float* %arg) {
1766 ; CHECK-LABEL: bcast_unfold_fma231_v8f32:
1767 ; CHECK: # %bb.0: # %bb
1768 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1769 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1770 ; CHECK-NEXT: .p2align 4, 0x90
1771 ; CHECK-NEXT: .LBB51_1: # %bb1
1772 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1773 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
1774 ; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
1775 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
1776 ; CHECK-NEXT: addq $32, %rax
1777 ; CHECK-NEXT: jne .LBB51_1
1778 ; CHECK-NEXT: # %bb.2: # %bb10
1779 ; CHECK-NEXT: vzeroupper
1784 bb1: ; preds = %bb1, %bb
1785 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
1786 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1787 %tmp3 = bitcast float* %tmp2 to <8 x float>*
1788 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
1789 %tmp5 = fmul contract <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1790 %tmp6 = fadd contract <8 x float> %tmp4, %tmp5
1791 %tmp7 = bitcast float* %tmp2 to <8 x float>*
1792 store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
1793 %tmp8 = add i64 %tmp, 8
1794 %tmp9 = icmp eq i64 %tmp8, 1024
1795 br i1 %tmp9, label %bb10, label %bb1
1797 bb10: ; preds = %bb1
; FMA-213 pattern on <16 x float>: IR computes (x * x) + splat(2.0) with
; 'contract'. CHECK lines verify the constant is hoisted into zmm0 with one
; vbroadcastss and appears as the addend register of vfmadd213ps in the loop.
1801 define void @bcast_unfold_fma213_v16f32(float* %arg) {
1802 ; CHECK-LABEL: bcast_unfold_fma213_v16f32:
1803 ; CHECK: # %bb.0: # %bb
1804 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1805 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1806 ; CHECK-NEXT: .p2align 4, 0x90
1807 ; CHECK-NEXT: .LBB52_1: # %bb2
1808 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1809 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
1810 ; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
1811 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1812 ; CHECK-NEXT: addq $64, %rax
1813 ; CHECK-NEXT: jne .LBB52_1
1814 ; CHECK-NEXT: # %bb.2: # %bb11
1815 ; CHECK-NEXT: vzeroupper
1820 bb2: ; preds = %bb2, %bb
1821 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
1822 %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
1823 %tmp4 = bitcast float* %tmp3 to <16 x float>*
1824 %tmp5 = load <16 x float>, <16 x float>* %tmp4, align 4
1825 %tmp6 = fmul contract <16 x float> %tmp5, %tmp5
1826 %tmp7 = fadd contract <16 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1827 %tmp8 = bitcast float* %tmp3 to <16 x float>*
1828 store <16 x float> %tmp7, <16 x float>* %tmp8, align 4
1829 %tmp9 = add i64 %tmp, 16
1830 %tmp10 = icmp eq i64 %tmp9, 1024
1831 br i1 %tmp10, label %bb11, label %bb2
1833 bb11: ; preds = %bb2
; FMA-231 pattern on <16 x float>: x + (x * splat(2.0)) with 'contract'.
; CHECK lines verify a single loop-invariant vbroadcastss into zmm0 feeding
; vfmadd231ps, rather than re-broadcasting inside the loop.
1837 define void @bcast_unfold_fma231_v16f32(float* %arg) {
1838 ; CHECK-LABEL: bcast_unfold_fma231_v16f32:
1839 ; CHECK: # %bb.0: # %bb
1840 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
1841 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1842 ; CHECK-NEXT: .p2align 4, 0x90
1843 ; CHECK-NEXT: .LBB53_1: # %bb1
1844 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1845 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
1846 ; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
1847 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
1848 ; CHECK-NEXT: addq $64, %rax
1849 ; CHECK-NEXT: jne .LBB53_1
1850 ; CHECK-NEXT: # %bb.2: # %bb10
1851 ; CHECK-NEXT: vzeroupper
1856 bb1: ; preds = %bb1, %bb
1857 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
1858 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1859 %tmp3 = bitcast float* %tmp2 to <16 x float>*
1860 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
1861 %tmp5 = fmul contract <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1862 %tmp6 = fadd contract <16 x float> %tmp4, %tmp5
1863 %tmp7 = bitcast float* %tmp2 to <16 x float>*
1864 store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
1865 %tmp8 = add i64 %tmp, 16
1866 %tmp9 = icmp eq i64 %tmp8, 1024
1867 br i1 %tmp9, label %bb10, label %bb1
1869 bb10: ; preds = %bb1
; FMA-213 pattern on <2 x double>: (x * x) + splat(2.0) with 'contract'.
; For a 2-element f64 splat the expected materialization is vmovddup (xmm),
; hoisted out of the loop and used as the addend of vfmadd213pd.
; NOTE(review): the load below is align 4 while the matching store is align 8
; — looks like an inconsistency in the test input; confirm it is intentional.
1873 define void @bcast_unfold_fma213_v2f64(double* %arg) {
1874 ; CHECK-LABEL: bcast_unfold_fma213_v2f64:
1875 ; CHECK: # %bb.0: # %bb
1876 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1877 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
1878 ; CHECK-NEXT: # xmm0 = mem[0,0]
1879 ; CHECK-NEXT: .p2align 4, 0x90
1880 ; CHECK-NEXT: .LBB54_1: # %bb2
1881 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1882 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
1883 ; CHECK-NEXT: vfmadd213pd {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
1884 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1885 ; CHECK-NEXT: addq $16, %rax
1886 ; CHECK-NEXT: jne .LBB54_1
1887 ; CHECK-NEXT: # %bb.2: # %bb11
1892 bb2: ; preds = %bb2, %bb
1893 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
1894 %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
1895 %tmp4 = bitcast double* %tmp3 to <2 x double>*
1896 %tmp5 = load <2 x double>, <2 x double>* %tmp4, align 4
1897 %tmp6 = fmul contract <2 x double> %tmp5, %tmp5
1898 %tmp7 = fadd contract <2 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00>
1899 %tmp8 = bitcast double* %tmp3 to <2 x double>*
1900 store <2 x double> %tmp7, <2 x double>* %tmp8, align 8
1901 %tmp9 = add i64 %tmp, 2
1902 %tmp10 = icmp eq i64 %tmp9, 1024
1903 br i1 %tmp10, label %bb11, label %bb2
1905 bb11: ; preds = %bb2
; FMA-231 pattern on <2 x double>: x + (x * splat(2.0)) with 'contract'.
; CHECK lines verify the 2.0 pair is built once with vmovddup into xmm0 and
; used as the multiplier register of vfmadd231pd inside the loop.
1909 define void @bcast_unfold_fma231_v2f64(double* %arg) {
1910 ; CHECK-LABEL: bcast_unfold_fma231_v2f64:
1911 ; CHECK: # %bb.0: # %bb
1912 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1913 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
1914 ; CHECK-NEXT: # xmm0 = mem[0,0]
1915 ; CHECK-NEXT: .p2align 4, 0x90
1916 ; CHECK-NEXT: .LBB55_1: # %bb1
1917 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1918 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
1919 ; CHECK-NEXT: vfmadd231pd {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
1920 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
1921 ; CHECK-NEXT: addq $16, %rax
1922 ; CHECK-NEXT: jne .LBB55_1
1923 ; CHECK-NEXT: # %bb.2: # %bb10
1928 bb1: ; preds = %bb1, %bb
1929 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
1930 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1931 %tmp3 = bitcast double* %tmp2 to <2 x double>*
1932 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1933 %tmp5 = fmul contract <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
1934 %tmp6 = fadd contract <2 x double> %tmp4, %tmp5
1935 %tmp7 = bitcast double* %tmp2 to <2 x double>*
1936 store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
1937 %tmp8 = add i64 %tmp, 2
1938 %tmp9 = icmp eq i64 %tmp8, 1024
1939 br i1 %tmp9, label %bb10, label %bb1
1941 bb10: ; preds = %bb1
; FMA-213 pattern on <4 x double>: (x * x) + splat(2.0) with 'contract'.
; CHECK lines verify one loop-invariant vbroadcastsd into ymm0 feeding the
; addend operand of vfmadd213pd.
1945 define void @bcast_unfold_fma213_v4f64(double* %arg) {
1946 ; CHECK-LABEL: bcast_unfold_fma213_v4f64:
1947 ; CHECK: # %bb.0: # %bb
1948 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1949 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1950 ; CHECK-NEXT: .p2align 4, 0x90
1951 ; CHECK-NEXT: .LBB56_1: # %bb2
1952 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1953 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
1954 ; CHECK-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
1955 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1956 ; CHECK-NEXT: addq $32, %rax
1957 ; CHECK-NEXT: jne .LBB56_1
1958 ; CHECK-NEXT: # %bb.2: # %bb11
1959 ; CHECK-NEXT: vzeroupper
1964 bb2: ; preds = %bb2, %bb
1965 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
1966 %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
1967 %tmp4 = bitcast double* %tmp3 to <4 x double>*
1968 %tmp5 = load <4 x double>, <4 x double>* %tmp4, align 8
1969 %tmp6 = fmul contract <4 x double> %tmp5, %tmp5
1970 %tmp7 = fadd contract <4 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
1971 %tmp8 = bitcast double* %tmp3 to <4 x double>*
1972 store <4 x double> %tmp7, <4 x double>* %tmp8, align 8
1973 %tmp9 = add i64 %tmp, 4
1974 %tmp10 = icmp eq i64 %tmp9, 1024
1975 br i1 %tmp10, label %bb11, label %bb2
1977 bb11: ; preds = %bb2
; FMA-231 pattern on <4 x double>: x + (x * splat(2.0)) with 'contract'.
; CHECK lines verify a single vbroadcastsd into ymm0 outside the loop, used
; as the multiplier register of vfmadd231pd.
1981 define void @bcast_unfold_fma231_v4f64(double* %arg) {
1982 ; CHECK-LABEL: bcast_unfold_fma231_v4f64:
1983 ; CHECK: # %bb.0: # %bb
1984 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
1985 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
1986 ; CHECK-NEXT: .p2align 4, 0x90
1987 ; CHECK-NEXT: .LBB57_1: # %bb1
1988 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1989 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
1990 ; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
1991 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
1992 ; CHECK-NEXT: addq $32, %rax
1993 ; CHECK-NEXT: jne .LBB57_1
1994 ; CHECK-NEXT: # %bb.2: # %bb10
1995 ; CHECK-NEXT: vzeroupper
2000 bb1: ; preds = %bb1, %bb
2001 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2002 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2003 %tmp3 = bitcast double* %tmp2 to <4 x double>*
2004 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
2005 %tmp5 = fmul contract <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2006 %tmp6 = fadd contract <4 x double> %tmp4, %tmp5
2007 %tmp7 = bitcast double* %tmp2 to <4 x double>*
2008 store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
2009 %tmp8 = add i64 %tmp, 4
2010 %tmp9 = icmp eq i64 %tmp8, 1024
2011 br i1 %tmp9, label %bb10, label %bb1
2013 bb10: ; preds = %bb1
; FMA-213 pattern on <8 x double>: (x * x) + splat(2.0) with 'contract'.
; CHECK lines verify one vbroadcastsd into zmm0 hoisted out of the loop and
; consumed as the addend of vfmadd213pd.
2017 define void @bcast_unfold_fma213_v8f64(double* %arg) {
2018 ; CHECK-LABEL: bcast_unfold_fma213_v8f64:
2019 ; CHECK: # %bb.0: # %bb
2020 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2021 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2022 ; CHECK-NEXT: .p2align 4, 0x90
2023 ; CHECK-NEXT: .LBB58_1: # %bb2
2024 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2025 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
2026 ; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
2027 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
2028 ; CHECK-NEXT: addq $64, %rax
2029 ; CHECK-NEXT: jne .LBB58_1
2030 ; CHECK-NEXT: # %bb.2: # %bb11
2031 ; CHECK-NEXT: vzeroupper
2036 bb2: ; preds = %bb2, %bb
2037 %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
2038 %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
2039 %tmp4 = bitcast double* %tmp3 to <8 x double>*
2040 %tmp5 = load <8 x double>, <8 x double>* %tmp4, align 8
2041 %tmp6 = fmul contract <8 x double> %tmp5, %tmp5
2042 %tmp7 = fadd contract <8 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2043 %tmp8 = bitcast double* %tmp3 to <8 x double>*
2044 store <8 x double> %tmp7, <8 x double>* %tmp8, align 8
2045 %tmp9 = add i64 %tmp, 8
2046 %tmp10 = icmp eq i64 %tmp9, 1024
2047 br i1 %tmp10, label %bb11, label %bb2
2049 bb11: ; preds = %bb2
; FMA-231 pattern on <8 x double>: x + (x * splat(2.0)) with 'contract'.
; CHECK lines verify the splat lives in zmm0 across iterations (single
; vbroadcastsd) and feeds the multiplier operand of vfmadd231pd.
2053 define void @bcast_unfold_fma231_v8f64(double* %arg) {
2054 ; CHECK-LABEL: bcast_unfold_fma231_v8f64:
2055 ; CHECK: # %bb.0: # %bb
2056 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2057 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2058 ; CHECK-NEXT: .p2align 4, 0x90
2059 ; CHECK-NEXT: .LBB59_1: # %bb1
2060 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2061 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
2062 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
2063 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
2064 ; CHECK-NEXT: addq $64, %rax
2065 ; CHECK-NEXT: jne .LBB59_1
2066 ; CHECK-NEXT: # %bb.2: # %bb10
2067 ; CHECK-NEXT: vzeroupper
2072 bb1: ; preds = %bb1, %bb
2073 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2074 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2075 %tmp3 = bitcast double* %tmp2 to <8 x double>*
2076 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
2077 %tmp5 = fmul contract <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2078 %tmp6 = fadd contract <8 x double> %tmp4, %tmp5
2079 %tmp7 = bitcast double* %tmp2 to <8 x double>*
2080 store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
2081 %tmp8 = add i64 %tmp, 8
2082 %tmp9 = icmp eq i64 %tmp8, 1024
2083 br i1 %tmp9, label %bb10, label %bb1
2085 bb10: ; preds = %bb1
; fmax idiom on <4 x float>: fcmp ogt + select against a 2.0 splat. CHECK
; lines verify lowering to vmaxps with the splat held in xmm0 (one
; vbroadcastss before the loop), not re-materialized per iteration.
2089 define void @bcast_unfold_fmax_v4f32(float* %arg) {
2090 ; CHECK-LABEL: bcast_unfold_fmax_v4f32:
2091 ; CHECK: # %bb.0: # %bb
2092 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2093 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2094 ; CHECK-NEXT: .p2align 4, 0x90
2095 ; CHECK-NEXT: .LBB60_1: # %bb1
2096 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2097 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
2098 ; CHECK-NEXT: vmaxps %xmm0, %xmm1, %xmm1
2099 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
2100 ; CHECK-NEXT: addq $16, %rax
2101 ; CHECK-NEXT: jne .LBB60_1
2102 ; CHECK-NEXT: # %bb.2: # %bb10
2107 bb1: ; preds = %bb1, %bb
2108 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2109 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2110 %tmp3 = bitcast float* %tmp2 to <4 x float>*
2111 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
2112 %tmp5 = fcmp ogt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2113 %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2114 %tmp7 = bitcast float* %tmp2 to <4 x float>*
2115 store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
2116 %tmp8 = add i64 %tmp, 4
2117 %tmp9 = icmp eq i64 %tmp8, 1024
2118 br i1 %tmp9, label %bb10, label %bb1
2120 bb10: ; preds = %bb1
; fmax idiom on <8 x float>: fcmp ogt + select vs a 2.0 splat, expected to
; lower to vmaxps with the splat hoisted into ymm0 via one vbroadcastss.
2124 define void @bcast_unfold_fmax_v8f32(float* %arg) {
2125 ; CHECK-LABEL: bcast_unfold_fmax_v8f32:
2126 ; CHECK: # %bb.0: # %bb
2127 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2128 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2129 ; CHECK-NEXT: .p2align 4, 0x90
2130 ; CHECK-NEXT: .LBB61_1: # %bb1
2131 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2132 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
2133 ; CHECK-NEXT: vmaxps %ymm0, %ymm1, %ymm1
2134 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
2135 ; CHECK-NEXT: addq $32, %rax
2136 ; CHECK-NEXT: jne .LBB61_1
2137 ; CHECK-NEXT: # %bb.2: # %bb10
2138 ; CHECK-NEXT: vzeroupper
2143 bb1: ; preds = %bb1, %bb
2144 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2145 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2146 %tmp3 = bitcast float* %tmp2 to <8 x float>*
2147 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
2148 %tmp5 = fcmp ogt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2149 %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2150 %tmp7 = bitcast float* %tmp2 to <8 x float>*
2151 store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
2152 %tmp8 = add i64 %tmp, 8
2153 %tmp9 = icmp eq i64 %tmp8, 1024
2154 br i1 %tmp9, label %bb10, label %bb1
2156 bb10: ; preds = %bb1
; fmax idiom on <16 x float>: fcmp ogt + select vs a 2.0 splat, expected to
; lower to vmaxps with the splat hoisted into zmm0 via one vbroadcastss.
2160 define void @bcast_unfold_fmax_v16f32(float* %arg) {
2161 ; CHECK-LABEL: bcast_unfold_fmax_v16f32:
2162 ; CHECK: # %bb.0: # %bb
2163 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2164 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2165 ; CHECK-NEXT: .p2align 4, 0x90
2166 ; CHECK-NEXT: .LBB62_1: # %bb1
2167 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2168 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
2169 ; CHECK-NEXT: vmaxps %zmm0, %zmm1, %zmm1
2170 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
2171 ; CHECK-NEXT: addq $64, %rax
2172 ; CHECK-NEXT: jne .LBB62_1
2173 ; CHECK-NEXT: # %bb.2: # %bb10
2174 ; CHECK-NEXT: vzeroupper
2179 bb1: ; preds = %bb1, %bb
2180 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2181 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2182 %tmp3 = bitcast float* %tmp2 to <16 x float>*
2183 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
2184 %tmp5 = fcmp ogt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2185 %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2186 %tmp7 = bitcast float* %tmp2 to <16 x float>*
2187 store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
2188 %tmp8 = add i64 %tmp, 16
2189 %tmp9 = icmp eq i64 %tmp8, 1024
2190 br i1 %tmp9, label %bb10, label %bb1
2192 bb10: ; preds = %bb1
; fmax idiom on <2 x double>: fcmp ogt + select vs a 2.0 splat. The
; 2-element f64 splat is expected via vmovddup (xmm0), then vmaxpd in-loop.
2196 define void @bcast_unfold_fmax_v2f64(double* %arg) {
2197 ; CHECK-LABEL: bcast_unfold_fmax_v2f64:
2198 ; CHECK: # %bb.0: # %bb
2199 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2200 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
2201 ; CHECK-NEXT: # xmm0 = mem[0,0]
2202 ; CHECK-NEXT: .p2align 4, 0x90
2203 ; CHECK-NEXT: .LBB63_1: # %bb1
2204 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2205 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
2206 ; CHECK-NEXT: vmaxpd %xmm0, %xmm1, %xmm1
2207 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
2208 ; CHECK-NEXT: addq $16, %rax
2209 ; CHECK-NEXT: jne .LBB63_1
2210 ; CHECK-NEXT: # %bb.2: # %bb10
2215 bb1: ; preds = %bb1, %bb
2216 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2217 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2218 %tmp3 = bitcast double* %tmp2 to <2 x double>*
2219 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
2220 %tmp5 = fcmp ogt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
2221 %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
2222 %tmp7 = bitcast double* %tmp2 to <2 x double>*
2223 store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
2224 %tmp8 = add i64 %tmp, 2
2225 %tmp9 = icmp eq i64 %tmp8, 1024
2226 br i1 %tmp9, label %bb10, label %bb1
2228 bb10: ; preds = %bb1
; fmax idiom on <4 x double>: fcmp ogt + select vs a 2.0 splat, expected to
; lower to vmaxpd with the splat hoisted into ymm0 via one vbroadcastsd.
2232 define void @bcast_unfold_fmax_v4f64(double* %arg) {
2233 ; CHECK-LABEL: bcast_unfold_fmax_v4f64:
2234 ; CHECK: # %bb.0: # %bb
2235 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2236 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2237 ; CHECK-NEXT: .p2align 4, 0x90
2238 ; CHECK-NEXT: .LBB64_1: # %bb1
2239 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2240 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
2241 ; CHECK-NEXT: vmaxpd %ymm0, %ymm1, %ymm1
2242 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
2243 ; CHECK-NEXT: addq $32, %rax
2244 ; CHECK-NEXT: jne .LBB64_1
2245 ; CHECK-NEXT: # %bb.2: # %bb10
2246 ; CHECK-NEXT: vzeroupper
2251 bb1: ; preds = %bb1, %bb
2252 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2253 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2254 %tmp3 = bitcast double* %tmp2 to <4 x double>*
2255 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
2256 %tmp5 = fcmp ogt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2257 %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2258 %tmp7 = bitcast double* %tmp2 to <4 x double>*
2259 store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
2260 %tmp8 = add i64 %tmp, 4
2261 %tmp9 = icmp eq i64 %tmp8, 1024
2262 br i1 %tmp9, label %bb10, label %bb1
2264 bb10: ; preds = %bb1
; fmax idiom on <8 x double>: fcmp ogt + select vs a 2.0 splat, expected to
; lower to vmaxpd with the splat hoisted into zmm0 via one vbroadcastsd.
2268 define void @bcast_unfold_fmax_v8f64(double* %arg) {
2269 ; CHECK-LABEL: bcast_unfold_fmax_v8f64:
2270 ; CHECK: # %bb.0: # %bb
2271 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2272 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2273 ; CHECK-NEXT: .p2align 4, 0x90
2274 ; CHECK-NEXT: .LBB65_1: # %bb1
2275 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2276 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
2277 ; CHECK-NEXT: vmaxpd %zmm0, %zmm1, %zmm1
2278 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
2279 ; CHECK-NEXT: addq $64, %rax
2280 ; CHECK-NEXT: jne .LBB65_1
2281 ; CHECK-NEXT: # %bb.2: # %bb10
2282 ; CHECK-NEXT: vzeroupper
2287 bb1: ; preds = %bb1, %bb
2288 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2289 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2290 %tmp3 = bitcast double* %tmp2 to <8 x double>*
2291 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
2292 %tmp5 = fcmp ogt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2293 %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2294 %tmp7 = bitcast double* %tmp2 to <8 x double>*
2295 store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
2296 %tmp8 = add i64 %tmp, 8
2297 %tmp9 = icmp eq i64 %tmp8, 1024
2298 br i1 %tmp9, label %bb10, label %bb1
2300 bb10: ; preds = %bb1
; fmin idiom on <4 x float>: fcmp olt + select vs a 2.0 splat, expected to
; lower to vminps with the splat hoisted into xmm0 via one vbroadcastss.
2304 define void @bcast_unfold_fmin_v4f32(float* %arg) {
2305 ; CHECK-LABEL: bcast_unfold_fmin_v4f32:
2306 ; CHECK: # %bb.0: # %bb
2307 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2308 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2309 ; CHECK-NEXT: .p2align 4, 0x90
2310 ; CHECK-NEXT: .LBB66_1: # %bb1
2311 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2312 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1
2313 ; CHECK-NEXT: vminps %xmm0, %xmm1, %xmm1
2314 ; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax)
2315 ; CHECK-NEXT: addq $16, %rax
2316 ; CHECK-NEXT: jne .LBB66_1
2317 ; CHECK-NEXT: # %bb.2: # %bb10
2322 bb1: ; preds = %bb1, %bb
2323 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2324 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2325 %tmp3 = bitcast float* %tmp2 to <4 x float>*
2326 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
2327 %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2328 %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2329 %tmp7 = bitcast float* %tmp2 to <4 x float>*
2330 store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
2331 %tmp8 = add i64 %tmp, 4
2332 %tmp9 = icmp eq i64 %tmp8, 1024
2333 br i1 %tmp9, label %bb10, label %bb1
2335 bb10: ; preds = %bb1
; fmin idiom on <8 x float>: fcmp olt + select vs a 2.0 splat, expected to
; lower to vminps with the splat hoisted into ymm0 via one vbroadcastss.
2339 define void @bcast_unfold_fmin_v8f32(float* %arg) {
2340 ; CHECK-LABEL: bcast_unfold_fmin_v8f32:
2341 ; CHECK: # %bb.0: # %bb
2342 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2343 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2344 ; CHECK-NEXT: .p2align 4, 0x90
2345 ; CHECK-NEXT: .LBB67_1: # %bb1
2346 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2347 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1
2348 ; CHECK-NEXT: vminps %ymm0, %ymm1, %ymm1
2349 ; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax)
2350 ; CHECK-NEXT: addq $32, %rax
2351 ; CHECK-NEXT: jne .LBB67_1
2352 ; CHECK-NEXT: # %bb.2: # %bb10
2353 ; CHECK-NEXT: vzeroupper
2358 bb1: ; preds = %bb1, %bb
2359 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2360 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2361 %tmp3 = bitcast float* %tmp2 to <8 x float>*
2362 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
2363 %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2364 %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2365 %tmp7 = bitcast float* %tmp2 to <8 x float>*
2366 store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
2367 %tmp8 = add i64 %tmp, 8
2368 %tmp9 = icmp eq i64 %tmp8, 1024
2369 br i1 %tmp9, label %bb10, label %bb1
2371 bb10: ; preds = %bb1
; fmin idiom on <16 x float>: fcmp olt + select vs a 2.0 splat, expected to
; lower to vminps with the splat hoisted into zmm0 via one vbroadcastss.
2375 define void @bcast_unfold_fmin_v16f32(float* %arg) {
2376 ; CHECK-LABEL: bcast_unfold_fmin_v16f32:
2377 ; CHECK: # %bb.0: # %bb
2378 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2379 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2380 ; CHECK-NEXT: .p2align 4, 0x90
2381 ; CHECK-NEXT: .LBB68_1: # %bb1
2382 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2383 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1
2384 ; CHECK-NEXT: vminps %zmm0, %zmm1, %zmm1
2385 ; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax)
2386 ; CHECK-NEXT: addq $64, %rax
2387 ; CHECK-NEXT: jne .LBB68_1
2388 ; CHECK-NEXT: # %bb.2: # %bb10
2389 ; CHECK-NEXT: vzeroupper
2394 bb1: ; preds = %bb1, %bb
2395 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2396 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
2397 %tmp3 = bitcast float* %tmp2 to <16 x float>*
2398 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
2399 %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2400 %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
2401 %tmp7 = bitcast float* %tmp2 to <16 x float>*
2402 store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
2403 %tmp8 = add i64 %tmp, 16
2404 %tmp9 = icmp eq i64 %tmp8, 1024
2405 br i1 %tmp9, label %bb10, label %bb1
2407 bb10: ; preds = %bb1
; fmin idiom on <2 x double>: fcmp olt + select vs a 2.0 splat. The
; 2-element f64 splat is expected via vmovddup (xmm0), then vminpd in-loop.
2411 define void @bcast_unfold_fmin_v2f64(double* %arg) {
2412 ; CHECK-LABEL: bcast_unfold_fmin_v2f64:
2413 ; CHECK: # %bb.0: # %bb
2414 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2415 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
2416 ; CHECK-NEXT: # xmm0 = mem[0,0]
2417 ; CHECK-NEXT: .p2align 4, 0x90
2418 ; CHECK-NEXT: .LBB69_1: # %bb1
2419 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2420 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1
2421 ; CHECK-NEXT: vminpd %xmm0, %xmm1, %xmm1
2422 ; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax)
2423 ; CHECK-NEXT: addq $16, %rax
2424 ; CHECK-NEXT: jne .LBB69_1
2425 ; CHECK-NEXT: # %bb.2: # %bb10
2430 bb1: ; preds = %bb1, %bb
2431 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2432 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2433 %tmp3 = bitcast double* %tmp2 to <2 x double>*
2434 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
2435 %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
2436 %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
2437 %tmp7 = bitcast double* %tmp2 to <2 x double>*
2438 store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
2439 %tmp8 = add i64 %tmp, 2
2440 %tmp9 = icmp eq i64 %tmp8, 1024
2441 br i1 %tmp9, label %bb10, label %bb1
2443 bb10: ; preds = %bb1
; fmin idiom on <4 x double>: fcmp olt + select vs a 2.0 splat, expected to
; lower to vminpd with the splat hoisted into ymm0 via one vbroadcastsd.
2447 define void @bcast_unfold_fmin_v4f64(double* %arg) {
2448 ; CHECK-LABEL: bcast_unfold_fmin_v4f64:
2449 ; CHECK: # %bb.0: # %bb
2450 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2451 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2452 ; CHECK-NEXT: .p2align 4, 0x90
2453 ; CHECK-NEXT: .LBB70_1: # %bb1
2454 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2455 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1
2456 ; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm1
2457 ; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax)
2458 ; CHECK-NEXT: addq $32, %rax
2459 ; CHECK-NEXT: jne .LBB70_1
2460 ; CHECK-NEXT: # %bb.2: # %bb10
2461 ; CHECK-NEXT: vzeroupper
2466 bb1: ; preds = %bb1, %bb
2467 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2468 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2469 %tmp3 = bitcast double* %tmp2 to <4 x double>*
2470 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
2471 %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2472 %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2473 %tmp7 = bitcast double* %tmp2 to <4 x double>*
2474 store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
2475 %tmp8 = add i64 %tmp, 4
2476 %tmp9 = icmp eq i64 %tmp8, 1024
2477 br i1 %tmp9, label %bb10, label %bb1
2479 bb10: ; preds = %bb1
; fmin idiom on <8 x double>: fcmp olt + select vs a 2.0 splat, expected to
; lower to vminpd with the splat hoisted into zmm0 via one vbroadcastsd.
2483 define void @bcast_unfold_fmin_v8f64(double* %arg) {
2484 ; CHECK-LABEL: bcast_unfold_fmin_v8f64:
2485 ; CHECK: # %bb.0: # %bb
2486 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
2487 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
2488 ; CHECK-NEXT: .p2align 4, 0x90
2489 ; CHECK-NEXT: .LBB71_1: # %bb1
2490 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2491 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1
2492 ; CHECK-NEXT: vminpd %zmm0, %zmm1, %zmm1
2493 ; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax)
2494 ; CHECK-NEXT: addq $64, %rax
2495 ; CHECK-NEXT: jne .LBB71_1
2496 ; CHECK-NEXT: # %bb.2: # %bb10
2497 ; CHECK-NEXT: vzeroupper
2502 bb1: ; preds = %bb1, %bb
2503 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2504 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
2505 %tmp3 = bitcast double* %tmp2 to <8 x double>*
2506 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
2507 %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2508 %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
2509 %tmp7 = bitcast double* %tmp2 to <8 x double>*
2510 store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
2511 %tmp8 = add i64 %tmp, 8
2512 %tmp9 = icmp eq i64 %tmp8, 1024
2513 br i1 %tmp9, label %bb10, label %bb1
2515 bb10: ; preds = %bb1
; Signed-min idiom on <4 x i32>: icmp slt + select vs a splat of 2. CHECK
; lines verify vpminsd with the constant kept in xmm0 (one vpbroadcastd
; before the loop) while the data load is folded into vpminsd's mem operand.
2519 define void @bcast_unfold_smin_v4i32(i32* %arg) {
2520 ; CHECK-LABEL: bcast_unfold_smin_v4i32:
2521 ; CHECK: # %bb.0: # %bb
2522 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
2523 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
2524 ; CHECK-NEXT: .p2align 4, 0x90
2525 ; CHECK-NEXT: .LBB72_1: # %bb1
2526 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
2527 ; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %xmm0, %xmm1
2528 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
2529 ; CHECK-NEXT: addq $16, %rax
2530 ; CHECK-NEXT: jne .LBB72_1
2531 ; CHECK-NEXT: # %bb.2: # %bb10
2536 bb1: ; preds = %bb1, %bb
2537 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2538 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2539 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
2540 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
2541 %tmp5 = icmp slt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
2542 %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
2543 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
2544 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
2545 %tmp8 = add i64 %tmp, 4
2546 %tmp9 = icmp eq i64 %tmp8, 1024
2547 br i1 %tmp9, label %bb10, label %bb1
2549 bb10: ; preds = %bb1
define void @bcast_unfold_smin_v8i32(i32* %arg) {
; Same smin pattern as v4i32 but 256-bit: splat in ymm0, vpminsd per iteration,
; and vzeroupper before returning (ymm state was used).
; CHECK-LABEL: bcast_unfold_smin_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB73_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB73_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <8 x i32>*
%tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
%tmp5 = icmp slt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp2 to <8 x i32>*
store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_smin_v16i32(i32* %arg) {
; 512-bit smin: splat of 2 hoisted into zmm0, vpminsd + vmovdqu64 in the loop.
; CHECK-LABEL: bcast_unfold_smin_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB74_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB74_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <16 x i32>*
%tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
%tmp5 = icmp slt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp2 to <16 x i32>*
store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 16
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_smin_v2i64(i64* %arg) {
; 64-bit-element smin: vpminsq is AVX-512VL-only, so this checks the VL
; encoding on xmm with a vpbroadcastq of the 2-splat.
; CHECK-LABEL: bcast_unfold_smin_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB75_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB75_1
; CHECK-NEXT: # %bb.2: # %bb10
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <2 x i64>*
%tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
%tmp5 = icmp slt <2 x i64> %tmp4, <i64 2, i64 2>
%tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <2 x i64>*
store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 2
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_smin_v4i64(i64* %arg) {
; 256-bit vpminsq variant of the smin pattern; vzeroupper expected on exit.
; CHECK-LABEL: bcast_unfold_smin_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB76_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB76_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <4 x i64>*
%tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
%tmp5 = icmp slt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
%tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <4 x i64>*
store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_smin_v8i64(i64* %arg) {
; 512-bit vpminsq variant: splat in zmm0, vmovdqu64 store of the result.
; CHECK-LABEL: bcast_unfold_smin_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB77_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB77_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <8 x i64>*
%tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
%tmp5 = icmp slt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
%tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <8 x i64>*
store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_smax_v4i32(i32* %arg) {
; Signed-max idiom (icmp sgt + select, splat of 2) -> vpmaxsd with the
; broadcast kept live in xmm0 across the loop.
; CHECK-LABEL: bcast_unfold_smax_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB78_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB78_1
; CHECK-NEXT: # %bb.2: # %bb10
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <4 x i32>*
%tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
%tmp5 = icmp sgt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
%tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp2 to <4 x i32>*
store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_smax_v8i32(i32* %arg) {
; 256-bit smax: vpmaxsd on ymm, vzeroupper on exit.
; CHECK-LABEL: bcast_unfold_smax_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB79_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB79_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <8 x i32>*
%tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
%tmp5 = icmp sgt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp2 to <8 x i32>*
store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_smax_v16i32(i32* %arg) {
; 512-bit smax: vpmaxsd on zmm with vmovdqu64 store.
; CHECK-LABEL: bcast_unfold_smax_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB80_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB80_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <16 x i32>*
%tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
%tmp5 = icmp sgt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp2 to <16 x i32>*
store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 16
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_smax_v2i64(i64* %arg) {
; 64-bit-element smax: vpmaxsq (AVX-512VL) on xmm with a vpbroadcastq splat.
; CHECK-LABEL: bcast_unfold_smax_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB81_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB81_1
; CHECK-NEXT: # %bb.2: # %bb10
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <2 x i64>*
%tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
%tmp5 = icmp sgt <2 x i64> %tmp4, <i64 2, i64 2>
%tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <2 x i64>*
store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 2
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_smax_v4i64(i64* %arg) {
; 256-bit vpmaxsq variant; vzeroupper expected on exit.
; CHECK-LABEL: bcast_unfold_smax_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB82_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB82_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <4 x i64>*
%tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
%tmp5 = icmp sgt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
%tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <4 x i64>*
store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_smax_v8i64(i64* %arg) {
; 512-bit vpmaxsq variant: splat in zmm0, vmovdqu64 store.
; CHECK-LABEL: bcast_unfold_smax_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB83_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB83_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <8 x i64>*
%tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
%tmp5 = icmp sgt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
%tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <8 x i64>*
store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umin_v4i32(i32* %arg) {
; Unsigned-min idiom (icmp ult + select, splat of 2) -> vpminud on xmm.
; CHECK-LABEL: bcast_unfold_umin_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB84_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminud 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB84_1
; CHECK-NEXT: # %bb.2: # %bb10
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <4 x i32>*
%tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
%tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
%tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp2 to <4 x i32>*
store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umin_v8i32(i32* %arg) {
; 256-bit umin: vpminud on ymm, vzeroupper on exit.
; CHECK-LABEL: bcast_unfold_umin_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB85_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminud 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB85_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <8 x i32>*
%tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
%tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp2 to <8 x i32>*
store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umin_v16i32(i32* %arg) {
; 512-bit umin: vpminud on zmm with vmovdqu64 store.
; CHECK-LABEL: bcast_unfold_umin_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB86_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminud 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB86_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <16 x i32>*
%tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
%tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp2 to <16 x i32>*
store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 16
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umin_v2i64(i64* %arg) {
; 64-bit-element umin: vpminuq (AVX-512VL) on xmm with a vpbroadcastq splat.
; CHECK-LABEL: bcast_unfold_umin_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB87_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB87_1
; CHECK-NEXT: # %bb.2: # %bb10
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <2 x i64>*
%tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
%tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
%tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <2 x i64>*
store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 2
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umin_v4i64(i64* %arg) {
; 256-bit vpminuq variant; vzeroupper expected on exit.
; CHECK-LABEL: bcast_unfold_umin_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB88_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB88_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <4 x i64>*
%tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
%tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
%tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <4 x i64>*
store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umin_v8i64(i64* %arg) {
; 512-bit vpminuq variant: splat in zmm0, vmovdqu64 store.
; CHECK-LABEL: bcast_unfold_umin_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB89_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB89_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <8 x i64>*
%tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
%tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
%tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <8 x i64>*
store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umax_v4i32(i32* %arg) {
; Unsigned-max idiom (icmp ugt + select, splat of 2) -> vpmaxud on xmm.
; CHECK-LABEL: bcast_unfold_umax_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB90_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB90_1
; CHECK-NEXT: # %bb.2: # %bb10
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <4 x i32>*
%tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
%tmp5 = icmp ugt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
%tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp2 to <4 x i32>*
store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umax_v8i32(i32* %arg) {
; 256-bit umax: vpmaxud on ymm, vzeroupper on exit.
; CHECK-LABEL: bcast_unfold_umax_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB91_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB91_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <8 x i32>*
%tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
%tmp5 = icmp ugt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp2 to <8 x i32>*
store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umax_v16i32(i32* %arg) {
; 512-bit umax: vpmaxud on zmm with vmovdqu64 store.
; CHECK-LABEL: bcast_unfold_umax_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB92_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB92_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <16 x i32>*
%tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
%tmp5 = icmp ugt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
%tmp7 = bitcast i32* %tmp2 to <16 x i32>*
store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 16
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umax_v2i64(i64* %arg) {
; 64-bit-element umax: vpmaxuq (AVX-512VL) on xmm with a vpbroadcastq splat.
; CHECK-LABEL: bcast_unfold_umax_v2i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB93_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB93_1
; CHECK-NEXT: # %bb.2: # %bb10
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <2 x i64>*
%tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
%tmp5 = icmp ugt <2 x i64> %tmp4, <i64 2, i64 2>
%tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <2 x i64>*
store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 2
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umax_v4i64(i64* %arg) {
; 256-bit vpmaxuq variant; vzeroupper expected on exit.
; CHECK-LABEL: bcast_unfold_umax_v4i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB94_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB94_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <4 x i64>*
%tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
%tmp5 = icmp ugt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
%tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <4 x i64>*
store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_umax_v8i64(i64* %arg) {
; 512-bit vpmaxuq variant: splat in zmm0, vmovdqu64 store.
; CHECK-LABEL: bcast_unfold_umax_v8i64:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB95_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB95_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
%tmp3 = bitcast i64* %tmp2 to <8 x i64>*
%tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
%tmp5 = icmp ugt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
%tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
%tmp7 = bitcast i64* %tmp2 to <8 x i64>*
store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_pcmpgt_v4i32(i32* %arg) {
; Compare/select with DIFFERENT constants (cmp vs splat 1, select splat 3):
; this cannot fold into a single min/max, so CHECK expects vpcmpgtd into a
; mask register and a masked vpbroadcastd of the 3-splat from the constant
; pool (the unfold under test).
; CHECK-LABEL: bcast_unfold_pcmpgt_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB96_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB96_1
; CHECK-NEXT: # %bb.2: # %bb10
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <4 x i32>*
%tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
%tmp5 = icmp sgt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
%tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
%tmp7 = bitcast i32* %tmp2 to <4 x i32>*
store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 4
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
define void @bcast_unfold_pcmpgt_v8i32(i32* %arg) {
; 256-bit pcmpgt+masked-broadcast variant (cmp vs 1, select 3); vzeroupper
; expected on exit since ymm state was used.
; CHECK-LABEL: bcast_unfold_pcmpgt_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB97_1: # %bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1
; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB97_1
; CHECK-NEXT: # %bb.2: # %bb10
; CHECK-NEXT: vzeroupper
bb1: ; preds = %bb1, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp3 = bitcast i32* %tmp2 to <8 x i32>*
%tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
%tmp5 = icmp sgt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
%tmp7 = bitcast i32* %tmp2 to <8 x i32>*
store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
%tmp8 = add i64 %tmp, 8
%tmp9 = icmp eq i64 %tmp8, 1024
br i1 %tmp9, label %bb10, label %bb1
bb10: ; preds = %bb1
; Same sgt/select-of-3 pattern at zmm width (16 x i32); expects a masked
; constant-pool vpbroadcastd {%k1} inside the loop.
3424 define void @bcast_unfold_pcmpgt_v16i32(i32* %arg) {
3425 ; CHECK-LABEL: bcast_unfold_pcmpgt_v16i32:
3426 ; CHECK: # %bb.0: # %bb
3427 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3428 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3429 ; CHECK-NEXT: .p2align 4, 0x90
3430 ; CHECK-NEXT: .LBB98_1: # %bb1
3431 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3432 ; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1
3433 ; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
3434 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
3435 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
3436 ; CHECK-NEXT: addq $64, %rax
3437 ; CHECK-NEXT: jne .LBB98_1
3438 ; CHECK-NEXT: # %bb.2: # %bb10
3439 ; CHECK-NEXT: vzeroupper
3444 bb1: ; preds = %bb1, %bb
3445 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3446 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3447 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3448 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3449 %tmp5 = icmp sgt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3450 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
3451 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3452 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3453 %tmp8 = add i64 %tmp, 16
3454 %tmp9 = icmp eq i64 %tmp8, 1024
3455 br i1 %tmp9, label %bb10, label %bb1
3457 bb10: ; preds = %bb1
; i64 variant, xmm width (2 x i64): sgt/select-of-3, masked vpbroadcastq {%k1}
; from the constant pool expected in the loop body.
3461 define void @bcast_unfold_pcmpgt_v2i64(i64* %arg) {
3462 ; CHECK-LABEL: bcast_unfold_pcmpgt_v2i64:
3463 ; CHECK: # %bb.0: # %bb
3464 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3465 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
3466 ; CHECK-NEXT: .p2align 4, 0x90
3467 ; CHECK-NEXT: .LBB99_1: # %bb1
3468 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3469 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1
3470 ; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
3471 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
3472 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
3473 ; CHECK-NEXT: addq $16, %rax
3474 ; CHECK-NEXT: jne .LBB99_1
3475 ; CHECK-NEXT: # %bb.2: # %bb10
3480 bb1: ; preds = %bb1, %bb
3481 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3482 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3483 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3484 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
3485 %tmp5 = icmp sgt <2 x i64> %tmp4, <i64 1, i64 1>
3486 %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
3487 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3488 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
3489 %tmp8 = add i64 %tmp, 2
3490 %tmp9 = icmp eq i64 %tmp8, 1024
3491 br i1 %tmp9, label %bb10, label %bb1
3493 bb10: ; preds = %bb1
; i64 variant, ymm width (4 x i64): sgt/select-of-3, masked vpbroadcastq {%k1}
; from the constant pool expected in the loop body.
3496 define void @bcast_unfold_pcmpgt_v4i64(i64* %arg) {
3497 ; CHECK-LABEL: bcast_unfold_pcmpgt_v4i64:
3498 ; CHECK: # %bb.0: # %bb
3499 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3500 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
3501 ; CHECK-NEXT: .p2align 4, 0x90
3502 ; CHECK-NEXT: .LBB100_1: # %bb1
3503 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3504 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
3505 ; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1
3506 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
3507 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
3508 ; CHECK-NEXT: addq $32, %rax
3509 ; CHECK-NEXT: jne .LBB100_1
3510 ; CHECK-NEXT: # %bb.2: # %bb10
3511 ; CHECK-NEXT: vzeroupper
3516 bb1: ; preds = %bb1, %bb
3517 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3518 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3519 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3520 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
3521 %tmp5 = icmp sgt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
3522 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
3523 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3524 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
3525 %tmp8 = add i64 %tmp, 4
3526 %tmp9 = icmp eq i64 %tmp8, 1024
3527 br i1 %tmp9, label %bb10, label %bb1
3529 bb10: ; preds = %bb1
; i64 variant, zmm width (8 x i64): sgt/select-of-3, masked vpbroadcastq {%k1}
; from the constant pool expected in the loop body.
3533 define void @bcast_unfold_pcmpgt_v8i64(i64* %arg) {
3534 ; CHECK-LABEL: bcast_unfold_pcmpgt_v8i64:
3535 ; CHECK: # %bb.0: # %bb
3536 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3537 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
3538 ; CHECK-NEXT: .p2align 4, 0x90
3539 ; CHECK-NEXT: .LBB101_1: # %bb1
3540 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3541 ; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1
3542 ; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
3543 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
3544 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
3545 ; CHECK-NEXT: addq $64, %rax
3546 ; CHECK-NEXT: jne .LBB101_1
3547 ; CHECK-NEXT: # %bb.2: # %bb10
3548 ; CHECK-NEXT: vzeroupper
3553 bb1: ; preds = %bb1, %bb
3554 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3555 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3556 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3557 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
3558 %tmp5 = icmp sgt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
3559 %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
3560 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3561 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
3562 %tmp8 = add i64 %tmp, 8
3563 %tmp9 = icmp eq i64 %tmp8, 1024
3564 br i1 %tmp9, label %bb10, label %bb1
3566 bb10: ; preds = %bb1
; Equality compare variant: x[i] = (x[i] == 1) ? 3 : x[i], 4 x i32 per pass.
; Expects vpcmpeqd -> k1 and a masked constant-pool vpbroadcastd {%k1}.
3570 define void @bcast_unfold_pcmpeq_v4i32(i32* %arg) {
3571 ; CHECK-LABEL: bcast_unfold_pcmpeq_v4i32:
3572 ; CHECK: # %bb.0: # %bb
3573 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3574 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
3575 ; CHECK-NEXT: .p2align 4, 0x90
3576 ; CHECK-NEXT: .LBB102_1: # %bb1
3577 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3578 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
3579 ; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
3580 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
3581 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
3582 ; CHECK-NEXT: addq $16, %rax
3583 ; CHECK-NEXT: jne .LBB102_1
3584 ; CHECK-NEXT: # %bb.2: # %bb10
3589 bb1: ; preds = %bb1, %bb
3590 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3591 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3592 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
3593 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
3594 %tmp5 = icmp eq <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
3595 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
3596 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
3597 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
3598 %tmp8 = add i64 %tmp, 4
3599 %tmp9 = icmp eq i64 %tmp8, 1024
3600 br i1 %tmp9, label %bb10, label %bb1
3602 bb10: ; preds = %bb1
; Equality compare, ymm width (8 x i32): vpcmpeqd -> k1, masked constant-pool
; vpbroadcastd {%k1} expected in the loop body.
3606 define void @bcast_unfold_pcmpeq_v8i32(i32* %arg) {
3607 ; CHECK-LABEL: bcast_unfold_pcmpeq_v8i32:
3608 ; CHECK: # %bb.0: # %bb
3609 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3610 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
3611 ; CHECK-NEXT: .p2align 4, 0x90
3612 ; CHECK-NEXT: .LBB103_1: # %bb1
3613 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3614 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1
3615 ; CHECK-NEXT: vpcmpeqd %ymm0, %ymm1, %k1
3616 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
3617 ; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
3618 ; CHECK-NEXT: addq $32, %rax
3619 ; CHECK-NEXT: jne .LBB103_1
3620 ; CHECK-NEXT: # %bb.2: # %bb10
3621 ; CHECK-NEXT: vzeroupper
3626 bb1: ; preds = %bb1, %bb
3627 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3628 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3629 %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
3630 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
3631 %tmp5 = icmp eq <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3632 %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
3633 %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
3634 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
3635 %tmp8 = add i64 %tmp, 8
3636 %tmp9 = icmp eq i64 %tmp8, 1024
3637 br i1 %tmp9, label %bb10, label %bb1
3639 bb10: ; preds = %bb1
; Equality compare, zmm width (16 x i32): vpcmpeqd -> k1, masked constant-pool
; vpbroadcastd {%k1} expected in the loop body.
3643 define void @bcast_unfold_pcmpeq_v16i32(i32* %arg) {
3644 ; CHECK-LABEL: bcast_unfold_pcmpeq_v16i32:
3645 ; CHECK: # %bb.0: # %bb
3646 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
3647 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3648 ; CHECK-NEXT: .p2align 4, 0x90
3649 ; CHECK-NEXT: .LBB104_1: # %bb1
3650 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3651 ; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1
3652 ; CHECK-NEXT: vpcmpeqd %zmm0, %zmm1, %k1
3653 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
3654 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
3655 ; CHECK-NEXT: addq $64, %rax
3656 ; CHECK-NEXT: jne .LBB104_1
3657 ; CHECK-NEXT: # %bb.2: # %bb10
3658 ; CHECK-NEXT: vzeroupper
3663 bb1: ; preds = %bb1, %bb
3664 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3665 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3666 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3667 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3668 %tmp5 = icmp eq <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3669 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
3670 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3671 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3672 %tmp8 = add i64 %tmp, 16
3673 %tmp9 = icmp eq i64 %tmp8, 1024
3674 br i1 %tmp9, label %bb10, label %bb1
3676 bb10: ; preds = %bb1
; Equality compare, i64 xmm width (2 x i64): vpcmpeqq -> k1, masked
; constant-pool vpbroadcastq {%k1} expected in the loop body.
3680 define void @bcast_unfold_pcmpeq_v2i64(i64* %arg) {
3681 ; CHECK-LABEL: bcast_unfold_pcmpeq_v2i64:
3682 ; CHECK: # %bb.0: # %bb
3683 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3684 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
3685 ; CHECK-NEXT: .p2align 4, 0x90
3686 ; CHECK-NEXT: .LBB105_1: # %bb1
3687 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3688 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1
3689 ; CHECK-NEXT: vpcmpeqq %xmm0, %xmm1, %k1
3690 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
3691 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
3692 ; CHECK-NEXT: addq $16, %rax
3693 ; CHECK-NEXT: jne .LBB105_1
3694 ; CHECK-NEXT: # %bb.2: # %bb10
3699 bb1: ; preds = %bb1, %bb
3700 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3701 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3702 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3703 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
3704 %tmp5 = icmp eq <2 x i64> %tmp4, <i64 1, i64 1>
3705 %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
3706 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3707 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
3708 %tmp8 = add i64 %tmp, 2
3709 %tmp9 = icmp eq i64 %tmp8, 1024
3710 br i1 %tmp9, label %bb10, label %bb1
3712 bb10: ; preds = %bb1
; Equality compare, i64 ymm width (4 x i64): vpcmpeqq -> k1, masked
; constant-pool vpbroadcastq {%k1} expected in the loop body.
3715 define void @bcast_unfold_pcmpeq_v4i64(i64* %arg) {
3716 ; CHECK-LABEL: bcast_unfold_pcmpeq_v4i64:
3717 ; CHECK: # %bb.0: # %bb
3718 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3719 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
3720 ; CHECK-NEXT: .p2align 4, 0x90
3721 ; CHECK-NEXT: .LBB106_1: # %bb1
3722 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3723 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
3724 ; CHECK-NEXT: vpcmpeqq %ymm0, %ymm1, %k1
3725 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
3726 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
3727 ; CHECK-NEXT: addq $32, %rax
3728 ; CHECK-NEXT: jne .LBB106_1
3729 ; CHECK-NEXT: # %bb.2: # %bb10
3730 ; CHECK-NEXT: vzeroupper
3735 bb1: ; preds = %bb1, %bb
3736 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3737 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3738 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3739 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
3740 %tmp5 = icmp eq <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
3741 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
3742 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3743 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
3744 %tmp8 = add i64 %tmp, 4
3745 %tmp9 = icmp eq i64 %tmp8, 1024
3746 br i1 %tmp9, label %bb10, label %bb1
3748 bb10: ; preds = %bb1
; Equality compare, i64 zmm width (8 x i64): vpcmpeqq -> k1, masked
; constant-pool vpbroadcastq {%k1} expected in the loop body.
3752 define void @bcast_unfold_pcmpeq_v8i64(i64* %arg) {
3753 ; CHECK-LABEL: bcast_unfold_pcmpeq_v8i64:
3754 ; CHECK: # %bb.0: # %bb
3755 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
3756 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
3757 ; CHECK-NEXT: .p2align 4, 0x90
3758 ; CHECK-NEXT: .LBB107_1: # %bb1
3759 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3760 ; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1
3761 ; CHECK-NEXT: vpcmpeqq %zmm0, %zmm1, %k1
3762 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
3763 ; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax)
3764 ; CHECK-NEXT: addq $64, %rax
3765 ; CHECK-NEXT: jne .LBB107_1
3766 ; CHECK-NEXT: # %bb.2: # %bb10
3767 ; CHECK-NEXT: vzeroupper
3772 bb1: ; preds = %bb1, %bb
3773 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3774 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3775 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3776 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
3777 %tmp5 = icmp eq <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
3778 %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
3779 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3780 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
3781 %tmp8 = add i64 %tmp, 8
3782 %tmp9 = icmp eq i64 %tmp8, 1024
3783 br i1 %tmp9, label %bb10, label %bb1
3785 bb10: ; preds = %bb1
; Signed-less-than variant (vpcmpltd via icmp slt): masked constant-pool
; vpbroadcastd {%k1} expected in the loop body.
; NOTE(review): %tmp9 = (slt %tmp8, 1024) branches TRUE to the exit %bb10, so
; the IR loop runs a single pass (the CHECKed `jg` matches this); the branch
; targets look swapped relative to the other tests — confirm intent upstream.
3789 define void @bcast_unfold_pcmp_v4i32(i32* %arg) {
3790 ; CHECK-LABEL: bcast_unfold_pcmp_v4i32:
3791 ; CHECK: # %bb.0: # %bb
3792 ; CHECK-NEXT: xorl %eax, %eax
3793 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
3794 ; CHECK-NEXT: .p2align 4, 0x90
3795 ; CHECK-NEXT: .LBB108_1: # %bb1
3796 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3797 ; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1
3798 ; CHECK-NEXT: vpcmpltd %xmm0, %xmm1, %k1
3799 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
3800 ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4)
3801 ; CHECK-NEXT: addq $4, %rax
3802 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3803 ; CHECK-NEXT: jg .LBB108_1
3804 ; CHECK-NEXT: # %bb.2: # %bb10
3809 bb1: ; preds = %bb1, %bb
3810 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3811 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3812 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
3813 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
3814 %tmp5 = icmp slt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
3815 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
3816 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
3817 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
3818 %tmp8 = add i64 %tmp, 4
3819 %tmp9 = icmp slt i64 %tmp8, 1024
3820 br i1 %tmp9, label %bb10, label %bb1
3822 bb10: ; preds = %bb1
; Signed-less-than, ymm width (8 x i32): vpcmpltd + masked constant-pool
; vpbroadcastd {%k1} expected in the loop body.
; NOTE(review): as in bcast_unfold_pcmp_v4i32, the br sends %tmp9==true to the
; exit %bb10 (single-pass IR loop; matches the CHECKed `jg`) — confirm intent.
3826 define void @bcast_unfold_pcmp_v8i32(i32* %arg) {
3827 ; CHECK-LABEL: bcast_unfold_pcmp_v8i32:
3828 ; CHECK: # %bb.0: # %bb
3829 ; CHECK-NEXT: xorl %eax, %eax
3830 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
3831 ; CHECK-NEXT: .p2align 4, 0x90
3832 ; CHECK-NEXT: .LBB109_1: # %bb1
3833 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3834 ; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1
3835 ; CHECK-NEXT: vpcmpltd %ymm0, %ymm1, %k1
3836 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
3837 ; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4)
3838 ; CHECK-NEXT: addq $8, %rax
3839 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3840 ; CHECK-NEXT: jg .LBB109_1
3841 ; CHECK-NEXT: # %bb.2: # %bb10
3842 ; CHECK-NEXT: vzeroupper
3847 bb1: ; preds = %bb1, %bb
3848 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3849 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3850 %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
3851 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
3852 %tmp5 = icmp slt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3853 %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
3854 %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
3855 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
3856 %tmp8 = add i64 %tmp, 8
3857 %tmp9 = icmp slt i64 %tmp8, 1024
3858 br i1 %tmp9, label %bb10, label %bb1
3860 bb10: ; preds = %bb1
; Signed-less-than, zmm width (16 x i32): vpcmpltd + masked constant-pool
; vpbroadcastd {%k1} expected in the loop body.
; NOTE(review): br sends %tmp9==true to the exit %bb10 (single-pass IR loop;
; matches the CHECKed `jg`) — confirm intent.
3864 define void @bcast_unfold_pcmp_v16i32(i32* %arg) {
3865 ; CHECK-LABEL: bcast_unfold_pcmp_v16i32:
3866 ; CHECK: # %bb.0: # %bb
3867 ; CHECK-NEXT: xorl %eax, %eax
3868 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3869 ; CHECK-NEXT: .p2align 4, 0x90
3870 ; CHECK-NEXT: .LBB110_1: # %bb1
3871 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3872 ; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1
3873 ; CHECK-NEXT: vpcmpltd %zmm0, %zmm1, %k1
3874 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
3875 ; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4)
3876 ; CHECK-NEXT: addq $16, %rax
3877 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3878 ; CHECK-NEXT: jg .LBB110_1
3879 ; CHECK-NEXT: # %bb.2: # %bb10
3880 ; CHECK-NEXT: vzeroupper
3885 bb1: ; preds = %bb1, %bb
3886 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3887 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3888 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3889 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3890 %tmp5 = icmp slt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3891 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
3892 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3893 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3894 %tmp8 = add i64 %tmp, 16
3895 %tmp9 = icmp slt i64 %tmp8, 1024
3896 br i1 %tmp9, label %bb10, label %bb1
3898 bb10: ; preds = %bb1
; Signed-less-than, i64 xmm width (2 x i64): vpcmpltq + masked constant-pool
; vpbroadcastq {%k1} expected in the loop body.
; NOTE(review): br sends %tmp9==true to the exit %bb10 (single-pass IR loop;
; matches the CHECKed `jg`) — confirm intent.
3902 define void @bcast_unfold_pcmp_v2i64(i64* %arg) {
3903 ; CHECK-LABEL: bcast_unfold_pcmp_v2i64:
3904 ; CHECK: # %bb.0: # %bb
3905 ; CHECK-NEXT: xorl %eax, %eax
3906 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
3907 ; CHECK-NEXT: .p2align 4, 0x90
3908 ; CHECK-NEXT: .LBB111_1: # %bb1
3909 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3910 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1
3911 ; CHECK-NEXT: vpcmpltq %xmm0, %xmm1, %k1
3912 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
3913 ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8)
3914 ; CHECK-NEXT: addq $2, %rax
3915 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3916 ; CHECK-NEXT: jg .LBB111_1
3917 ; CHECK-NEXT: # %bb.2: # %bb10
3922 bb1: ; preds = %bb1, %bb
3923 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3924 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3925 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3926 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
3927 %tmp5 = icmp slt <2 x i64> %tmp4, <i64 1, i64 1>
3928 %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
3929 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3930 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
3931 %tmp8 = add i64 %tmp, 2
3932 %tmp9 = icmp slt i64 %tmp8, 1024
3933 br i1 %tmp9, label %bb10, label %bb1
3935 bb10: ; preds = %bb1
; Signed-less-than, i64 ymm width (4 x i64): vpcmpltq + masked constant-pool
; vpbroadcastq {%k1} expected in the loop body.
; NOTE(review): br sends %tmp9==true to the exit %bb10 (single-pass IR loop;
; matches the CHECKed `jg`) — confirm intent.
3938 define void @bcast_unfold_pcmp_v4i64(i64* %arg) {
3939 ; CHECK-LABEL: bcast_unfold_pcmp_v4i64:
3940 ; CHECK: # %bb.0: # %bb
3941 ; CHECK-NEXT: xorl %eax, %eax
3942 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
3943 ; CHECK-NEXT: .p2align 4, 0x90
3944 ; CHECK-NEXT: .LBB112_1: # %bb1
3945 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3946 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1
3947 ; CHECK-NEXT: vpcmpltq %ymm0, %ymm1, %k1
3948 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
3949 ; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8)
3950 ; CHECK-NEXT: addq $4, %rax
3951 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3952 ; CHECK-NEXT: jg .LBB112_1
3953 ; CHECK-NEXT: # %bb.2: # %bb10
3954 ; CHECK-NEXT: vzeroupper
3959 bb1: ; preds = %bb1, %bb
3960 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3961 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3962 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3963 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
3964 %tmp5 = icmp slt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
3965 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
3966 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3967 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
3968 %tmp8 = add i64 %tmp, 4
3969 %tmp9 = icmp slt i64 %tmp8, 1024
3970 br i1 %tmp9, label %bb10, label %bb1
3972 bb10: ; preds = %bb1
; Signed-less-than, i64 zmm width (8 x i64): vpcmpltq + masked constant-pool
; vpbroadcastq {%k1} expected in the loop body.
; NOTE(review): br sends %tmp9==true to the exit %bb10 (single-pass IR loop;
; matches the CHECKed `jg`) — confirm intent.
3976 define void @bcast_unfold_pcmp_v8i64(i64* %arg) {
3977 ; CHECK-LABEL: bcast_unfold_pcmp_v8i64:
3978 ; CHECK: # %bb.0: # %bb
3979 ; CHECK-NEXT: xorl %eax, %eax
3980 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
3981 ; CHECK-NEXT: .p2align 4, 0x90
3982 ; CHECK-NEXT: .LBB113_1: # %bb1
3983 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
3984 ; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1
3985 ; CHECK-NEXT: vpcmpltq %zmm0, %zmm1, %k1
3986 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
3987 ; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8)
3988 ; CHECK-NEXT: addq $8, %rax
3989 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
3990 ; CHECK-NEXT: jg .LBB113_1
3991 ; CHECK-NEXT: # %bb.2: # %bb10
3992 ; CHECK-NEXT: vzeroupper
3997 bb1: ; preds = %bb1, %bb
3998 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3999 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4000 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
4001 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
4002 %tmp5 = icmp slt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
4003 %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
4004 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
4005 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
4006 %tmp8 = add i64 %tmp, 8
4007 %tmp9 = icmp slt i64 %tmp8, 1024
4008 br i1 %tmp9, label %bb10, label %bb1
4010 bb10: ; preds = %bb1
; Unsigned-less-than variant (vpcmpltud via icmp ult): masked constant-pool
; vpbroadcastd {%k1} expected in the loop body.
; NOTE(review): br sends %tmp9==true to the exit %bb10 (single-pass IR loop;
; matches the CHECKed unsigned `ja`) — confirm intent.
4014 define void @bcast_unfold_pcmpu_v4i32(i32* %arg) {
4015 ; CHECK-LABEL: bcast_unfold_pcmpu_v4i32:
4016 ; CHECK: # %bb.0: # %bb
4017 ; CHECK-NEXT: xorl %eax, %eax
4018 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
4019 ; CHECK-NEXT: .p2align 4, 0x90
4020 ; CHECK-NEXT: .LBB114_1: # %bb1
4021 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4022 ; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1
4023 ; CHECK-NEXT: vpcmpltud %xmm0, %xmm1, %k1
4024 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
4025 ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4)
4026 ; CHECK-NEXT: addq $4, %rax
4027 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4028 ; CHECK-NEXT: ja .LBB114_1
4029 ; CHECK-NEXT: # %bb.2: # %bb10
4034 bb1: ; preds = %bb1, %bb
4035 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4036 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4037 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
4038 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
4039 %tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
4040 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
4041 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
4042 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
4043 %tmp8 = add i64 %tmp, 4
4044 %tmp9 = icmp ult i64 %tmp8, 1024
4045 br i1 %tmp9, label %bb10, label %bb1
4047 bb10: ; preds = %bb1
; Unsigned-less-than, ymm width (8 x i32): vpcmpltud + masked constant-pool
; vpbroadcastd {%k1} expected in the loop body.
; NOTE(review): br sends %tmp9==true to the exit %bb10 (single-pass IR loop;
; matches the CHECKed `ja`) — confirm intent.
4051 define void @bcast_unfold_pcmpu_v8i32(i32* %arg) {
4052 ; CHECK-LABEL: bcast_unfold_pcmpu_v8i32:
4053 ; CHECK: # %bb.0: # %bb
4054 ; CHECK-NEXT: xorl %eax, %eax
4055 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
4056 ; CHECK-NEXT: .p2align 4, 0x90
4057 ; CHECK-NEXT: .LBB115_1: # %bb1
4058 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4059 ; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1
4060 ; CHECK-NEXT: vpcmpltud %ymm0, %ymm1, %k1
4061 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
4062 ; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4)
4063 ; CHECK-NEXT: addq $8, %rax
4064 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4065 ; CHECK-NEXT: ja .LBB115_1
4066 ; CHECK-NEXT: # %bb.2: # %bb10
4067 ; CHECK-NEXT: vzeroupper
4072 bb1: ; preds = %bb1, %bb
4073 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4074 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4075 %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
4076 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
4077 %tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
4078 %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
4079 %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
4080 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
4081 %tmp8 = add i64 %tmp, 8
4082 %tmp9 = icmp ult i64 %tmp8, 1024
4083 br i1 %tmp9, label %bb10, label %bb1
4085 bb10: ; preds = %bb1
; Unsigned-less-than, zmm width (16 x i32): vpcmpltud + masked constant-pool
; vpbroadcastd {%k1} expected in the loop body.
; NOTE(review): br sends %tmp9==true to the exit %bb10 (single-pass IR loop;
; matches the CHECKed `ja`) — confirm intent.
4089 define void @bcast_unfold_pcmpu_v16i32(i32* %arg) {
4090 ; CHECK-LABEL: bcast_unfold_pcmpu_v16i32:
4091 ; CHECK: # %bb.0: # %bb
4092 ; CHECK-NEXT: xorl %eax, %eax
4093 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
4094 ; CHECK-NEXT: .p2align 4, 0x90
4095 ; CHECK-NEXT: .LBB116_1: # %bb1
4096 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4097 ; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1
4098 ; CHECK-NEXT: vpcmpltud %zmm0, %zmm1, %k1
4099 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
4100 ; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4)
4101 ; CHECK-NEXT: addq $16, %rax
4102 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4103 ; CHECK-NEXT: ja .LBB116_1
4104 ; CHECK-NEXT: # %bb.2: # %bb10
4105 ; CHECK-NEXT: vzeroupper
4110 bb1: ; preds = %bb1, %bb
4111 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4112 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4113 %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
4114 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
4115 %tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
4116 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
4117 %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
4118 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
4119 %tmp8 = add i64 %tmp, 16
4120 %tmp9 = icmp ult i64 %tmp8, 1024
4121 br i1 %tmp9, label %bb10, label %bb1
4123 bb10: ; preds = %bb1
; Unsigned-less-than, i64 xmm width (2 x i64): vpcmpltuq + masked
; constant-pool vpbroadcastq {%k1} expected in the loop body.
; NOTE(review): br sends %tmp9==true to the exit %bb10 (single-pass IR loop;
; matches the CHECKed `ja`) — confirm intent.
4127 define void @bcast_unfold_pcmpu_v2i64(i64* %arg) {
4128 ; CHECK-LABEL: bcast_unfold_pcmpu_v2i64:
4129 ; CHECK: # %bb.0: # %bb
4130 ; CHECK-NEXT: xorl %eax, %eax
4131 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
4132 ; CHECK-NEXT: .p2align 4, 0x90
4133 ; CHECK-NEXT: .LBB117_1: # %bb1
4134 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4135 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1
4136 ; CHECK-NEXT: vpcmpltuq %xmm0, %xmm1, %k1
4137 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
4138 ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8)
4139 ; CHECK-NEXT: addq $2, %rax
4140 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4141 ; CHECK-NEXT: ja .LBB117_1
4142 ; CHECK-NEXT: # %bb.2: # %bb10
4147 bb1: ; preds = %bb1, %bb
4148 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4149 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4150 %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
4151 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
4152 %tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
4153 %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
4154 %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
4155 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
4156 %tmp8 = add i64 %tmp, 2
4157 %tmp9 = icmp ult i64 %tmp8, 1024
4158 br i1 %tmp9, label %bb10, label %bb1
4160 bb10: ; preds = %bb1
; Unsigned-less-than, i64 ymm width (4 x i64): vpcmpltuq + masked
; constant-pool vpbroadcastq {%k1} expected in the loop body.
; NOTE(review): br sends %tmp9==true to the exit %bb10 (single-pass IR loop;
; matches the CHECKed `ja`) — confirm intent.
4163 define void @bcast_unfold_pcmpu_v4i64(i64* %arg) {
4164 ; CHECK-LABEL: bcast_unfold_pcmpu_v4i64:
4165 ; CHECK: # %bb.0: # %bb
4166 ; CHECK-NEXT: xorl %eax, %eax
4167 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
4168 ; CHECK-NEXT: .p2align 4, 0x90
4169 ; CHECK-NEXT: .LBB118_1: # %bb1
4170 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4171 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1
4172 ; CHECK-NEXT: vpcmpltuq %ymm0, %ymm1, %k1
4173 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
4174 ; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8)
4175 ; CHECK-NEXT: addq $4, %rax
4176 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4177 ; CHECK-NEXT: ja .LBB118_1
4178 ; CHECK-NEXT: # %bb.2: # %bb10
4179 ; CHECK-NEXT: vzeroupper
4184 bb1: ; preds = %bb1, %bb
4185 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4186 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4187 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
4188 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
4189 %tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
4190 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
4191 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
4192 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
4193 %tmp8 = add i64 %tmp, 4
4194 %tmp9 = icmp ult i64 %tmp8, 1024
4195 br i1 %tmp9, label %bb10, label %bb1
4197 bb10: ; preds = %bb1
; Unsigned-less-than, i64 zmm width (8 x i64): vpcmpltuq + masked
; constant-pool vpbroadcastq {%k1} expected in the loop body.
; NOTE(review): br sends %tmp9==true to the exit %bb10 (single-pass IR loop;
; matches the CHECKed `ja`) — confirm intent.
4201 define void @bcast_unfold_pcmpu_v8i64(i64* %arg) {
4202 ; CHECK-LABEL: bcast_unfold_pcmpu_v8i64:
4203 ; CHECK: # %bb.0: # %bb
4204 ; CHECK-NEXT: xorl %eax, %eax
4205 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
4206 ; CHECK-NEXT: .p2align 4, 0x90
4207 ; CHECK-NEXT: .LBB119_1: # %bb1
4208 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4209 ; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1
4210 ; CHECK-NEXT: vpcmpltuq %zmm0, %zmm1, %k1
4211 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
4212 ; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8)
4213 ; CHECK-NEXT: addq $8, %rax
4214 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
4215 ; CHECK-NEXT: ja .LBB119_1
4216 ; CHECK-NEXT: # %bb.2: # %bb10
4217 ; CHECK-NEXT: vzeroupper
4222 bb1: ; preds = %bb1, %bb
4223 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4224 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4225 %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
4226 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
4227 %tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
4228 %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
4229 %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
4230 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
4231 %tmp8 = add i64 %tmp, 8
4232 %tmp9 = icmp ult i64 %tmp8, 1024
4233 br i1 %tmp9, label %bb10, label %bb1
4235 bb10: ; preds = %bb1
; FP variant: x[i] = (x[i] < 2.0) ? x[i] : 3.0, 4 x float per pass. Unlike the
; integer tests above, the CHECKs show BOTH splats (2.0, 3.0) unfolded and
; hoisted out of the loop as vbroadcastss, with vblendmps {%k1} doing the
; select — i.e. the constant-pool broadcast load is successfully unfolded.
4239 define void @bcast_unfold_cmp_v4f32(float* %arg) {
4240 ; CHECK-LABEL: bcast_unfold_cmp_v4f32:
4241 ; CHECK: # %bb.0: # %bb
4242 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4243 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4244 ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4245 ; CHECK-NEXT: .p2align 4, 0x90
4246 ; CHECK-NEXT: .LBB120_1: # %bb1
4247 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4248 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm2
4249 ; CHECK-NEXT: vcmpltps %xmm0, %xmm2, %k1
4250 ; CHECK-NEXT: vblendmps %xmm2, %xmm1, %xmm2 {%k1}
4251 ; CHECK-NEXT: vmovups %xmm2, 4096(%rdi,%rax)
4252 ; CHECK-NEXT: addq $16, %rax
4253 ; CHECK-NEXT: jne .LBB120_1
4254 ; CHECK-NEXT: # %bb.2: # %bb10
4259 bb1: ; preds = %bb1, %bb
4260 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4261 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
4262 %tmp3 = bitcast float* %tmp2 to <4 x float>*
4263 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
4264 %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4265 %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4266 %tmp7 = bitcast float* %tmp2 to <4 x float>*
4267 store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
4268 %tmp8 = add i64 %tmp, 4
4269 %tmp9 = icmp eq i64 %tmp8, 1024
4270 br i1 %tmp9, label %bb10, label %bb1
4272 bb10: ; preds = %bb1
4276 define void @bcast_unfold_cmp_v8f32(float* %arg) {
4277 ; CHECK-LABEL: bcast_unfold_cmp_v8f32:
4278 ; CHECK: # %bb.0: # %bb
4279 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4280 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4281 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4282 ; CHECK-NEXT: .p2align 4, 0x90
4283 ; CHECK-NEXT: .LBB121_1: # %bb1
4284 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4285 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm2
4286 ; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %k1
4287 ; CHECK-NEXT: vblendmps %ymm2, %ymm1, %ymm2 {%k1}
4288 ; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax)
4289 ; CHECK-NEXT: addq $32, %rax
4290 ; CHECK-NEXT: jne .LBB121_1
4291 ; CHECK-NEXT: # %bb.2: # %bb10
4292 ; CHECK-NEXT: vzeroupper
4297 bb1: ; preds = %bb1, %bb
4298 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4299 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
4300 %tmp3 = bitcast float* %tmp2 to <8 x float>*
4301 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
4302 %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4303 %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4304 %tmp7 = bitcast float* %tmp2 to <8 x float>*
4305 store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
4306 %tmp8 = add i64 %tmp, 8
4307 %tmp9 = icmp eq i64 %tmp8, 1024
4308 br i1 %tmp9, label %bb10, label %bb1
4310 bb10: ; preds = %bb1
4314 define void @bcast_unfold_cmp_v16f32(float* %arg) {
4315 ; CHECK-LABEL: bcast_unfold_cmp_v16f32:
4316 ; CHECK: # %bb.0: # %bb
4317 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4318 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4319 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4320 ; CHECK-NEXT: .p2align 4, 0x90
4321 ; CHECK-NEXT: .LBB122_1: # %bb1
4322 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4323 ; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm2
4324 ; CHECK-NEXT: vcmpltps %zmm0, %zmm2, %k1
4325 ; CHECK-NEXT: vblendmps %zmm2, %zmm1, %zmm2 {%k1}
4326 ; CHECK-NEXT: vmovups %zmm2, 4096(%rdi,%rax)
4327 ; CHECK-NEXT: addq $64, %rax
4328 ; CHECK-NEXT: jne .LBB122_1
4329 ; CHECK-NEXT: # %bb.2: # %bb10
4330 ; CHECK-NEXT: vzeroupper
4335 bb1: ; preds = %bb1, %bb
4336 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4337 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
4338 %tmp3 = bitcast float* %tmp2 to <16 x float>*
4339 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
4340 %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4341 %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4342 %tmp7 = bitcast float* %tmp2 to <16 x float>*
4343 store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
4344 %tmp8 = add i64 %tmp, 16
4345 %tmp9 = icmp eq i64 %tmp8, 1024
4346 br i1 %tmp9, label %bb10, label %bb1
4348 bb10: ; preds = %bb1
4352 define void @bcast_unfold_cmp_v2f64(double* %arg) {
4353 ; CHECK-LABEL: bcast_unfold_cmp_v2f64:
4354 ; CHECK: # %bb.0: # %bb
4355 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
4356 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
4357 ; CHECK-NEXT: # xmm0 = mem[0,0]
4358 ; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [3.0E+0,3.0E+0]
4359 ; CHECK-NEXT: # xmm1 = mem[0,0]
4360 ; CHECK-NEXT: .p2align 4, 0x90
4361 ; CHECK-NEXT: .LBB123_1: # %bb1
4362 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4363 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm2
4364 ; CHECK-NEXT: vcmpltpd %xmm0, %xmm2, %k1
4365 ; CHECK-NEXT: vblendmpd %xmm2, %xmm1, %xmm2 {%k1}
4366 ; CHECK-NEXT: vmovupd %xmm2, 8192(%rdi,%rax)
4367 ; CHECK-NEXT: addq $16, %rax
4368 ; CHECK-NEXT: jne .LBB123_1
4369 ; CHECK-NEXT: # %bb.2: # %bb10
4374 bb1: ; preds = %bb1, %bb
4375 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4376 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
4377 %tmp3 = bitcast double* %tmp2 to <2 x double>*
4378 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
4379 %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
4380 %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 3.000000e+00, double 3.000000e+00>
4381 %tmp7 = bitcast double* %tmp2 to <2 x double>*
4382 store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
4383 %tmp8 = add i64 %tmp, 2
4384 %tmp9 = icmp eq i64 %tmp8, 1024
4385 br i1 %tmp9, label %bb10, label %bb1
4387 bb10: ; preds = %bb1
4391 define void @bcast_unfold_cmp_v4f64(double* %arg) {
4392 ; CHECK-LABEL: bcast_unfold_cmp_v4f64:
4393 ; CHECK: # %bb.0: # %bb
4394 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
4395 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4396 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4397 ; CHECK-NEXT: .p2align 4, 0x90
4398 ; CHECK-NEXT: .LBB124_1: # %bb1
4399 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4400 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm2
4401 ; CHECK-NEXT: vcmpltpd %ymm0, %ymm2, %k1
4402 ; CHECK-NEXT: vblendmpd %ymm2, %ymm1, %ymm2 {%k1}
4403 ; CHECK-NEXT: vmovupd %ymm2, 8192(%rdi,%rax)
4404 ; CHECK-NEXT: addq $32, %rax
4405 ; CHECK-NEXT: jne .LBB124_1
4406 ; CHECK-NEXT: # %bb.2: # %bb10
4407 ; CHECK-NEXT: vzeroupper
4412 bb1: ; preds = %bb1, %bb
4413 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4414 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
4415 %tmp3 = bitcast double* %tmp2 to <4 x double>*
4416 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
4417 %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
4418 %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
4419 %tmp7 = bitcast double* %tmp2 to <4 x double>*
4420 store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
4421 %tmp8 = add i64 %tmp, 4
4422 %tmp9 = icmp eq i64 %tmp8, 1024
4423 br i1 %tmp9, label %bb10, label %bb1
4425 bb10: ; preds = %bb1
4429 define void @bcast_unfold_cmp_v8f64(double* %arg) {
4430 ; CHECK-LABEL: bcast_unfold_cmp_v8f64:
4431 ; CHECK: # %bb.0: # %bb
4432 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
4433 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4434 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4435 ; CHECK-NEXT: .p2align 4, 0x90
4436 ; CHECK-NEXT: .LBB125_1: # %bb1
4437 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4438 ; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm2
4439 ; CHECK-NEXT: vcmpltpd %zmm0, %zmm2, %k1
4440 ; CHECK-NEXT: vblendmpd %zmm2, %zmm1, %zmm2 {%k1}
4441 ; CHECK-NEXT: vmovupd %zmm2, 8192(%rdi,%rax)
4442 ; CHECK-NEXT: addq $64, %rax
4443 ; CHECK-NEXT: jne .LBB125_1
4444 ; CHECK-NEXT: # %bb.2: # %bb10
4445 ; CHECK-NEXT: vzeroupper
4450 bb1: ; preds = %bb1, %bb
4451 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4452 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
4453 %tmp3 = bitcast double* %tmp2 to <8 x double>*
4454 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
4455 %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
4456 %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
4457 %tmp7 = bitcast double* %tmp2 to <8 x double>*
4458 store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
4459 %tmp8 = add i64 %tmp, 8
4460 %tmp9 = icmp eq i64 %tmp8, 1024
4461 br i1 %tmp9, label %bb10, label %bb1
4463 bb10: ; preds = %bb1
4467 define void @bcast_unfold_cmp_v8f32_refold(float* nocapture %0) {
4468 ; CHECK-LABEL: bcast_unfold_cmp_v8f32_refold:
4470 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4471 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4472 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4473 ; CHECK-NEXT: .p2align 4, 0x90
4474 ; CHECK-NEXT: .LBB126_1: # =>This Inner Loop Header: Depth=1
4475 ; CHECK-NEXT: vcmpgtps 4096(%rdi,%rax), %ymm0, %k1
4476 ; CHECK-NEXT: vblendmps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 {%k1}
4477 ; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax)
4478 ; CHECK-NEXT: addq $32, %rax
4479 ; CHECK-NEXT: jne .LBB126_1
4480 ; CHECK-NEXT: # %bb.2:
4481 ; CHECK-NEXT: vzeroupper
4486 %3 = phi i64 [ 0, %1 ], [ %10, %2 ]
4487 %4 = getelementptr inbounds float, float* %0, i64 %3
4488 %5 = bitcast float* %4 to <8 x float>*
4489 %6 = load <8 x float>, <8 x float>* %5, align 4
4490 %7 = fcmp olt <8 x float> %6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4491 %8 = select <8 x i1> %7, <8 x float> <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4492 %9 = bitcast float* %4 to <8 x float>*
4493 store <8 x float> %8, <8 x float>* %9, align 4
4495 %11 = icmp eq i64 %10, 1024
4496 br i1 %11, label %12, label %2
4502 define void @bcast_unfold_ptestm_v4i32(i32* %arg) {
4503 ; CHECK-LABEL: bcast_unfold_ptestm_v4i32:
4504 ; CHECK: # %bb.0: # %bb
4505 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4506 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
4507 ; CHECK-NEXT: .p2align 4, 0x90
4508 ; CHECK-NEXT: .LBB127_1: # %bb1
4509 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4510 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
4511 ; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k1
4512 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
4513 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
4514 ; CHECK-NEXT: addq $16, %rax
4515 ; CHECK-NEXT: jne .LBB127_1
4516 ; CHECK-NEXT: # %bb.2: # %bb10
4521 bb1: ; preds = %bb1, %bb
4522 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4523 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4524 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
4525 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
4526 %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
4527 %tmp5 = icmp ne <4 x i32> %tmp4b, zeroinitializer
4528 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
4529 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
4530 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
4531 %tmp8 = add i64 %tmp, 4
4532 %tmp9 = icmp eq i64 %tmp8, 1024
4533 br i1 %tmp9, label %bb10, label %bb1
4535 bb10: ; preds = %bb1
4539 define void @bcast_unfold_ptestnm_v4i32(i32* %arg) {
4540 ; CHECK-LABEL: bcast_unfold_ptestnm_v4i32:
4541 ; CHECK: # %bb.0: # %bb
4542 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4543 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
4544 ; CHECK-NEXT: .p2align 4, 0x90
4545 ; CHECK-NEXT: .LBB128_1: # %bb1
4546 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4547 ; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1
4548 ; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k1
4549 ; CHECK-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
4550 ; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
4551 ; CHECK-NEXT: addq $16, %rax
4552 ; CHECK-NEXT: jne .LBB128_1
4553 ; CHECK-NEXT: # %bb.2: # %bb10
4558 bb1: ; preds = %bb1, %bb
4559 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4560 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4561 %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
4562 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
4563 %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
4564 %tmp5 = icmp eq <4 x i32> %tmp4b, zeroinitializer
4565 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
4566 %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
4567 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
4568 %tmp8 = add i64 %tmp, 4
4569 %tmp9 = icmp eq i64 %tmp8, 1024
4570 br i1 %tmp9, label %bb10, label %bb1
4572 bb10: ; preds = %bb1
4576 define void @bcast_unfold_ptestm_v4i64(i64* %arg) {
4577 ; CHECK-LABEL: bcast_unfold_ptestm_v4i64:
4578 ; CHECK: # %bb.0: # %bb
4579 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
4580 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
4581 ; CHECK-NEXT: .p2align 4, 0x90
4582 ; CHECK-NEXT: .LBB129_1: # %bb1
4583 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4584 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
4585 ; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k1
4586 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
4587 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
4588 ; CHECK-NEXT: addq $32, %rax
4589 ; CHECK-NEXT: jne .LBB129_1
4590 ; CHECK-NEXT: # %bb.2: # %bb10
4591 ; CHECK-NEXT: vzeroupper
4596 bb1: ; preds = %bb1, %bb
4597 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4598 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4599 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
4600 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
4601 %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
4602 %tmp5 = icmp ne <4 x i64> %tmp4b, zeroinitializer
4603 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
4604 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
4605 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
4606 %tmp8 = add i64 %tmp, 4
4607 %tmp9 = icmp eq i64 %tmp8, 1024
4608 br i1 %tmp9, label %bb10, label %bb1
4610 bb10: ; preds = %bb1
4614 define void @bcast_unfold_ptestnm_v4i64(i64* %arg) {
4615 ; CHECK-LABEL: bcast_unfold_ptestnm_v4i64:
4616 ; CHECK: # %bb.0: # %bb
4617 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
4618 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
4619 ; CHECK-NEXT: .p2align 4, 0x90
4620 ; CHECK-NEXT: .LBB130_1: # %bb1
4621 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4622 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1
4623 ; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k1
4624 ; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1}
4625 ; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax)
4626 ; CHECK-NEXT: addq $32, %rax
4627 ; CHECK-NEXT: jne .LBB130_1
4628 ; CHECK-NEXT: # %bb.2: # %bb10
4629 ; CHECK-NEXT: vzeroupper
4634 bb1: ; preds = %bb1, %bb
4635 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4636 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4637 %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
4638 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
4639 %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
4640 %tmp5 = icmp eq <4 x i64> %tmp4b, zeroinitializer
4641 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
4642 %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
4643 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
4644 %tmp8 = add i64 %tmp, 4
4645 %tmp9 = icmp eq i64 %tmp8, 1024
4646 br i1 %tmp9, label %bb10, label %bb1
4648 bb10: ; preds = %bb1
4652 ; The or/and pattern here should be turned into vpternlog. The multiply is
4653 ; there to increase the use count of the loads so they can't fold. We want to
4654 ; unfold the broadcast and pull it out of the loop.
4655 define void @bcast_unfold_vpternlog_v16i32(i32* %arg, i32* %arg1) {
4656 ; CHECK-LABEL: bcast_unfold_vpternlog_v16i32:
4657 ; CHECK: # %bb.0: # %bb
4658 ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
4659 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
4660 ; CHECK-NEXT: .p2align 4, 0x90
4661 ; CHECK-NEXT: .LBB131_1: # %bb2
4662 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
4663 ; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1
4664 ; CHECK-NEXT: vmovdqu64 4096(%rsi,%rax), %zmm2
4665 ; CHECK-NEXT: vpmulld %zmm2, %zmm1, %zmm3
4666 ; CHECK-NEXT: vpternlogd $216, %zmm0, %zmm1, %zmm2
4667 ; CHECK-NEXT: vpmulld %zmm3, %zmm2, %zmm1
4668 ; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
4669 ; CHECK-NEXT: addq $64, %rax
4670 ; CHECK-NEXT: jne .LBB131_1
4671 ; CHECK-NEXT: # %bb.2: # %bb20
4672 ; CHECK-NEXT: vzeroupper
4677 bb2: ; preds = %bb2, %bb
4678 %tmp = phi i64 [ 0, %bb ], [ %tmp18, %bb2 ]
4679 %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4680 %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
4681 %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
4682 %tmp6 = getelementptr inbounds i32, i32* %arg1, i64 %tmp
4683 %tmp10 = bitcast i32* %tmp6 to <16 x i32>*
4684 %tmp11 = load <16 x i32>, <16 x i32>* %tmp10, align 4
4685 %tmp12 = and <16 x i32> %tmp5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
4686 %tmp13 = and <16 x i32> %tmp11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
4687 %tmp14 = or <16 x i32> %tmp12, %tmp13
4688 %tmp15 = mul <16 x i32> %tmp14, %tmp5
4689 %tmp16 = mul <16 x i32> %tmp15, %tmp11
4690 %tmp17 = bitcast i32* %tmp3 to <16 x i32>*
4691 store <16 x i32> %tmp16, <16 x i32>* %tmp17, align 4
4692 %tmp18 = add i64 %tmp, 16
4693 %tmp19 = icmp eq i64 %tmp18, 1024
4694 br i1 %tmp19, label %bb20, label %bb2
4696 bb20: ; preds = %bb2