1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc < %s | FileCheck %s
4 ; Check that the SCEVs produced from the multiple loops don't attempt to get
5 ; combined in invalid ways. The LSR filtering could attempt to combine addrecs
6 ; from different loops.
8 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
9 target triple = "x86_64-unknown-linux-gnu"
11 define void @in4dob_(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2, i64 %3, i1 %min.iters.check840) "target-cpu"="icelake-server" {
12 ; CHECK-LABEL: in4dob_:
13 ; CHECK: # %bb.0: # %.preheader263
14 ; CHECK-NEXT: leaq (,%rcx,4), %r9
15 ; CHECK-NEXT: movl $1, %r10d
16 ; CHECK-NEXT: xorl %eax, %eax
17 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
18 ; CHECK-NEXT: jmp .LBB0_1
19 ; CHECK-NEXT: .p2align 4, 0x90
20 ; CHECK-NEXT: .LBB0_20: # in Loop: Header=BB0_1 Depth=1
21 ; CHECK-NEXT: incq %r10
22 ; CHECK-NEXT: addq %r9, %rax
23 ; CHECK-NEXT: cmpq %r10, %rcx
24 ; CHECK-NEXT: je .LBB0_18
25 ; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
26 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
27 ; CHECK-NEXT: vucomiss %xmm0, %xmm1
28 ; CHECK-NEXT: jne .LBB0_20
29 ; CHECK-NEXT: jp .LBB0_20
30 ; CHECK-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1
31 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
32 ; CHECK-NEXT: vucomiss %xmm0, %xmm1
33 ; CHECK-NEXT: jne .LBB0_20
34 ; CHECK-NEXT: jp .LBB0_20
35 ; CHECK-NEXT: # %bb.3: # %vector.body807.preheader
36 ; CHECK-NEXT: leaq 1(%rcx), %rdx
37 ; CHECK-NEXT: movl %edx, %esi
38 ; CHECK-NEXT: andl $7, %esi
39 ; CHECK-NEXT: cmpq $7, %rcx
40 ; CHECK-NEXT: jae .LBB0_5
41 ; CHECK-NEXT: # %bb.4:
42 ; CHECK-NEXT: xorl %r9d, %r9d
43 ; CHECK-NEXT: jmp .LBB0_7
44 ; CHECK-NEXT: .LBB0_5: # %vector.body807.preheader.new
45 ; CHECK-NEXT: movq %rdx, %r10
46 ; CHECK-NEXT: andq $-8, %r10
47 ; CHECK-NEXT: xorl %r9d, %r9d
48 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
49 ; CHECK-NEXT: .p2align 4, 0x90
50 ; CHECK-NEXT: .LBB0_6: # %vector.body807
51 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
52 ; CHECK-NEXT: leaq (%rdi,%r9), %r11
53 ; CHECK-NEXT: vmovups %ymm0, (%rax,%r11)
54 ; CHECK-NEXT: vmovups %ymm0, 1(%rax,%r11)
55 ; CHECK-NEXT: vmovups %ymm0, 2(%rax,%r11)
56 ; CHECK-NEXT: vmovups %ymm0, 3(%rax,%r11)
57 ; CHECK-NEXT: vmovups %ymm0, 4(%rax,%r11)
58 ; CHECK-NEXT: vmovups %ymm0, 5(%rax,%r11)
59 ; CHECK-NEXT: vmovups %ymm0, 6(%rax,%r11)
60 ; CHECK-NEXT: vmovups %ymm0, 7(%rax,%r11)
61 ; CHECK-NEXT: addq $8, %r9
62 ; CHECK-NEXT: cmpq %r9, %r10
63 ; CHECK-NEXT: jne .LBB0_6
64 ; CHECK-NEXT: .LBB0_7: # %.lr.ph373.unr-lcssa
65 ; CHECK-NEXT: testq %rsi, %rsi
66 ; CHECK-NEXT: je .LBB0_10
67 ; CHECK-NEXT: # %bb.8: # %vector.body807.epil.preheader
68 ; CHECK-NEXT: addq %rdi, %r9
69 ; CHECK-NEXT: xorl %r10d, %r10d
70 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
71 ; CHECK-NEXT: .p2align 4, 0x90
72 ; CHECK-NEXT: .LBB0_9: # %vector.body807.epil
73 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
74 ; CHECK-NEXT: leaq (%r9,%r10), %r11
75 ; CHECK-NEXT: vmovups %ymm0, (%rax,%r11)
76 ; CHECK-NEXT: incq %r10
77 ; CHECK-NEXT: cmpq %r10, %rsi
78 ; CHECK-NEXT: jne .LBB0_9
79 ; CHECK-NEXT: .LBB0_10: # %.lr.ph373
80 ; CHECK-NEXT: testb $1, %r8b
81 ; CHECK-NEXT: je .LBB0_11
82 ; CHECK-NEXT: # %bb.19: # %scalar.ph839.preheader
83 ; CHECK-NEXT: movl $0, (%rdi)
84 ; CHECK-NEXT: vzeroupper
86 ; CHECK-NEXT: .LBB0_11: # %vector.body847.preheader
87 ; CHECK-NEXT: movl %edx, %esi
88 ; CHECK-NEXT: andl $7, %esi
89 ; CHECK-NEXT: cmpq $7, %rcx
90 ; CHECK-NEXT: jae .LBB0_13
91 ; CHECK-NEXT: # %bb.12:
92 ; CHECK-NEXT: xorl %ecx, %ecx
93 ; CHECK-NEXT: jmp .LBB0_15
94 ; CHECK-NEXT: .LBB0_13: # %vector.body847.preheader.new
95 ; CHECK-NEXT: andq $-8, %rdx
96 ; CHECK-NEXT: xorl %ecx, %ecx
97 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
98 ; CHECK-NEXT: .p2align 4, 0x90
99 ; CHECK-NEXT: .LBB0_14: # %vector.body847
100 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
101 ; CHECK-NEXT: leaq (%rdi,%rcx), %r8
102 ; CHECK-NEXT: vmovups %ymm0, 96(%rax,%r8)
103 ; CHECK-NEXT: vmovups %ymm0, 97(%rax,%r8)
104 ; CHECK-NEXT: vmovups %ymm0, 98(%rax,%r8)
105 ; CHECK-NEXT: vmovups %ymm0, 99(%rax,%r8)
106 ; CHECK-NEXT: vmovups %ymm0, 100(%rax,%r8)
107 ; CHECK-NEXT: vmovups %ymm0, 101(%rax,%r8)
108 ; CHECK-NEXT: vmovups %ymm0, 102(%rax,%r8)
109 ; CHECK-NEXT: vmovups %ymm0, 103(%rax,%r8)
110 ; CHECK-NEXT: addq $8, %rcx
111 ; CHECK-NEXT: cmpq %rcx, %rdx
112 ; CHECK-NEXT: jne .LBB0_14
113 ; CHECK-NEXT: .LBB0_15: # %common.ret.loopexit.unr-lcssa
114 ; CHECK-NEXT: testq %rsi, %rsi
115 ; CHECK-NEXT: je .LBB0_18
116 ; CHECK-NEXT: # %bb.16: # %vector.body847.epil.preheader
117 ; CHECK-NEXT: leaq 96(%rcx,%rdi), %rcx
118 ; CHECK-NEXT: xorl %edx, %edx
119 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
120 ; CHECK-NEXT: .p2align 4, 0x90
121 ; CHECK-NEXT: .LBB0_17: # %vector.body847.epil
122 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
123 ; CHECK-NEXT: leaq (%rcx,%rdx), %rdi
124 ; CHECK-NEXT: vmovups %ymm0, (%rax,%rdi)
125 ; CHECK-NEXT: incq %rdx
126 ; CHECK-NEXT: cmpq %rdx, %rsi
127 ; CHECK-NEXT: jne .LBB0_17
128 ; CHECK-NEXT: .LBB0_18: # %common.ret
129 ; CHECK-NEXT: vzeroupper
135 5: ; preds = %16, %.preheader263
136 %lsr.iv1135 = phi ptr [ %0, %.preheader263 ], [ %uglygep1136, %16 ]
137 %indvars.iv487 = phi i64 [ 1, %.preheader263 ], [ %indvars.iv.next488, %16 ]
138 %6 = getelementptr float, ptr %1, i64 %indvars.iv487
139 %7 = load float, ptr %6, align 4
140 %8 = fcmp oeq float %7, 0.000000e+00
141 %9 = getelementptr float, ptr %2, i64 %indvars.iv487
142 %10 = load float, ptr %9, align 4
143 %11 = fcmp oeq float %10, 0.000000e+00
145 br i1 %12, label %vector.body807.preheader, label %16
147 vector.body807.preheader: ; preds = %5
149 %xtraiter = and i64 %13, 7
150 %14 = icmp ult i64 %3, 7
151 br i1 %14, label %.lr.ph373.unr-lcssa, label %vector.body807.preheader.new
153 vector.body807.preheader.new: ; preds = %vector.body807.preheader
154 %unroll_iter = and i64 %13, -8
155 br label %vector.body807
157 vector.body807: ; preds = %vector.body807, %vector.body807.preheader.new
158 %lsr.iv1194 = phi i64 [ 0, %vector.body807.preheader.new ], [ %lsr.iv.next1195.7, %vector.body807 ]
159 %niter = phi i64 [ 0, %vector.body807.preheader.new ], [ %niter.next.7, %vector.body807 ]
160 %uglygep1197 = getelementptr i8, ptr %lsr.iv1135, i64 %lsr.iv1194
161 store <8 x float> zeroinitializer, ptr %uglygep1197, align 4
162 %lsr.iv.next1195 = or disjoint i64 %lsr.iv1194, 1
163 %uglygep1197.1 = getelementptr i8, ptr %lsr.iv1135, i64 %lsr.iv.next1195
164 store <8 x float> zeroinitializer, ptr %uglygep1197.1, align 4
165 %lsr.iv.next1195.1 = or disjoint i64 %lsr.iv1194, 2
166 %uglygep1197.2 = getelementptr i8, ptr %lsr.iv1135, i64 %lsr.iv.next1195.1
167 store <8 x float> zeroinitializer, ptr %uglygep1197.2, align 4
168 %lsr.iv.next1195.2 = or disjoint i64 %lsr.iv1194, 3
169 %uglygep1197.3 = getelementptr i8, ptr %lsr.iv1135, i64 %lsr.iv.next1195.2
170 store <8 x float> zeroinitializer, ptr %uglygep1197.3, align 4
171 %lsr.iv.next1195.3 = or disjoint i64 %lsr.iv1194, 4
172 %uglygep1197.4 = getelementptr i8, ptr %lsr.iv1135, i64 %lsr.iv.next1195.3
173 store <8 x float> zeroinitializer, ptr %uglygep1197.4, align 4
174 %lsr.iv.next1195.4 = or disjoint i64 %lsr.iv1194, 5
175 %uglygep1197.5 = getelementptr i8, ptr %lsr.iv1135, i64 %lsr.iv.next1195.4
176 store <8 x float> zeroinitializer, ptr %uglygep1197.5, align 4
177 %lsr.iv.next1195.5 = or disjoint i64 %lsr.iv1194, 6
178 %uglygep1197.6 = getelementptr i8, ptr %lsr.iv1135, i64 %lsr.iv.next1195.5
179 store <8 x float> zeroinitializer, ptr %uglygep1197.6, align 4
180 %lsr.iv.next1195.6 = or disjoint i64 %lsr.iv1194, 7
181 %uglygep1197.7 = getelementptr i8, ptr %lsr.iv1135, i64 %lsr.iv.next1195.6
182 store <8 x float> zeroinitializer, ptr %uglygep1197.7, align 4
183 %lsr.iv.next1195.7 = add i64 %lsr.iv1194, 8
184 %niter.next.7 = add i64 %niter, 8
185 %niter.ncmp.7 = icmp eq i64 %niter.next.7, %unroll_iter
186 br i1 %niter.ncmp.7, label %.lr.ph373.unr-lcssa.loopexit, label %vector.body807
188 .lr.ph373.unr-lcssa.loopexit: ; preds = %vector.body807
189 br label %.lr.ph373.unr-lcssa
191 .lr.ph373.unr-lcssa: ; preds = %.lr.ph373.unr-lcssa.loopexit, %vector.body807.preheader
192 %lsr.iv1194.unr = phi i64 [ 0, %vector.body807.preheader ], [ %lsr.iv.next1195.7, %.lr.ph373.unr-lcssa.loopexit ]
193 %lcmp.mod.not = icmp eq i64 %xtraiter, 0
194 br i1 %lcmp.mod.not, label %.lr.ph373, label %vector.body807.epil.preheader
196 vector.body807.epil.preheader: ; preds = %.lr.ph373.unr-lcssa
197 br label %vector.body807.epil
199 vector.body807.epil: ; preds = %vector.body807.epil.preheader, %vector.body807.epil
200 %lsr.iv1194.epil = phi i64 [ %lsr.iv.next1195.epil, %vector.body807.epil ], [ %lsr.iv1194.unr, %vector.body807.epil.preheader ]
201 %epil.iter = phi i64 [ %epil.iter.next, %vector.body807.epil ], [ 0, %vector.body807.epil.preheader ]
202 %uglygep1197.epil = getelementptr i8, ptr %lsr.iv1135, i64 %lsr.iv1194.epil
203 store <8 x float> zeroinitializer, ptr %uglygep1197.epil, align 4
204 %lsr.iv.next1195.epil = add i64 %lsr.iv1194.epil, 1
205 %epil.iter.next = add i64 %epil.iter, 1
206 %epil.iter.cmp.not = icmp eq i64 %epil.iter.next, %xtraiter
207 br i1 %epil.iter.cmp.not, label %.lr.ph373.loopexit, label %vector.body807.epil
209 .lr.ph373.loopexit: ; preds = %vector.body807.epil
212 .lr.ph373: ; preds = %.lr.ph373.loopexit, %.lr.ph373.unr-lcssa
213 br i1 %min.iters.check840, label %scalar.ph839.preheader, label %vector.body847.preheader
215 vector.body847.preheader: ; preds = %.lr.ph373
216 %uglygep11551 = getelementptr i8, ptr %lsr.iv1135, i64 96
217 %xtraiter12 = and i64 %13, 7
218 %15 = icmp ult i64 %3, 7
219 br i1 %15, label %common.ret.loopexit.unr-lcssa, label %vector.body847.preheader.new
221 vector.body847.preheader.new: ; preds = %vector.body847.preheader
222 %unroll_iter15 = and i64 %13, -8
223 br label %vector.body847
225 vector.body847: ; preds = %vector.body847, %vector.body847.preheader.new
226 %lsr.iv1152 = phi i64 [ 0, %vector.body847.preheader.new ], [ %lsr.iv.next1153.7, %vector.body847 ]
227 %niter16 = phi i64 [ 0, %vector.body847.preheader.new ], [ %niter16.next.7, %vector.body847 ]
228 %uglygep1156 = getelementptr i8, ptr %uglygep11551, i64 %lsr.iv1152
229 store <8 x float> zeroinitializer, ptr %uglygep1156, align 4
230 %lsr.iv.next1153 = or disjoint i64 %lsr.iv1152, 1
231 %uglygep1156.1 = getelementptr i8, ptr %uglygep11551, i64 %lsr.iv.next1153
232 store <8 x float> zeroinitializer, ptr %uglygep1156.1, align 4
233 %lsr.iv.next1153.1 = or disjoint i64 %lsr.iv1152, 2
234 %uglygep1156.2 = getelementptr i8, ptr %uglygep11551, i64 %lsr.iv.next1153.1
235 store <8 x float> zeroinitializer, ptr %uglygep1156.2, align 4
236 %lsr.iv.next1153.2 = or disjoint i64 %lsr.iv1152, 3
237 %uglygep1156.3 = getelementptr i8, ptr %uglygep11551, i64 %lsr.iv.next1153.2
238 store <8 x float> zeroinitializer, ptr %uglygep1156.3, align 4
239 %lsr.iv.next1153.3 = or disjoint i64 %lsr.iv1152, 4
240 %uglygep1156.4 = getelementptr i8, ptr %uglygep11551, i64 %lsr.iv.next1153.3
241 store <8 x float> zeroinitializer, ptr %uglygep1156.4, align 4
242 %lsr.iv.next1153.4 = or disjoint i64 %lsr.iv1152, 5
243 %uglygep1156.5 = getelementptr i8, ptr %uglygep11551, i64 %lsr.iv.next1153.4
244 store <8 x float> zeroinitializer, ptr %uglygep1156.5, align 4
245 %lsr.iv.next1153.5 = or disjoint i64 %lsr.iv1152, 6
246 %uglygep1156.6 = getelementptr i8, ptr %uglygep11551, i64 %lsr.iv.next1153.5
247 store <8 x float> zeroinitializer, ptr %uglygep1156.6, align 4
248 %lsr.iv.next1153.6 = or disjoint i64 %lsr.iv1152, 7
249 %uglygep1156.7 = getelementptr i8, ptr %uglygep11551, i64 %lsr.iv.next1153.6
250 store <8 x float> zeroinitializer, ptr %uglygep1156.7, align 4
251 %lsr.iv.next1153.7 = add i64 %lsr.iv1152, 8
252 %niter16.next.7 = add i64 %niter16, 8
253 %niter16.ncmp.7 = icmp eq i64 %niter16.next.7, %unroll_iter15
254 br i1 %niter16.ncmp.7, label %common.ret.loopexit.unr-lcssa.loopexit, label %vector.body847
256 common.ret.loopexit.unr-lcssa.loopexit: ; preds = %vector.body847
257 br label %common.ret.loopexit.unr-lcssa
259 common.ret.loopexit.unr-lcssa: ; preds = %common.ret.loopexit.unr-lcssa.loopexit, %vector.body847.preheader
260 %lsr.iv1152.unr = phi i64 [ 0, %vector.body847.preheader ], [ %lsr.iv.next1153.7, %common.ret.loopexit.unr-lcssa.loopexit ]
261 %lcmp.mod14.not = icmp eq i64 %xtraiter12, 0
262 br i1 %lcmp.mod14.not, label %common.ret, label %vector.body847.epil.preheader
264 vector.body847.epil.preheader: ; preds = %common.ret.loopexit.unr-lcssa
265 br label %vector.body847.epil
267 vector.body847.epil: ; preds = %vector.body847.epil.preheader, %vector.body847.epil
268 %lsr.iv1152.epil = phi i64 [ %lsr.iv.next1153.epil, %vector.body847.epil ], [ %lsr.iv1152.unr, %vector.body847.epil.preheader ]
269 %epil.iter13 = phi i64 [ %epil.iter13.next, %vector.body847.epil ], [ 0, %vector.body847.epil.preheader ]
270 %uglygep1156.epil = getelementptr i8, ptr %uglygep11551, i64 %lsr.iv1152.epil
271 store <8 x float> zeroinitializer, ptr %uglygep1156.epil, align 4
272 %lsr.iv.next1153.epil = add i64 %lsr.iv1152.epil, 1
273 %epil.iter13.next = add i64 %epil.iter13, 1
274 %epil.iter13.cmp.not = icmp eq i64 %epil.iter13.next, %xtraiter12
275 br i1 %epil.iter13.cmp.not, label %common.ret.loopexit, label %vector.body847.epil
277 common.ret.loopexit: ; preds = %vector.body847.epil
280 common.ret.loopexit1: ; preds = %16
283 common.ret: ; preds = %common.ret.loopexit1, %common.ret.loopexit, %scalar.ph839.preheader, %common.ret.loopexit.unr-lcssa
286 scalar.ph839.preheader: ; preds = %.lr.ph373
287 store float 0.000000e+00, ptr %0, align 4
291 %indvars.iv.next488 = add i64 %indvars.iv487, 1
292 %exitcond492.not = icmp eq i64 %indvars.iv.next488, %3
293 %uglygep1136 = getelementptr i8, ptr %lsr.iv1135, i64 %4
294 br i1 %exitcond492.not, label %common.ret.loopexit1, label %5