1 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
3 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
5 ;CHECK-LABEL: @reduction_sum(
9 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
11 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
13 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
; Integer add reduction whose accumulator %sum.02 starts at 0. The loop is
; guarded by n > 0 and exits when the truncated next index equals %n.
; NOTE(review): %9, the value fed back into %sum.02 and into the exit phi, is
; defined on lines that are not visible in this excerpt.
15 define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
16 %1 = icmp sgt i32 %n, 0
17 br i1 %1, label %.lr.ph, label %._crit_edge
19 .lr.ph: ; preds = %0, %.lr.ph
20 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
21 %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
; Load A[i] and B[i] for the current index.
22 %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
23 %3 = load i32, i32* %2, align 4
24 %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
25 %5 = load i32, i32* %4, align 4
; The truncated index itself is added to the running value.
26 %6 = trunc i64 %indvars.iv to i32
27 %7 = add i32 %sum.02, %6
30 %indvars.iv.next = add i64 %indvars.iv, 1
31 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
32 %exitcond = icmp eq i32 %lftr.wideiv, %n
33 br i1 %exitcond, label %._crit_edge, label %.lr.ph
35 ._crit_edge: ; preds = %.lr.ph, %0
; Result is 0 when the loop is skipped, otherwise the last reduced value.
36 %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
40 ;CHECK-LABEL: @reduction_prod(
42 ;CHECK: load <4 x i32>
44 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
46 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
48 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
; Integer multiply reduction whose accumulator %prod.02 starts at 1 (the
; multiplicative identity). The loop is guarded by n > 0.
; NOTE(review): %9, the value fed back into %prod.02 and into the exit phi, is
; defined on lines that are not visible in this excerpt.
50 define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
51 %1 = icmp sgt i32 %n, 0
52 br i1 %1, label %.lr.ph, label %._crit_edge
54 .lr.ph: ; preds = %0, %.lr.ph
55 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
56 %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ]
; Load A[i] and B[i] for the current index.
57 %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
58 %3 = load i32, i32* %2, align 4
59 %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
60 %5 = load i32, i32* %4, align 4
; The truncated index is multiplied into the running product.
61 %6 = trunc i64 %indvars.iv to i32
62 %7 = mul i32 %prod.02, %6
65 %indvars.iv.next = add i64 %indvars.iv, 1
66 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
67 %exitcond = icmp eq i32 %lftr.wideiv, %n
68 br i1 %exitcond, label %._crit_edge, label %.lr.ph
70 ._crit_edge: ; preds = %.lr.ph, %0
; Result is 1 when the loop is skipped, otherwise the last reduced value.
71 %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]
75 ;CHECK-LABEL: @reduction_mix(
77 ;CHECK: load <4 x i32>
78 ;CHECK: mul nsw <4 x i32>
79 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
81 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
83 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
; Add reduction (accumulator %sum.02, initial value 0) whose loop body also
; computes the product B[i] * A[i].
; NOTE(review): %9, the value fed back into %sum.02 and into the exit phi, is
; defined on a line that is not visible in this excerpt.
85 define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
86 %1 = icmp sgt i32 %n, 0
87 br i1 %1, label %.lr.ph, label %._crit_edge
89 .lr.ph: ; preds = %0, %.lr.ph
90 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
91 %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
92 %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
93 %3 = load i32, i32* %2, align 4
94 %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
95 %5 = load i32, i32* %4, align 4
; Element-wise product of the two loaded values.
96 %6 = mul nsw i32 %5, %3
; The truncated index is added to the running sum.
97 %7 = trunc i64 %indvars.iv to i32
98 %8 = add i32 %sum.02, %7
100 %indvars.iv.next = add i64 %indvars.iv, 1
101 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
102 %exitcond = icmp eq i32 %lftr.wideiv, %n
103 br i1 %exitcond, label %._crit_edge, label %.lr.ph
105 ._crit_edge: ; preds = %.lr.ph, %0
106 %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
110 ;CHECK-LABEL: @reduction_mul(
111 ;CHECK: mul <4 x i32>
112 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
113 ;CHECK: mul <4 x i32>
114 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
115 ;CHECK: mul <4 x i32>
116 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
; Multiply reduction: %9 = %8 * %sum.02 is fed back into %sum.02, which starts
; at the non-identity value 19.
; NOTE(review): %8 is defined on lines that are not visible in this excerpt.
118 define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
119 %1 = icmp sgt i32 %n, 0
120 br i1 %1, label %.lr.ph, label %._crit_edge
122 .lr.ph: ; preds = %0, %.lr.ph
123 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
124 %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ]
125 %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
126 %3 = load i32, i32* %2, align 4
127 %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
128 %5 = load i32, i32* %4, align 4
129 %6 = trunc i64 %indvars.iv to i32
; Reduction step: multiply the per-iteration value into the accumulator.
132 %9 = mul i32 %8, %sum.02
133 %indvars.iv.next = add i64 %indvars.iv, 1
134 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
135 %exitcond = icmp eq i32 %lftr.wideiv, %n
136 br i1 %exitcond, label %._crit_edge, label %.lr.ph
138 ._crit_edge: ; preds = %.lr.ph, %0
; Note that the skipped-loop value here is 0, not the accumulator's start of 19.
139 %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
143 ;CHECK-LABEL: @start_at_non_zero(
144 ;CHECK: phi <4 x i32>
145 ;CHECK: <i32 120, i32 0, i32 0, i32 0>
146 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
147 ;CHECK: add <4 x i32>
148 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
149 ;CHECK: add <4 x i32>
150 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
; Dot-product style add reduction: sum += coeff[i] * in[i], with the
; accumulator %sum.09 starting at 120 rather than 0. The loop is guarded by
; n > 0 and exits when the truncated next index equals %n.
152 define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
154 %cmp7 = icmp sgt i32 %n, 0
155 br i1 %cmp7, label %for.body, label %for.end
157 for.body: ; preds = %entry, %for.body
158 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
159 %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
160 %arrayidx = getelementptr inbounds i32, i32* %in, i64 %indvars.iv
161 %0 = load i32, i32* %arrayidx, align 4
162 %arrayidx2 = getelementptr inbounds i32, i32* %coeff, i64 %indvars.iv
163 %1 = load i32, i32* %arrayidx2, align 4
; Reduction step: accumulate the product of the two loaded elements.
164 %mul = mul nsw i32 %1, %0
165 %add = add nsw i32 %mul, %sum.09
166 %indvars.iv.next = add i64 %indvars.iv, 1
167 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
168 %exitcond = icmp eq i32 %lftr.wideiv, %n
169 br i1 %exitcond, label %for.end, label %for.body
171 for.end: ; preds = %for.body, %entry
; Result is the start value 120 when the loop is skipped.
172 %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
176 ;CHECK-LABEL: @reduction_and(
177 ;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
178 ;CHECK: and <4 x i32>
179 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
180 ;CHECK: and <4 x i32>
181 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
182 ;CHECK: and <4 x i32>
183 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
; Bitwise AND reduction: result &= (B[i] + A[i]), with the accumulator
; %result.08 starting at -1 (all bits set). Returns -1 when n <= 0.
185 define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
187 %cmp7 = icmp sgt i32 %n, 0
188 br i1 %cmp7, label %for.body, label %for.end
190 for.body: ; preds = %entry, %for.body
191 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
192 %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
193 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
194 %0 = load i32, i32* %arrayidx, align 4
195 %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
196 %1 = load i32, i32* %arrayidx2, align 4
; Reduction step: AND the element sum into the accumulator.
197 %add = add nsw i32 %1, %0
198 %and = and i32 %add, %result.08
199 %indvars.iv.next = add i64 %indvars.iv, 1
200 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
201 %exitcond = icmp eq i32 %lftr.wideiv, %n
202 br i1 %exitcond, label %for.end, label %for.body
204 for.end: ; preds = %for.body, %entry
205 %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ]
206 ret i32 %result.0.lcssa
209 ;CHECK-LABEL: @reduction_or(
211 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
213 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
215 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
; Bitwise OR reduction: result |= (B[i] + A[i]), with the accumulator
; %result.08 starting at 0. Returns 0 when n <= 0.
217 define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
219 %cmp7 = icmp sgt i32 %n, 0
220 br i1 %cmp7, label %for.body, label %for.end
222 for.body: ; preds = %entry, %for.body
223 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
224 %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
225 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
226 %0 = load i32, i32* %arrayidx, align 4
227 %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
228 %1 = load i32, i32* %arrayidx2, align 4
; Reduction step: OR the element sum into the accumulator.
229 %add = add nsw i32 %1, %0
230 %or = or i32 %add, %result.08
231 %indvars.iv.next = add i64 %indvars.iv, 1
232 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
233 %exitcond = icmp eq i32 %lftr.wideiv, %n
234 br i1 %exitcond, label %for.end, label %for.body
236 for.end: ; preds = %for.body, %entry
237 %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ]
238 ret i32 %result.0.lcssa
241 ;CHECK-LABEL: @reduction_xor(
242 ;CHECK: xor <4 x i32>
243 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
244 ;CHECK: xor <4 x i32>
245 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
246 ;CHECK: xor <4 x i32>
247 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
; Bitwise XOR reduction: result ^= (B[i] + A[i]), with the accumulator
; %result.08 starting at 0. Returns 0 when n <= 0.
249 define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
251 %cmp7 = icmp sgt i32 %n, 0
252 br i1 %cmp7, label %for.body, label %for.end
254 for.body: ; preds = %entry, %for.body
255 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
256 %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
257 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
258 %0 = load i32, i32* %arrayidx, align 4
259 %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
260 %1 = load i32, i32* %arrayidx2, align 4
; Reduction step: XOR the element sum into the accumulator.
261 %add = add nsw i32 %1, %0
262 %xor = xor i32 %add, %result.08
263 %indvars.iv.next = add i64 %indvars.iv, 1
264 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
265 %exitcond = icmp eq i32 %lftr.wideiv, %n
266 br i1 %exitcond, label %for.end, label %for.body
268 for.end: ; preds = %for.body, %entry
269 %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
270 ret i32 %result.0.lcssa
273 ; In this code the reduction variable is the subtracted operand (on the RHS), so this is not a reduction we can vectorize.
274 ;CHECK-LABEL: @reduction_sub_rhs(
275 ;CHECK-NOT: phi <4 x i32>
276 ;CHECK-NOT: sub nsw <4 x i32>
; Loop computing x = A[i] - x, i.e. the loop-carried value %x.05 is the
; right-hand (subtracted) operand of the sub. %x.05 starts at 0.
278 define i32 @reduction_sub_rhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
280 %cmp4 = icmp sgt i32 %n, 0
281 br i1 %cmp4, label %for.body, label %for.end
283 for.body: ; preds = %entry, %for.body
284 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
285 %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
286 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
287 %0 = load i32, i32* %arrayidx, align 4
; The loaded element is the minuend; the carried value is subtracted from it.
288 %sub = sub nsw i32 %0, %x.05
289 %indvars.iv.next = add i64 %indvars.iv, 1
290 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
291 %exitcond = icmp eq i32 %lftr.wideiv, %n
292 br i1 %exitcond, label %for.end, label %for.body
294 for.end: ; preds = %for.body, %entry
295 %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
300 ; In this test the reduction variable is on the LHS and we can vectorize it.
301 ;CHECK-LABEL: @reduction_sub_lhs(
302 ;CHECK: phi <4 x i32>
303 ;CHECK: sub nsw <4 x i32>
; Loop computing x = x - A[i], i.e. the loop-carried value %x.05 is the
; left-hand operand of the sub. %x.05 starts at 0.
305 define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
307 %cmp4 = icmp sgt i32 %n, 0
308 br i1 %cmp4, label %for.body, label %for.end
310 for.body: ; preds = %entry, %for.body
311 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
312 %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
313 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
314 %0 = load i32, i32* %arrayidx, align 4
; The loaded element is subtracted from the carried value.
315 %sub = sub nsw i32 %x.05, %0
316 %indvars.iv.next = add i64 %indvars.iv, 1
317 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
318 %exitcond = icmp eq i32 %lftr.wideiv, %n
319 br i1 %exitcond, label %for.end, label %for.body
321 for.end: ; preds = %for.body, %entry
322 %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
326 ; We can vectorize conditional reductions with multi-input phis.
327 ; CHECK: reduction_conditional
328 ; CHECK: fadd fast <4 x float>
; Conditional fast-math float add reduction. The accumulator %sum.033 starts
; at %S; depending on comparisons of A[i] and B[i], either A[i] or B[i] is
; added, or the accumulator is passed through unchanged. The loop runs until
; the truncated next index reaches 128.
; NOTE(review): the basic-block labels are not visible in this excerpt; block
; names below are taken from the branch targets and phi operands.
330 define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
335 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
336 %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
337 %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
338 %0 = load float, float* %arrayidx, align 4
339 %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
340 %1 = load float, float* %arrayidx2, align 4
; Only consider updating the sum when A[i] > B[i].
341 %cmp3 = fcmp ogt float %0, %1
342 br i1 %cmp3, label %if.then, label %for.inc
345 %cmp6 = fcmp ogt float %1, 1.000000e+00
346 br i1 %cmp6, label %if.then8, label %if.else
; B[i] > 1.0: add A[i].
349 %add = fadd fast float %sum.033, %0
353 %cmp14 = fcmp ogt float %0, 2.000000e+00
354 br i1 %cmp14, label %if.then16, label %for.inc
; A[i] > 2.0: add B[i].
357 %add19 = fadd fast float %sum.033, %1
; Merge: every incoming value is either an updated sum or the unchanged sum.
361 %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ]
362 %indvars.iv.next = add i64 %indvars.iv, 1
363 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
364 %exitcond = icmp ne i32 %lftr.wideiv, 128
365 br i1 %exitcond, label %for.body, label %for.end
368 %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
369 ret float %sum.1.lcssa
372 ; We can't vectorize reductions with phi inputs from outside the reduction.
373 ; CHECK: noreduction_phi
374 ; CHECK-NOT: fadd <4 x float>
; Same shape as a conditional float add reduction, except that the merge phi
; %sum.1 receives the constant 0.0 on the %if.else edge instead of the running
; sum, so one incoming value comes from outside the reduction chain.
; NOTE(review): the basic-block labels are not visible in this excerpt; block
; names below are taken from the branch targets and phi operands.
375 define float @noreduction_phi(float* %A, float* %B, float* %C, float %S) {
380 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
381 %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
382 %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
383 %0 = load float, float* %arrayidx, align 4
384 %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
385 %1 = load float, float* %arrayidx2, align 4
386 %cmp3 = fcmp ogt float %0, %1
387 br i1 %cmp3, label %if.then, label %for.inc
390 %cmp6 = fcmp ogt float %1, 1.000000e+00
391 br i1 %cmp6, label %if.then8, label %if.else
394 %add = fadd fast float %sum.033, %0
398 %cmp14 = fcmp ogt float %0, 2.000000e+00
399 br i1 %cmp14, label %if.then16, label %for.inc
402 %add19 = fadd fast float %sum.033, %1
; The 0.0 incoming value on the %if.else edge discards the running sum.
406 %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ 0.000000e+00, %if.else ], [ %sum.033, %for.body ]
407 %indvars.iv.next = add i64 %indvars.iv, 1
408 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
409 %exitcond = icmp ne i32 %lftr.wideiv, 128
410 br i1 %exitcond, label %for.body, label %for.end
413 %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
414 ret float %sum.1.lcssa
417 ; We can't vectorize reductions that feed another header PHI.
418 ; CHECK: noredux_header_phi
419 ; CHECK-NOT: fadd <4 x float>
; Two loop-carried float sums: %sum.08 accumulates B[i] (starting at %S), and
; %sum2.09 (starting at 0.0) accumulates each updated value of the first sum,
; so the first reduction's result feeds the second header phi's update.
; After the loop both final values are added together.
; NOTE(review): block labels and the return are not visible in this excerpt.
421 define float @noredux_header_phi(float* %A, float* %B, float* %C, float %S) {
426 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
427 %sum2.09 = phi float [ 0.000000e+00, %entry ], [ %add1, %for.body ]
428 %sum.08 = phi float [ %S, %entry ], [ %add, %for.body ]
429 %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
430 %0 = load float, float* %arrayidx, align 4
; First sum: += B[i].
431 %add = fadd fast float %sum.08, %0
; Second sum: += the just-updated first sum.
432 %add1 = fadd fast float %sum2.09, %add
433 %indvars.iv.next = add i64 %indvars.iv, 1
434 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
435 %exitcond = icmp ne i32 %lftr.wideiv, 128
436 br i1 %exitcond, label %for.body, label %for.end
439 %add1.lcssa = phi float [ %add1, %for.body ]
440 %add.lcssa = phi float [ %add, %for.body ]
441 %add2 = fadd fast float %add.lcssa, %add1.lcssa
446 ; When vectorizing a reduction whose loop header phi value is used outside the
447 ; loop special care must be taken. Otherwise, the reduced value feeding into the
448 ; outside user misses a few iterations (VF-1) of the loop.
451 ; CHECK-LABEL: @phivalueredux(
; Loop that runs until the counter %inc reaches 16; each iteration XORs the
; carried value %p.addr.02 (initially %p) with -1.
; NOTE(review): block labels and the code after the loop (including which
; value is returned) are not visible in this excerpt.
454 define i32 @phivalueredux(i32 %p) {
459 %t.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
460 %p.addr.02 = phi i32 [ %p, %entry ], [ %xor, %for.body ]
461 %xor = xor i32 %p.addr.02, -1
462 %inc = add nsw i32 %t.03, 1
463 %exitcond = icmp eq i32 %inc, 16
464 br i1 %exitcond, label %for.end, label %for.body
470 ; Don't vectorize a reduction value that is not the last in a reduction cycle. We
471 ; would lose iterations (VF-1) on the operations after that use.
474 ; CHECK-LABEL: not_last_operation
; Loop running until the counter %inc6.1 reaches 22. The carried value
; %inc511.1 (initially %val) is updated in two add steps per iteration; the
; intermediate result %inc511.1.inc4.1 (not the final %inc5.1) is what the
; code after the loop consumes.
; NOTE(review): block labels and the return are not visible in this excerpt.
476 define i32 @not_last_operation(i32 %p, i32 %val) {
478 %tobool = icmp eq i32 %p, 0
482 %inc613.1 = phi i32 [ 0, %entry ], [ %inc6.1, %for.body ]
483 %inc511.1 = phi i32 [ %val, %entry ], [ %inc5.1, %for.body ]
; %inc4.1 is the zero-extended (p == 0) flag XORed with 1.
484 %0 = zext i1 %tobool to i32
485 %inc4.1 = xor i32 %0, 1
; First add step (intermediate value, used after the loop).
486 %inc511.1.inc4.1 = add nsw i32 %inc511.1, %inc4.1
; Second add step (value fed back into the header phi).
487 %inc5.1 = add nsw i32 %inc511.1.inc4.1, 1
488 %inc6.1 = add nsw i32 %inc613.1, 1
489 %exitcond.1 = icmp eq i32 %inc6.1, 22
490 br i1 %exitcond.1, label %exit, label %for.body
493 %inc.2 = add nsw i32 %inc511.1.inc4.1, 2
497 ;CHECK-LABEL: @reduction_sum_multiuse(
498 ;CHECK: phi <4 x i32>
499 ;CHECK: load <4 x i32>
500 ;CHECK: add <4 x i32>
501 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
502 ;CHECK: add <4 x i32>
503 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
504 ;CHECK: add <4 x i32>
505 ;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
506 ;CHECK: %sum.lcssa = phi i32 [ %[[SCALAR:.*]], %.lr.ph ], [ %[[VECTOR:.*]], %middle.block ]
507 ;CHECK: %sum.copy = phi i32 [ %[[SCALAR]], %.lr.ph ], [ %[[VECTOR]], %middle.block ]
; Add reduction (accumulator %sum.02, initial value 0) whose final value %9 is
; consumed by two separate exit phis, %sum.lcssa and %sum.copy; their merged
; values are then added together in %final.
; NOTE(review): %9, the preheader's terminator, and the label of the block
; holding %f1/%f2 are on lines that are not visible in this excerpt.
509 define i32 @reduction_sum_multiuse(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) {
510 %1 = icmp sgt i32 %n, 0
511 br i1 %1, label %.lr.ph.preheader, label %end
512 .lr.ph.preheader: ; preds = %0
515 .lr.ph: ; preds = %0, %.lr.ph
516 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
517 %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %.lr.ph.preheader ]
518 %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
519 %3 = load i32, i32* %2, align 4
520 %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
521 %5 = load i32, i32* %4, align 4
522 %6 = trunc i64 %indvars.iv to i32
523 %7 = add i32 %sum.02, %6
526 %indvars.iv.next = add i64 %indvars.iv, 1
527 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
528 %exitcond = icmp eq i32 %lftr.wideiv, %n
529 br i1 %exitcond, label %._crit_edge, label %.lr.ph
531 ._crit_edge: ; preds = %.lr.ph, %0
; Two independent exit phis of the same reduced value.
532 %sum.lcssa = phi i32 [ %9, %.lr.ph ]
533 %sum.copy = phi i32 [ %9, %.lr.ph ]
; Both merged values are 0 when the loop was skipped.
537 %f1 = phi i32 [ 0, %0 ], [ %sum.lcssa, %._crit_edge ]
538 %f2 = phi i32 [ 0, %0 ], [ %sum.copy, %._crit_edge ]
539 %final = add i32 %f1, %f2
543 ; This looks like a predicated reduction, but it is a reset of the reduction
544 ; variable. We cannot vectorize this.
545 ; CHECK-LABEL: reduction_reset(
546 ; CHECK-NOT: <4 x i32>
; Loop over arrayA in which the carried value %.017 (initially 100) becomes
; arrayA[i] + %.017 when arrayA[i] > 0 and is reset to 0 otherwise. The final
; value is stored into arrayB at an index computed after the loop.
; NOTE(review): %c5 (sign-extended in the loop-exit block) and the entry and
; preheader terminators are on lines that are not visible in this excerpt.
547 define void @reduction_reset(i32 %N, i32* nocapture readonly %arrayA, i32* nocapture %arrayB) {
549 %c4 = icmp sgt i32 %N, 0
550 br i1 %c4, label %.lr.ph.preheader, label %._crit_edge
552 .lr.ph.preheader: ; preds = %entry
554 %wide.trip.count = zext i32 %N to i64
557 .lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
558 %indvars.iv = phi i64 [ 0, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ]
559 %.017 = phi i32 [ 100, %.lr.ph.preheader ], [ %csel, %.lr.ph ]
560 %c6 = getelementptr inbounds i32, i32* %arrayA, i64 %indvars.iv
561 %c7 = load i32, i32* %c6, align 4
562 %c8 = icmp sgt i32 %c7, 0
563 %c9 = add nsw i32 %c7, %.017
; The select's false arm is the constant 0, not the previous value: a reset.
564 %csel = select i1 %c8, i32 %c9, i32 0
565 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
566 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
567 br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
569 ._crit_edge.loopexit: ; preds = %.lr.ph
570 %csel.lcssa = phi i32 [ %csel, %.lr.ph ]
571 %phitmp19 = sext i32 %c5 to i64
572 br label %._crit_edge
574 ._crit_edge: ; preds = %._crit_edge.loopexit, %entry
; When the loop is skipped the store index is -1 and the stored value is 100.
575 %.015.lcssa = phi i64 [ -1, %entry ], [ %phitmp19, %._crit_edge.loopexit ]
576 %.0.lcssa = phi i32 [ 100, %entry ], [ %csel.lcssa, %._crit_edge.loopexit ]
577 %c10 = getelementptr inbounds i32, i32* %arrayB, i64 %.015.lcssa
578 store i32 %.0.lcssa, i32* %c10, align 4