1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
3 ; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
7 ; int foo(float *A, int n) {
9 ; for (intptr_t i=0; i < n; ++i) {
; @add_red: scalar input for a float add reduction carried around a loop.
; Each iteration reads A[4*i] .. A[4*i+3], multiplies every element by 7.0,
; sums the four products with `fadd fast`, and adds that sum to the running
; total %sum.032. The assertions below expect a single <4 x float> load, one
; vector fmul by a splat of 7.0, and a call to llvm.vector.reduce.fadd.v4f32
; whose result is added to the running total.
18 define i32 @add_red(ptr %A, i32 %n) {
19 ; CHECK-LABEL: @add_red(
21 ; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
22 ; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
23 ; CHECK: for.body.lr.ph:
24 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
25 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
27 ; CHECK-NEXT: [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
28 ; CHECK-NEXT: [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
29 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
30 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
31 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
32 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x float> [[TMP1]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
33 ; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
34 ; CHECK-NEXT: [[ADD17]] = fadd fast float [[SUM_032]], [[TMP3]]
35 ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_033]], 1
36 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
37 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
38 ; CHECK: for.cond.for.end_crit_edge:
39 ; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
40 ; CHECK-NEXT: br label [[FOR_END]]
42 ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
43 ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
; The loop is entered only when n > 0; otherwise control goes to %for.end.
46 %cmp31 = icmp sgt i32 %n, 0
47 br i1 %cmp31, label %for.body.lr.ph, label %for.end
; Trip count widened to i64 for the exit comparison.
50 %0 = sext i32 %n to i64
; Loop-carried values: the induction variable and the running float sum.
54 %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
55 %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
; %mul = 4*i is the base index; the other three indices are formed with
; `or disjoint` of 1, 2 and 3.
56 %mul = shl nsw i64 %i.033, 2
57 %arrayidx = getelementptr inbounds float, ptr %A, i64 %mul
58 %1 = load float, ptr %arrayidx, align 4
59 %mul2 = fmul float %1, 7.000000e+00
60 %add28 = or disjoint i64 %mul, 1
61 %arrayidx4 = getelementptr inbounds float, ptr %A, i64 %add28
62 %2 = load float, ptr %arrayidx4, align 4
63 %mul5 = fmul float %2, 7.000000e+00
64 %add6 = fadd fast float %mul2, %mul5
65 %add829 = or disjoint i64 %mul, 2
66 %arrayidx9 = getelementptr inbounds float, ptr %A, i64 %add829
67 %3 = load float, ptr %arrayidx9, align 4
68 %mul10 = fmul float %3, 7.000000e+00
69 %add11 = fadd fast float %add6, %mul10
70 %add1330 = or disjoint i64 %mul, 3
71 %arrayidx14 = getelementptr inbounds float, ptr %A, i64 %add1330
72 %4 = load float, ptr %arrayidx14, align 4
73 %mul15 = fmul float %4, 7.000000e+00
74 %add16 = fadd fast float %add11, %mul15
; Fold this iteration's four-element sum into the running total.
75 %add17 = fadd fast float %sum.032, %add16
76 %inc = add nsw i64 %i.033, 1
77 %exitcond = icmp eq i64 %inc, %0
78 br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
80 for.cond.for.end_crit_edge:
; The final float sum is converted to a signed i32 on loop exit.
81 %phitmp = fptosi float %add17 to i32
; Result is the converted sum when the loop ran, or 0 when it was skipped.
85 %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
89 ; int foo(float * restrict A, float * restrict B, int n) {
91 ; for (intptr_t i=0; i < n; ++i) {
92 ; sum *= B[0]*A[i*4 ] +
; @mul_red: the per-iteration value is a four-term dot product of B[0..3]
; with A[4*i .. 4*i+3], but the loop-carried update is a multiply
; (%mul21 = %sum.039 * dot), not an add. The element products use plain
; `fmul`, the additions use `fadd fast`, and the final multiply has no
; fast-math flags. The assertions below expect <4 x float> loads of B and A,
; a vector fmul, llvm.vector.reduce.fadd.v4f32 for the inner sum, and a
; scalar fmul for the loop-carried update.
100 define i32 @mul_red(ptr noalias %A, ptr noalias %B, i32 %n) {
101 ; CHECK-LABEL: @mul_red(
103 ; CHECK-NEXT: [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
104 ; CHECK-NEXT: br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
105 ; CHECK: for.body.lr.ph:
106 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
107 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[N]] to i64
108 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
110 ; CHECK-NEXT: [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
111 ; CHECK-NEXT: [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
112 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
113 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
114 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 4
115 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP0]], [[TMP2]]
116 ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
117 ; CHECK-NEXT: [[MUL21]] = fmul float [[SUM_039]], [[TMP4]]
118 ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_040]], 1
119 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP1]]
120 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
121 ; CHECK: for.cond.for.end_crit_edge:
122 ; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
123 ; CHECK-NEXT: br label [[FOR_END]]
125 ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
126 ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
; The loop is entered only when n > 0; otherwise control goes to %for.end.
129 %cmp38 = icmp sgt i32 %n, 0
130 br i1 %cmp38, label %for.body.lr.ph, label %for.end
; B[0] .. B[3] are loaded once, ahead of the loop-carried phis.
133 %0 = load float, ptr %B, align 4
134 %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
135 %1 = load float, ptr %arrayidx4, align 4
136 %arrayidx9 = getelementptr inbounds float, ptr %B, i64 2
137 %2 = load float, ptr %arrayidx9, align 4
138 %arrayidx15 = getelementptr inbounds float, ptr %B, i64 3
139 %3 = load float, ptr %arrayidx15, align 4
140 %4 = sext i32 %n to i64
; Loop-carried values: the induction variable and the running product,
; which starts at 0.0.
144 %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
145 %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
; %mul = 4*i is the base index into A; offsets 1..3 use `or disjoint`.
146 %mul = shl nsw i64 %i.040, 2
147 %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
148 %5 = load float, ptr %arrayidx2, align 4
149 %mul3 = fmul float %0, %5
150 %add35 = or disjoint i64 %mul, 1
151 %arrayidx6 = getelementptr inbounds float, ptr %A, i64 %add35
152 %6 = load float, ptr %arrayidx6, align 4
153 %mul7 = fmul float %1, %6
154 %add8 = fadd fast float %mul3, %mul7
155 %add1136 = or disjoint i64 %mul, 2
156 %arrayidx12 = getelementptr inbounds float, ptr %A, i64 %add1136
157 %7 = load float, ptr %arrayidx12, align 4
158 %mul13 = fmul float %2, %7
159 %add14 = fadd fast float %add8, %mul13
160 %add1737 = or disjoint i64 %mul, 3
161 %arrayidx18 = getelementptr inbounds float, ptr %A, i64 %add1737
162 %8 = load float, ptr %arrayidx18, align 4
163 %mul19 = fmul float %3, %8
164 %add20 = fadd fast float %add14, %mul19
; Loop-carried update: multiply (without fast-math flags) by the dot product.
165 %mul21 = fmul float %sum.039, %add20
166 %inc = add nsw i64 %i.040, 1
167 %exitcond = icmp eq i64 %inc, %4
168 br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
170 for.cond.for.end_crit_edge:
; The final float value is converted to a signed i32 on loop exit.
171 %phitmp = fptosi float %mul21 to i32
; Result is the converted value when the loop ran, or 0 when it was skipped.
175 %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
179 ; int foo(float * restrict A, float * restrict B, int n) {
181 ; for (intptr_t i=0; i < n; ++i) {
182 ; sum += B[0]*A[i*6 ] +
; @long_red: nine-term dot product per iteration. B[0] .. B[8] are loaded
; before the loop; inside it A is read at 6*i + 0 .. 6*i + 8, each pair is
; multiplied with `fmul fast`, and the products are summed with a chain of
; `fadd fast` before being added to the running total %sum.082.
; The assertions below expect the first eight terms to become <8 x float>
; loads, a vector fmul and llvm.vector.reduce.fadd.v8f32, while the ninth
; term (B[8] * A[6*i+8]) stays scalar and is added to the reduction result.
195 define i32 @long_red(ptr noalias %A, ptr noalias %B, i32 %n) {
196 ; CHECK-LABEL: @long_red(
198 ; CHECK-NEXT: [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
199 ; CHECK-NEXT: br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
200 ; CHECK: for.body.lr.ph:
201 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[B:%.*]], align 4
202 ; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, ptr [[B]], i64 8
203 ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX45]], align 4
204 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
205 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
207 ; CHECK-NEXT: [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
208 ; CHECK-NEXT: [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
209 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
210 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
211 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x float>, ptr [[ARRAYIDX2]], align 4
212 ; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <8 x float> [[TMP0]], [[TMP3]]
213 ; CHECK-NEXT: [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
214 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[ADD47]]
215 ; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
216 ; CHECK-NEXT: [[MUL49:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
217 ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP4]])
218 ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP6]], [[MUL49]]
219 ; CHECK-NEXT: [[ADD51]] = fadd fast float [[SUM_082]], [[OP_RDX]]
220 ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_083]], 1
221 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
222 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
223 ; CHECK: for.cond.for.end_crit_edge:
224 ; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
225 ; CHECK-NEXT: br label [[FOR_END]]
227 ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
228 ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
; The loop is entered only when n > 0; otherwise control goes to %for.end.
231 %cmp81 = icmp sgt i32 %n, 0
232 br i1 %cmp81, label %for.body.lr.ph, label %for.end
; Nine scalar loads of B[0] .. B[8], performed once ahead of the loop phis.
235 %0 = load float, ptr %B, align 4
236 %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
237 %1 = load float, ptr %arrayidx4, align 4
238 %arrayidx9 = getelementptr inbounds float, ptr %B, i64 2
239 %2 = load float, ptr %arrayidx9, align 4
240 %arrayidx15 = getelementptr inbounds float, ptr %B, i64 3
241 %3 = load float, ptr %arrayidx15, align 4
242 %arrayidx21 = getelementptr inbounds float, ptr %B, i64 4
243 %4 = load float, ptr %arrayidx21, align 4
244 %arrayidx27 = getelementptr inbounds float, ptr %B, i64 5
245 %5 = load float, ptr %arrayidx27, align 4
246 %arrayidx33 = getelementptr inbounds float, ptr %B, i64 6
247 %6 = load float, ptr %arrayidx33, align 4
248 %arrayidx39 = getelementptr inbounds float, ptr %B, i64 7
249 %7 = load float, ptr %arrayidx39, align 4
250 %arrayidx45 = getelementptr inbounds float, ptr %B, i64 8
251 %8 = load float, ptr %arrayidx45, align 4
252 %9 = sext i32 %n to i64
; Loop-carried values: the induction variable and the running float sum.
256 %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
257 %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
; %mul = 6*i is the base index into A. Offset 1 uses `or disjoint`;
; offsets 2..8 use `add nsw`.
258 %mul = mul nsw i64 %i.083, 6
259 %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
260 %10 = load float, ptr %arrayidx2, align 4
261 %mul3 = fmul fast float %0, %10
262 %add80 = or disjoint i64 %mul, 1
263 %arrayidx6 = getelementptr inbounds float, ptr %A, i64 %add80
264 %11 = load float, ptr %arrayidx6, align 4
265 %mul7 = fmul fast float %1, %11
266 %add8 = fadd fast float %mul3, %mul7
267 %add11 = add nsw i64 %mul, 2
268 %arrayidx12 = getelementptr inbounds float, ptr %A, i64 %add11
269 %12 = load float, ptr %arrayidx12, align 4
270 %mul13 = fmul fast float %2, %12
271 %add14 = fadd fast float %add8, %mul13
272 %add17 = add nsw i64 %mul, 3
273 %arrayidx18 = getelementptr inbounds float, ptr %A, i64 %add17
274 %13 = load float, ptr %arrayidx18, align 4
275 %mul19 = fmul fast float %3, %13
276 %add20 = fadd fast float %add14, %mul19
277 %add23 = add nsw i64 %mul, 4
278 %arrayidx24 = getelementptr inbounds float, ptr %A, i64 %add23
279 %14 = load float, ptr %arrayidx24, align 4
280 %mul25 = fmul fast float %4, %14
281 %add26 = fadd fast float %add20, %mul25
282 %add29 = add nsw i64 %mul, 5
283 %arrayidx30 = getelementptr inbounds float, ptr %A, i64 %add29
284 %15 = load float, ptr %arrayidx30, align 4
285 %mul31 = fmul fast float %5, %15
286 %add32 = fadd fast float %add26, %mul31
287 %add35 = add nsw i64 %mul, 6
288 %arrayidx36 = getelementptr inbounds float, ptr %A, i64 %add35
289 %16 = load float, ptr %arrayidx36, align 4
290 %mul37 = fmul fast float %6, %16
291 %add38 = fadd fast float %add32, %mul37
292 %add41 = add nsw i64 %mul, 7
293 %arrayidx42 = getelementptr inbounds float, ptr %A, i64 %add41
294 %17 = load float, ptr %arrayidx42, align 4
295 %mul43 = fmul fast float %7, %17
296 %add44 = fadd fast float %add38, %mul43
; Ninth term: the one the assertions above expect to remain scalar.
297 %add47 = add nsw i64 %mul, 8
298 %arrayidx48 = getelementptr inbounds float, ptr %A, i64 %add47
299 %18 = load float, ptr %arrayidx48, align 4
300 %mul49 = fmul fast float %8, %18
301 %add50 = fadd fast float %add44, %mul49
; Fold this iteration's nine-term sum into the running total.
302 %add51 = fadd fast float %sum.082, %add50
303 %inc = add nsw i64 %i.083, 1
304 %exitcond = icmp eq i64 %inc, %9
305 br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
307 for.cond.for.end_crit_edge:
; The final float sum is converted to a signed i32 on loop exit.
308 %phitmp = fptosi float %add51 to i32
; Result is the converted sum when the loop ran, or 0 when it was skipped.
312 %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
316 ; int foo(float * restrict A, float * restrict B, int n) {
318 ; for (intptr_t i=0; i < n; ++i) {
319 ; sum += B[0]*A[i*4 ];
320 ; sum += B[1]*A[i*4+1];
321 ; sum += B[2]*A[i*4+2];
322 ; sum += B[3]*A[i*4+3];
; @chain_red: same four-term B[k] * A[4*i+k] products as a dot product, but
; the running total %sum.042 is the first operand of the `fadd fast` chain:
; each product is added onto the previous partial sum, and the last add
; (%add21) is the loop-carried value. The assertions below expect
; <4 x float> loads of B and A, a vector `fmul fast`,
; llvm.vector.reduce.fadd.v4f32, and one scalar `fadd fast` that combines
; the reduction result with the running-total phi.
327 define i32 @chain_red(ptr noalias %A, ptr noalias %B, i32 %n) {
328 ; CHECK-LABEL: @chain_red(
330 ; CHECK-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
331 ; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
332 ; CHECK: for.body.lr.ph:
333 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
334 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[N]] to i64
335 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
337 ; CHECK-NEXT: [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
338 ; CHECK-NEXT: [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
339 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
340 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
341 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 4
342 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP0]], [[TMP2]]
343 ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
344 ; CHECK-NEXT: [[OP_RDX]] = fadd fast float [[TMP4]], [[SUM_042]]
345 ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_043]], 1
346 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP1]]
347 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
348 ; CHECK: for.cond.for.end_crit_edge:
349 ; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[OP_RDX]] to i32
350 ; CHECK-NEXT: br label [[FOR_END]]
352 ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
353 ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
; The loop is entered only when n > 0; otherwise control goes to %for.end.
356 %cmp41 = icmp sgt i32 %n, 0
357 br i1 %cmp41, label %for.body.lr.ph, label %for.end
; B[0] .. B[3] are loaded once, ahead of the loop-carried phis.
360 %0 = load float, ptr %B, align 4
361 %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
362 %1 = load float, ptr %arrayidx4, align 4
363 %arrayidx10 = getelementptr inbounds float, ptr %B, i64 2
364 %2 = load float, ptr %arrayidx10, align 4
365 %arrayidx16 = getelementptr inbounds float, ptr %B, i64 3
366 %3 = load float, ptr %arrayidx16, align 4
367 %4 = sext i32 %n to i64
; Loop-carried values: the induction variable and the running float sum.
371 %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
372 %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
373 %mul = shl nsw i64 %i.043, 2
374 %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
375 %5 = load float, ptr %arrayidx2, align 4
376 %mul3 = fmul fast float %0, %5
; The chain starts from the running sum, not from the first product.
377 %add = fadd fast float %sum.042, %mul3
378 %add638 = or disjoint i64 %mul, 1
379 %arrayidx7 = getelementptr inbounds float, ptr %A, i64 %add638
380 %6 = load float, ptr %arrayidx7, align 4
381 %mul8 = fmul fast float %1, %6
382 %add9 = fadd fast float %add, %mul8
383 %add1239 = or disjoint i64 %mul, 2
384 %arrayidx13 = getelementptr inbounds float, ptr %A, i64 %add1239
385 %7 = load float, ptr %arrayidx13, align 4
386 %mul14 = fmul fast float %2, %7
387 %add15 = fadd fast float %add9, %mul14
388 %add1840 = or disjoint i64 %mul, 3
389 %arrayidx19 = getelementptr inbounds float, ptr %A, i64 %add1840
390 %8 = load float, ptr %arrayidx19, align 4
391 %mul20 = fmul fast float %3, %8
; Last link of the chain; this value feeds the %sum.042 phi.
392 %add21 = fadd fast float %add15, %mul20
393 %inc = add nsw i64 %i.043, 1
394 %exitcond = icmp eq i64 %inc, %4
395 br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
397 for.cond.for.end_crit_edge:
; The final float sum is converted to a signed i32 on loop exit.
398 %phitmp = fptosi float %add21 to i32
; Result is the converted sum when the loop ran, or 0 when it was skipped.
402 %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
406 ; void foo(const float *arg_A, unsigned arg_B, float *array) {
407 ; for (uint32_t i = 0; i < 6; ++i) {
408 ; const float *ptr = arg_A + i;
409 ; float w0 = array[i * 4 + 0];
410 ; float w1 = array[i * 4 + 1];
411 ; float w2 = array[i * 4 + 2];
412 ; float w3 = array[i * 4 + 3];
414 ; for (unsigned j = 0; j < arg_B; ++j) {
415 ; const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
416 ; const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
417 ; const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
418 ; const float x4 = x3 + (-4.0f * w2) + w3;
425 ; array[i * 4 + 0] = w0;
426 ; array[i * 4 + 1] = w1;
427 ; array[i * 4 + 2] = w2;
428 ; array[i * 4 + 3] = w3;
; @foo: two nested loops. The outer loop (%for.body, exits when the counter
; reaches 6) loads four consecutive floats from %array at index 4*iv and
; stores four values back to the same addresses in %for.cond.cleanup15.
; The inner loop (%for.body16, runs until %inc equals %arg_B) carries four
; float phis and recomputes two of them per iteration from `fmul fast` /
; `fadd fast` expressions; the other two take over the previous values of
; their neighbours. In the assertion lines visible below, every load, fmul,
; fadd and store is scalar, i.e. no vector instruction is expected here.
432 define void @foo(ptr nocapture readonly %arg_A, i32 %arg_B, ptr nocapture %array) {
435 ; CHECK-NEXT: [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
436 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
437 ; CHECK: for.cond.cleanup:
438 ; CHECK-NEXT: ret void
440 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
441 ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
442 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[ARRAY:%.*]], i64 [[TMP0]]
443 ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
444 ; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i64 [[TMP0]], 1
445 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[ARRAY]], i64 [[TMP2]]
446 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
447 ; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[TMP0]], 2
448 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[ARRAY]], i64 [[TMP4]]
449 ; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
450 ; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP0]], 3
451 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[ARRAY]], i64 [[TMP6]]
452 ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
453 ; CHECK-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
454 ; CHECK: for.body16.lr.ph:
455 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[ARG_A:%.*]], i64 [[INDVARS_IV]]
456 ; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ADD_PTR]], align 4
457 ; CHECK-NEXT: br label [[FOR_BODY16:%.*]]
458 ; CHECK: for.cond.cleanup15:
459 ; CHECK-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
460 ; CHECK-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
461 ; CHECK-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
462 ; CHECK-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
463 ; CHECK-NEXT: store float [[W0_0_LCSSA]], ptr [[ARRAYIDX]], align 4
464 ; CHECK-NEXT: store float [[W1_0_LCSSA]], ptr [[ARRAYIDX4]], align 4
465 ; CHECK-NEXT: store float [[W2_0_LCSSA]], ptr [[ARRAYIDX8]], align 4
466 ; CHECK-NEXT: store float [[W3_0_LCSSA]], ptr [[ARRAYIDX12]], align 4
467 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
468 ; CHECK-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
469 ; CHECK-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
471 ; CHECK-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
472 ; CHECK-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
473 ; CHECK-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
474 ; CHECK-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
475 ; CHECK-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
476 ; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
477 ; CHECK-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
478 ; CHECK-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
479 ; CHECK-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
480 ; CHECK-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
481 ; CHECK-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
482 ; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
483 ; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
484 ; CHECK-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
485 ; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
486 ; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
487 ; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
488 ; CHECK-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
489 ; CHECK-NEXT: [[INC]] = add nuw i32 [[J_098]], 1
490 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
491 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
; True when %arg_B is 0; used in %for.body to bypass the inner loop.
494 %cmp1495 = icmp eq i32 %arg_B, 0
497 for.cond.cleanup: ; preds = %for.cond.cleanup15
500 for.body: ; preds = %for.cond.cleanup15, %entry
501 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
; Four consecutive floats starting at index 4*iv of %array.
502 %0 = shl i64 %indvars.iv, 2
503 %arrayidx = getelementptr inbounds float, ptr %array, i64 %0
504 %1 = load float, ptr %arrayidx, align 4
505 %2 = or disjoint i64 %0, 1
506 %arrayidx4 = getelementptr inbounds float, ptr %array, i64 %2
507 %3 = load float, ptr %arrayidx4, align 4
508 %4 = or disjoint i64 %0, 2
509 %arrayidx8 = getelementptr inbounds float, ptr %array, i64 %4
510 %5 = load float, ptr %arrayidx8, align 4
511 %6 = or disjoint i64 %0, 3
512 %arrayidx12 = getelementptr inbounds float, ptr %array, i64 %6
513 %7 = load float, ptr %arrayidx12, align 4
514 br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph
516 for.body16.lr.ph: ; preds = %for.body
; arg_A[iv] is loaded once per outer iteration and reused by the inner loop.
517 %add.ptr = getelementptr inbounds float, ptr %arg_A, i64 %indvars.iv
518 %8 = load float, ptr %add.ptr, align 4
521 for.cond.cleanup15: ; preds = %for.body16, %for.body
; Each phi selects the freshly loaded value when the inner loop was bypassed,
; or the inner loop's last value otherwise.
522 %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
523 %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
524 %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
525 %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
; Write the four values back to the addresses they were loaded from.
526 store float %w0.0.lcssa, ptr %arrayidx, align 4
527 store float %w1.0.lcssa, ptr %arrayidx4, align 4
528 store float %w2.0.lcssa, ptr %arrayidx8, align 4
529 store float %w3.0.lcssa, ptr %arrayidx12, align 4
530 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
531 %exitcond109 = icmp eq i64 %indvars.iv.next, 6
532 br i1 %exitcond109, label %for.cond.cleanup, label %for.body
534 for.body16: ; preds = %for.body16, %for.body16.lr.ph
; On the back edge %w0.0100 and %w2.096 receive the newly computed %sub19 and
; %sub28, while %w1.099 and %w3.097 receive the previous %w0.0100 and %w2.096.
535 %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
536 %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
537 %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
538 %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
539 %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
; %sub19 = (w0 * c + w1 * c') + arg_A[iv], all fast-math.
540 %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
541 %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
542 %sub92 = fadd fast float %mul17, %mul18.neg
543 %sub19 = fadd fast float %sub92, %8
; %sub28 sums five scaled terms: %sub19, w0, w1, w2 and w3.
544 %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
545 %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
546 %mul23 = fmul fast float %w1.099, 0x4002666660000000
547 %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
548 %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
549 %add2293 = fadd fast float %mul27.neg, %mul25
550 %add24 = fadd fast float %add2293, %mul23
551 %sub2694 = fadd fast float %add24, %mul21.neg
552 %sub28 = fadd fast float %sub2694, %mul20
553 %inc = add nuw i32 %j.098, 1
554 %exitcond = icmp eq i32 %inc, %arg_B
555 br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
559 ; void foo(double * restrict A, double * restrict B, double * restrict C,
561 ; for (intptr_t i=0; i < n; ++i) {
562 ; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1];
; @store_red_double: two-term double-precision dot product whose result is
; stored rather than accumulated. B[0] and B[1] are loaded before the loop;
; each iteration multiplies them with A[4*i] and A[4*i+1] (`fmul fast`),
; adds the two products (`fadd fast`) and stores the sum to C[i].
; The assertions below expect <2 x double> loads and a vector `fmul fast`,
; followed by two extractelement instructions and a scalar `fadd fast`
; (no reduction intrinsic call appears in the expected output).
566 define void @store_red_double(ptr noalias %A, ptr noalias %B, ptr noalias %C, i32 %n) {
567 ; CHECK-LABEL: @store_red_double(
569 ; CHECK-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
570 ; CHECK-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
571 ; CHECK: for.body.lr.ph:
572 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
573 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[N]] to i64
574 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
576 ; CHECK-NEXT: [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
577 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
578 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[MUL]]
579 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 8
580 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP0]], [[TMP2]]
581 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
582 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
583 ; CHECK-NEXT: [[ADD8:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
584 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[C:%.*]], i64 [[I_018]]
585 ; CHECK-NEXT: store double [[ADD8]], ptr [[ARRAYIDX9]], align 8
586 ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_018]], 1
587 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP1]]
588 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
590 ; CHECK-NEXT: ret void
; The loop is entered only when n > 0; otherwise control goes to %for.end.
593 %cmp17 = icmp sgt i32 %n, 0
594 br i1 %cmp17, label %for.body.lr.ph, label %for.end
; B[0] and B[1] are loaded once, ahead of the loop phi.
597 %0 = load double, ptr %B, align 8
598 %arrayidx4 = getelementptr inbounds double, ptr %B, i64 1
599 %1 = load double, ptr %arrayidx4, align 8
600 %2 = sext i32 %n to i64
604 %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
; %mul = 4*i is the base index into A; only offsets 0 and 1 are read.
605 %mul = shl nsw i64 %i.018, 2
606 %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %mul
607 %3 = load double, ptr %arrayidx2, align 8
608 %mul3 = fmul fast double %0, %3
609 %add16 = or disjoint i64 %mul, 1
610 %arrayidx6 = getelementptr inbounds double, ptr %A, i64 %add16
611 %4 = load double, ptr %arrayidx6, align 8
612 %mul7 = fmul fast double %1, %4
613 %add8 = fadd fast double %mul3, %mul7
; The two-term sum is written to C[i].
614 %arrayidx9 = getelementptr inbounds double, ptr %C, i64 %i.018
615 store double %add8, ptr %arrayidx9, align 8
616 %inc = add nsw i64 %i.018, 1
617 %exitcond = icmp eq i64 %inc, %2
618 br i1 %exitcond, label %for.end, label %for.body
624 ; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
626 ; for (intptr_t i=0; i < n; ++i) {
627 ; C[i] = B[0] *A[i*4 ] +
; @store_red: four-term float dot product whose result is stored through a
; pointer that advances by one float per iteration (%C.addr.038). Unlike the
; functions that load B ahead of the loop, the four B loads here sit inside
; the loop body, interleaved with the A loads; only the B element addresses
; are computed up front. The assertions below expect <4 x float> loads of B
; and A inside the loop, a vector `fmul fast`, llvm.vector.reduce.fadd.v4f32,
; and a scalar store of the reduction result.
635 define i32 @store_red(ptr noalias %A, ptr noalias %B, ptr noalias %C, i32 %n) {
636 ; CHECK-LABEL: @store_red(
638 ; CHECK-NEXT: [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
639 ; CHECK-NEXT: br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
640 ; CHECK: for.body.lr.ph:
641 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
642 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
644 ; CHECK-NEXT: [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
645 ; CHECK-NEXT: [[C_ADDR_038:%.*]] = phi ptr [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
646 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
647 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[MUL]]
648 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
649 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 4
650 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP2]]
651 ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
652 ; CHECK-NEXT: store float [[TMP4]], ptr [[C_ADDR_038]], align 4
653 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[C_ADDR_038]], i64 1
654 ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_039]], 1
655 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
656 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
658 ; CHECK-NEXT: ret i32 0
; The loop is entered only when n > 0; otherwise control goes to %for.end.
661 %cmp37 = icmp sgt i32 %n, 0
662 br i1 %cmp37, label %for.body.lr.ph, label %for.end
; Addresses of B[1] .. B[3] are formed here; the loads themselves come later.
665 %arrayidx4 = getelementptr inbounds float, ptr %B, i64 1
666 %arrayidx9 = getelementptr inbounds float, ptr %B, i64 2
667 %arrayidx15 = getelementptr inbounds float, ptr %B, i64 3
668 %0 = sext i32 %n to i64
; Loop-carried values: the induction variable and the output pointer.
672 %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
673 %C.addr.038 = phi ptr [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
674 %1 = load float, ptr %B, align 4
675 %mul = shl nsw i64 %i.039, 2
676 %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %mul
677 %2 = load float, ptr %arrayidx2, align 4
678 %mul3 = fmul fast float %1, %2
679 %3 = load float, ptr %arrayidx4, align 4
680 %add34 = or disjoint i64 %mul, 1
681 %arrayidx6 = getelementptr inbounds float, ptr %A, i64 %add34
682 %4 = load float, ptr %arrayidx6, align 4
683 %mul7 = fmul fast float %3, %4
684 %add8 = fadd fast float %mul3, %mul7
685 %5 = load float, ptr %arrayidx9, align 4
686 %add1135 = or disjoint i64 %mul, 2
687 %arrayidx12 = getelementptr inbounds float, ptr %A, i64 %add1135
688 %6 = load float, ptr %arrayidx12, align 4
689 %mul13 = fmul fast float %5, %6
690 %add14 = fadd fast float %add8, %mul13
691 %7 = load float, ptr %arrayidx15, align 4
692 %add1736 = or disjoint i64 %mul, 3
693 %arrayidx18 = getelementptr inbounds float, ptr %A, i64 %add1736
694 %8 = load float, ptr %arrayidx18, align 4
695 %mul19 = fmul fast float %7, %8
696 %add20 = fadd fast float %add14, %mul19
; Store the four-term sum, then advance the output pointer by one float.
697 store float %add20, ptr %C.addr.038, align 4
698 %incdec.ptr = getelementptr inbounds float, ptr %C.addr.038, i64 1
699 %inc = add nsw i64 %i.039, 1
700 %exitcond = icmp eq i64 %inc, %0
701 br i1 %exitcond, label %for.end, label %for.body
; Zero-initialized, 16-byte-aligned global arrays of 32 elements each.
; @arr_float is the load source of the float_red_example* functions that
; follow; @arr_i32 is not referenced in the part of the file shown here.
707 @arr_i32 = global [32 x i32] zeroinitializer, align 16
708 @arr_float = global [32 x float] zeroinitializer, align 16
; @float_red_example4: straight-line sum of @arr_float elements 0..3 using a
; chain of `fadd fast`, stored to %res. The assertions below expect one
; <4 x float> load, a call to llvm.vector.reduce.fadd.v4f32, and the store.
710 define void @float_red_example4(ptr %res) {
711 ; CHECK-LABEL: @float_red_example4(
713 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr @arr_float, align 16
714 ; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]])
715 ; CHECK-NEXT: store float [[TMP1]], ptr [[RES:%.*]], align 16
716 ; CHECK-NEXT: ret void
; Each new element is the first fadd operand; the running sum is the second.
719 %0 = load float, ptr @arr_float, align 16
720 %1 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 1), align 4
721 %add = fadd fast float %1, %0
722 %2 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 2), align 8
723 %add.1 = fadd fast float %2, %add
724 %3 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 3), align 4
725 %add.2 = fadd fast float %3, %add.1
726 store float %add.2, ptr %res, align 16
; @float_red_example8: straight-line sum of @arr_float elements 0..7 using a
; chain of `fadd fast`, stored to %res. The assertions below expect one
; <8 x float> load, a call to llvm.vector.reduce.fadd.v8f32, and the store.
730 define void @float_red_example8(ptr %res) {
731 ; CHECK-LABEL: @float_red_example8(
733 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr @arr_float, align 16
734 ; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
735 ; CHECK-NEXT: store float [[TMP1]], ptr [[RES:%.*]], align 16
736 ; CHECK-NEXT: ret void
; Each new element is the first fadd operand; the running sum is the second.
739 %0 = load float, ptr @arr_float, align 16
740 %1 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 1), align 4
741 %add = fadd fast float %1, %0
742 %2 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 2), align 8
743 %add.1 = fadd fast float %2, %add
744 %3 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 3), align 4
745 %add.2 = fadd fast float %3, %add.1
746 %4 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 4), align 16
747 %add.3 = fadd fast float %4, %add.2
748 %5 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 5), align 4
749 %add.4 = fadd fast float %5, %add.3
750 %6 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 6), align 8
751 %add.5 = fadd fast float %6, %add.4
752 %7 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 7), align 4
753 %add.6 = fadd fast float %7, %add.5
754 store float %add.6, ptr %res, align 16
; Sums elements 0..15 of @arr_float with a linear chain of scalar 'fast' fadds
; and stores the sum to %res. The autogenerated assertions expect a single
; <16 x float> load feeding one llvm.vector.reduce.fadd.v16f32 call.
758 define void @float_red_example16(ptr %res) {
759 ; CHECK-LABEL: @float_red_example16(
761 ; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr @arr_float, align 16
762 ; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP0]])
763 ; CHECK-NEXT: store float [[TMP1]], ptr [[RES:%.*]], align 16
764 ; CHECK-NEXT: ret void
767 %0 = load float, ptr @arr_float, align 16
768 %1 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 1), align 4
769 %add = fadd fast float %1, %0
770 %2 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 2), align 8
771 %add.1 = fadd fast float %2, %add
772 %3 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 3), align 4
773 %add.2 = fadd fast float %3, %add.1
774 %4 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 4), align 16
775 %add.3 = fadd fast float %4, %add.2
776 %5 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 5), align 4
777 %add.4 = fadd fast float %5, %add.3
778 %6 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 6), align 8
779 %add.5 = fadd fast float %6, %add.4
780 %7 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 7), align 4
781 %add.6 = fadd fast float %7, %add.5
782 %8 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 8), align 16
783 %add.7 = fadd fast float %8, %add.6
784 %9 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 9), align 4
785 %add.8 = fadd fast float %9, %add.7
786 %10 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 10), align 8
787 %add.9 = fadd fast float %10, %add.8
788 %11 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 11), align 4
789 %add.10 = fadd fast float %11, %add.9
790 %12 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 12), align 16
791 %add.11 = fadd fast float %12, %add.10
792 %13 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 13), align 4
793 %add.12 = fadd fast float %13, %add.11
794 %14 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 14), align 8
795 %add.13 = fadd fast float %14, %add.12
796 %15 = load float, ptr getelementptr inbounds ([32 x float], ptr @arr_float, i64 0, i64 15), align 4
797 %add.14 = fadd fast float %15, %add.13
798 store float %add.14, ptr %res, align 16
; Sums elements 0..3 of @arr_i32 with a linear chain of scalar 'add nsw'
; instructions and stores the sum to %res. The autogenerated assertions expect
; a single <4 x i32> load feeding one llvm.vector.reduce.add.v4i32 call.
802 define void @i32_red_example4(ptr %res) {
803 ; CHECK-LABEL: @i32_red_example4(
805 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @arr_i32, align 16
806 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
807 ; CHECK-NEXT: store i32 [[TMP1]], ptr [[RES:%.*]], align 16
808 ; CHECK-NEXT: ret void
811 %0 = load i32, ptr @arr_i32, align 16
812 %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
813 %add = add nsw i32 %1, %0
814 %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
815 %add.1 = add nsw i32 %2, %add
816 %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
817 %add.2 = add nsw i32 %3, %add.1
818 store i32 %add.2, ptr %res, align 16
; Sums elements 0..7 of @arr_i32 with a linear chain of scalar 'add nsw'
; instructions and stores the sum to %res. The autogenerated assertions expect
; a single <8 x i32> load feeding one llvm.vector.reduce.add.v8i32 call.
822 define void @i32_red_example8(ptr %res) {
823 ; CHECK-LABEL: @i32_red_example8(
825 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @arr_i32, align 16
826 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
827 ; CHECK-NEXT: store i32 [[TMP1]], ptr [[RES:%.*]], align 16
828 ; CHECK-NEXT: ret void
831 %0 = load i32, ptr @arr_i32, align 16
832 %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
833 %add = add nsw i32 %1, %0
834 %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
835 %add.1 = add nsw i32 %2, %add
836 %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
837 %add.2 = add nsw i32 %3, %add.1
838 %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
839 %add.3 = add nsw i32 %4, %add.2
840 %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
841 %add.4 = add nsw i32 %5, %add.3
842 %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
843 %add.5 = add nsw i32 %6, %add.4
844 %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
845 %add.6 = add nsw i32 %7, %add.5
846 store i32 %add.6, ptr %res, align 16
; Sums elements 0..15 of @arr_i32 with a linear chain of scalar 'add nsw'
; instructions and stores the sum to %res. The autogenerated assertions expect
; a single <16 x i32> load feeding one llvm.vector.reduce.add.v16i32 call.
850 define void @i32_red_example16(ptr %res) {
851 ; CHECK-LABEL: @i32_red_example16(
853 ; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @arr_i32, align 16
854 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]])
855 ; CHECK-NEXT: store i32 [[TMP1]], ptr [[RES:%.*]], align 16
856 ; CHECK-NEXT: ret void
859 %0 = load i32, ptr @arr_i32, align 16
860 %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
861 %add = add nsw i32 %1, %0
862 %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
863 %add.1 = add nsw i32 %2, %add
864 %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
865 %add.2 = add nsw i32 %3, %add.1
866 %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
867 %add.3 = add nsw i32 %4, %add.2
868 %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
869 %add.4 = add nsw i32 %5, %add.3
870 %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
871 %add.5 = add nsw i32 %6, %add.4
872 %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
873 %add.6 = add nsw i32 %7, %add.5
874 %8 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 8), align 16
875 %add.7 = add nsw i32 %8, %add.6
876 %9 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 9), align 4
877 %add.8 = add nsw i32 %9, %add.7
878 %10 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 10), align 8
879 %add.9 = add nsw i32 %10, %add.8
880 %11 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 11), align 4
881 %add.10 = add nsw i32 %11, %add.9
882 %12 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 12), align 16
883 %add.11 = add nsw i32 %12, %add.10
884 %13 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 13), align 4
885 %add.12 = add nsw i32 %13, %add.11
886 %14 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 14), align 8
887 %add.13 = add nsw i32 %14, %add.12
888 %15 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 15), align 4
889 %add.14 = add nsw i32 %15, %add.13
890 store i32 %add.14, ptr %res, align 16
; Sums all 32 elements of @arr_i32 with a linear chain of scalar 'add nsw'
; instructions and stores the sum to %res. The autogenerated assertions expect
; a single <32 x i32> load feeding one llvm.vector.reduce.add.v32i32 call.
894 define void @i32_red_example32(ptr %res) {
895 ; CHECK-LABEL: @i32_red_example32(
897 ; CHECK-NEXT: [[TMP0:%.*]] = load <32 x i32>, ptr @arr_i32, align 16
898 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]])
899 ; CHECK-NEXT: store i32 [[TMP1]], ptr [[RES:%.*]], align 16
900 ; CHECK-NEXT: ret void
903 %0 = load i32, ptr @arr_i32, align 16
904 %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
905 %add = add nsw i32 %1, %0
906 %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
907 %add.1 = add nsw i32 %2, %add
908 %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
909 %add.2 = add nsw i32 %3, %add.1
910 %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
911 %add.3 = add nsw i32 %4, %add.2
912 %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
913 %add.4 = add nsw i32 %5, %add.3
914 %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
915 %add.5 = add nsw i32 %6, %add.4
916 %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
917 %add.6 = add nsw i32 %7, %add.5
918 %8 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 8), align 16
919 %add.7 = add nsw i32 %8, %add.6
920 %9 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 9), align 4
921 %add.8 = add nsw i32 %9, %add.7
922 %10 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 10), align 8
923 %add.9 = add nsw i32 %10, %add.8
924 %11 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 11), align 4
925 %add.10 = add nsw i32 %11, %add.9
926 %12 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 12), align 16
927 %add.11 = add nsw i32 %12, %add.10
928 %13 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 13), align 4
929 %add.12 = add nsw i32 %13, %add.11
930 %14 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 14), align 8
931 %add.13 = add nsw i32 %14, %add.12
932 %15 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 15), align 4
933 %add.14 = add nsw i32 %15, %add.13
934 %16 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 16), align 16
935 %add.15 = add nsw i32 %16, %add.14
936 %17 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 17), align 4
937 %add.16 = add nsw i32 %17, %add.15
938 %18 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 18), align 8
939 %add.17 = add nsw i32 %18, %add.16
940 %19 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 19), align 4
941 %add.18 = add nsw i32 %19, %add.17
942 %20 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 20), align 16
943 %add.19 = add nsw i32 %20, %add.18
944 %21 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 21), align 4
945 %add.20 = add nsw i32 %21, %add.19
946 %22 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 22), align 8
947 %add.21 = add nsw i32 %22, %add.20
948 %23 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 23), align 4
949 %add.22 = add nsw i32 %23, %add.21
950 %24 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 24), align 16
951 %add.23 = add nsw i32 %24, %add.22
952 %25 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 25), align 4
953 %add.24 = add nsw i32 %25, %add.23
954 %26 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 26), align 8
955 %add.25 = add nsw i32 %26, %add.24
956 %27 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 27), align 4
957 %add.26 = add nsw i32 %27, %add.25
958 %28 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 28), align 16
959 %add.27 = add nsw i32 %28, %add.26
960 %29 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 29), align 4
961 %add.28 = add nsw i32 %29, %add.27
962 %30 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 30), align 8
963 %add.29 = add nsw i32 %30, %add.28
964 %31 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 31), align 4
965 %add.30 = add nsw i32 %31, %add.29
966 store i32 %add.30, ptr %res, align 16
; External function that consumes the reduction result in @i32_red_call
; (via call) and @i32_red_invoke (via invoke).
970 declare i32 @foobar(i32)
; Sums elements 0..7 of @arr_i32 with a chain of 'add nsw' and passes the sum
; as the argument of a call to @foobar, so the reduction root is used by a
; call rather than a store. The autogenerated assertions expect a <8 x i32>
; load and one llvm.vector.reduce.add.v8i32 whose result feeds the call.
972 define void @i32_red_call(i32 %val) {
973 ; CHECK-LABEL: @i32_red_call(
975 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @arr_i32, align 16
976 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
977 ; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
978 ; CHECK-NEXT: ret void
981 %0 = load i32, ptr @arr_i32, align 16
982 %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
983 %add = add nsw i32 %1, %0
984 %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
985 %add.1 = add nsw i32 %2, %add
986 %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
987 %add.2 = add nsw i32 %3, %add.1
988 %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
989 %add.3 = add nsw i32 %4, %add.2
990 %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
991 %add.4 = add nsw i32 %5, %add.3
992 %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
993 %add.5 = add nsw i32 %6, %add.4
994 %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
995 %add.6 = add nsw i32 %7, %add.5
996 %res = call i32 @foobar(i32 %add.6)
; Same 8-element 'add nsw' chain over @arr_i32 as @i32_red_call, but the sum is
; the argument of an invoke of @foobar that unwinds to %exception, whose
; landingpad is a cleanup. The autogenerated assertions expect the <8 x i32>
; load and llvm.vector.reduce.add.v8i32 to appear before the invoke.
1000 define void @i32_red_invoke(i32 %val) personality ptr @__gxx_personality_v0 {
1001 ; CHECK-LABEL: @i32_red_invoke(
1002 ; CHECK-NEXT: entry:
1003 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @arr_i32, align 16
1004 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
1005 ; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
1006 ; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
1008 ; CHECK-NEXT: [[CLEANUP:%.*]] = landingpad i8
1009 ; CHECK-NEXT: cleanup
1010 ; CHECK-NEXT: br label [[NORMAL]]
1012 ; CHECK-NEXT: ret void
1015 %0 = load i32, ptr @arr_i32, align 16
1016 %1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
1017 %add = add nsw i32 %1, %0
1018 %2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
1019 %add.1 = add nsw i32 %2, %add
1020 %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
1021 %add.2 = add nsw i32 %3, %add.1
1022 %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 4), align 16
1023 %add.3 = add nsw i32 %4, %add.2
1024 %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 5), align 4
1025 %add.4 = add nsw i32 %5, %add.3
1026 %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 6), align 8
1027 %add.5 = add nsw i32 %6, %add.4
1028 %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 7), align 4
1029 %add.6 = add nsw i32 %7, %add.5
1030 %res = invoke i32 @foobar(i32 %add.6) to label %normal unwind label %exception
1032 %cleanup = landingpad i8 cleanup
1038 ; Test case from PR47670. Reduction result is used as incoming value in phi.
; Branches on %b; block %bb loads four consecutive i32 values starting at %data
; and sums them with plain 'add' (no wrap flags). The sum reaches %sum.1 as the
; incoming value from %bb, with 0 incoming from %entry. The autogenerated
; assertions expect a <4 x i32> load and llvm.vector.reduce.add.v4i32 in %bb,
; with the reduction result as the phi operand.
1039 define i32 @reduction_result_used_in_phi(ptr nocapture readonly %data, i1 zeroext %b) {
1040 ; CHECK-LABEL: @reduction_result_used_in_phi(
1041 ; CHECK-NEXT: entry:
1042 ; CHECK-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
1044 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[DATA:%.*]], align 4
1045 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
1046 ; CHECK-NEXT: br label [[EXIT]]
1048 ; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[BB]] ]
1049 ; CHECK-NEXT: ret i32 [[SUM_1]]
1052 br i1 %b, label %bb, label %exit
1055 %l.0 = load i32, ptr %data, align 4
1056 %idx.1 = getelementptr inbounds i32, ptr %data, i64 1
1057 %l.1 = load i32, ptr %idx.1, align 4
1058 %add.1 = add i32 %l.1, %l.0
1059 %idx.2 = getelementptr inbounds i32, ptr %data, i64 2
1060 %l.2 = load i32, ptr %idx.2, align 4
1061 %add.2 = add i32 %l.2, %add.1
1062 %idx.3 = getelementptr inbounds i32, ptr %data, i64 3
1063 %l.3 = load i32, ptr %idx.3, align 4
1064 %add.3 = add i32 %l.3, %add.2
1068 %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb]
; Sums four consecutive i32 values loaded from %data in %bb and feeds the sum
; into the phi %sum.1 (0 incoming from %entry). The autogenerated assertions
; expect a <4 x i32> load and llvm.vector.reduce.add.v4i32 in %bb.
; NOTE(review): the instructions and expected output shown here match
; @reduction_result_used_in_phi; despite the "_loop" suffix, the phi has only
; the %entry and %bb incoming edges and the expected %bb ends in a branch to
; the exit block -- confirm whether a back-edge was intended.
1072 define i32 @reduction_result_used_in_phi_loop(ptr nocapture readonly %data, i1 zeroext %b) {
1073 ; CHECK-LABEL: @reduction_result_used_in_phi_loop(
1074 ; CHECK-NEXT: entry:
1075 ; CHECK-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
1077 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[DATA:%.*]], align 4
1078 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
1079 ; CHECK-NEXT: br label [[EXIT]]
1081 ; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[BB]] ]
1082 ; CHECK-NEXT: ret i32 [[SUM_1]]
1085 br i1 %b, label %bb, label %exit
1088 %l.0 = load i32, ptr %data, align 4
1089 %idx.1 = getelementptr inbounds i32, ptr %data, i64 1
1090 %l.1 = load i32, ptr %idx.1, align 4
1091 %add.1 = add i32 %l.1, %l.0
1092 %idx.2 = getelementptr inbounds i32, ptr %data, i64 2
1093 %l.2 = load i32, ptr %idx.2, align 4
1094 %add.2 = add i32 %l.2, %add.1
1095 %idx.3 = getelementptr inbounds i32, ptr %data, i64 3
1096 %l.3 = load i32, ptr %idx.3, align 4
1097 %add.3 = add i32 %l.3, %add.2
1101 %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb]
1105 ; Make sure we do not crash or loop infinitely on ill-formed IR.
; Contains an 'add' whose result %t0 is also its own first operand, and a phi
; %t1 that takes undef from %bb.0 and %t0 from %dead. The autogenerated
; assertions expect the same scalar add and phi in the output, i.e. the pass
; is expected to leave this function's instructions as they are.
1107 define void @unreachable_block() {
1108 ; CHECK-LABEL: @unreachable_block(
1110 ; CHECK-NEXT: br label [[BB_1:%.*]]
1112 ; CHECK-NEXT: [[T0:%.*]] = add i16 [[T0]], undef
1113 ; CHECK-NEXT: br label [[BB_1]]
1115 ; CHECK-NEXT: [[T1:%.*]] = phi i16 [ undef, [[BB_0:%.*]] ], [ [[T0]], [[DEAD:%.*]] ]
1116 ; CHECK-NEXT: ret void
1122 %t0 = add i16 %t0, undef ; unreachable IR may depend on itself
1126 %t1 = phi i16 [ undef, %bb.0 ], [ %t0, %dead ]
1130 ; The FMF on the reduction should match the incoming insts.
; Loads four consecutive floats starting at %p and sums them with a chain of
; 'reassoc nsz' fadds. The autogenerated assertions expect a <4 x float> load
; and an llvm.vector.reduce.fadd.v4f32 call carrying exactly 'reassoc nsz'.
; All pointer operands use the opaque 'ptr' type, matching the %p parameter and
; the rest of this file; typed-pointer spellings such as 'float*' cannot be
; mixed with 'ptr' in one module.
1132 define float @fadd_v4f32_fmf(ptr %p) {
1133 ; CHECK-LABEL: @fadd_v4f32_fmf(
1134 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
1135 ; CHECK-NEXT: [[TMP2:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP1]])
1136 ; CHECK-NEXT: ret float [[TMP2]]
1138 %p1 = getelementptr inbounds float, ptr %p, i64 1
1139 %p2 = getelementptr inbounds float, ptr %p, i64 2
1140 %p3 = getelementptr inbounds float, ptr %p, i64 3
1141 %t0 = load float, ptr %p, align 4
1142 %t1 = load float, ptr %p1, align 4
1143 %t2 = load float, ptr %p2, align 4
1144 %t3 = load float, ptr %p3, align 4
1145 %add1 = fadd reassoc nsz float %t1, %t0
1146 %add2 = fadd reassoc nsz float %t2, %add1
1147 %add3 = fadd reassoc nsz float %t3, %add2
1151 ; The minimal FMF for fadd reduction are "reassoc nsz".
1152 ; Only the common FMF of all operations in the reduction propagate to the result.
1153 ; In this example, "contract nnan arcp" are dropped, but "ninf" transfers with the required flags.
; Loads four consecutive floats starting at %p and sums them with three fadds
; whose fast-math flags differ: all carry 'ninf reassoc nsz', the first two add
; 'nnan', the second adds 'arcp', and the third adds 'contract'. The
; autogenerated assertions expect the reduction call to carry only the flags
; common to all three: 'reassoc ninf nsz'.
; All pointer operands use the opaque 'ptr' type, matching the %p parameter and
; the rest of this file; typed-pointer spellings such as 'float*' cannot be
; mixed with 'ptr' in one module.
1155 define float @fadd_v4f32_fmf_intersect(ptr %p) {
1156 ; CHECK-LABEL: @fadd_v4f32_fmf_intersect(
1157 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
1158 ; CHECK-NEXT: [[TMP2:%.*]] = call reassoc ninf nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP1]])
1159 ; CHECK-NEXT: ret float [[TMP2]]
1161 %p1 = getelementptr inbounds float, ptr %p, i64 1
1162 %p2 = getelementptr inbounds float, ptr %p, i64 2
1163 %p3 = getelementptr inbounds float, ptr %p, i64 3
1164 %t0 = load float, ptr %p, align 4
1165 %t1 = load float, ptr %p1, align 4
1166 %t2 = load float, ptr %p2, align 4
1167 %t3 = load float, ptr %p3, align 4
1168 %add1 = fadd ninf reassoc nsz nnan float %t1, %t0
1169 %add2 = fadd ninf reassoc nsz nnan arcp float %t2, %add1
1170 %add3 = fadd ninf reassoc nsz contract float %t3, %add2
1174 ; This must not propagate 'nsw' to a new add instruction.
; Adds %start and elements 0..3 of @arr_i32 with a chain of 'add nsw' and
; stores the result to %res. The autogenerated assertions expect a <4 x i32>
; load, an llvm.vector.reduce.add.v4i32 call, and a final scalar add of the
; reduction with %start that carries no 'nsw' flag.
; NOTE(review): the second group of assertions below uses the STORE prefix,
; but neither RUN line visible at the top of the file passes a check-prefix
; option to FileCheck, so that group looks unused -- confirm and either wire
; up the prefix or drop the group when regenerating.
1176 define void @nsw_propagation_v4i32(ptr %res, i32 %start) {
1177 ; CHECK-LABEL: @nsw_propagation_v4i32(
1178 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @arr_i32, align 16
1179 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
1180 ; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[START:%.*]]
1181 ; CHECK-NEXT: store i32 [[OP_RDX]], ptr [[RES:%.*]], align 16
1182 ; CHECK-NEXT: ret void
1185 ; STORE-LABEL: @nsw_propagation_v4i32(
1186 ; STORE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @arr_i32, align 16
1187 ; STORE-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
1188 ; STORE-NEXT: [[OP_RDX:%.*]] = add i32 [[START:%.*]], [[TMP2]]
1189 ; STORE-NEXT: store i32 [[OP_RDX]], ptr [[RES:%.*]], align 16
1190 ; STORE-NEXT: ret void
1191 %t0 = load i32, ptr @arr_i32, align 16
1192 %t1 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 1), align 4
1193 %t2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 2), align 8
1194 %t3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr_i32, i64 0, i64 3), align 4
1195 %s = add nsw i32 %start, %t0
1196 %add = add nsw i32 %t1, %s
1197 %add.1 = add nsw i32 %t2, %add
1198 %add.2 = add nsw i32 %t3, %add.1
1199 store i32 %add.2, ptr %res, align 16
; Personality function named by the 'personality' clause of @i32_red_invoke.
1203 declare i32 @__gxx_personality_v0(...)