; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
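
; The first RUN line exercises the default SLP pass; the second additionally
; enables matching of horizontal reductions (-slp-vectorize-hor) and of
; reductions feeding stores (-slp-vectorize-hor-store), checked under the
; STORE prefix.
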
; int foo(float *A, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += 7*A[i*4] + 7*A[i*4+1] + 7*A[i*4+2] + 7*A[i*4+3];
;   }
;   return sum;
; }
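;
; Each iteration sums four adjacent elements of A, all scaled by 7: a
; horizontal fadd reduction. Both prefixes expect one <4 x float> load, a
; vector fmul by the splatted 7.0, and a call to
; @llvm.vector.reduce.fadd.v4f32 folded into the running sum.
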
define i32 @add_red(float* %A, i32 %n) {
; CHECK-LABEL: @add_red(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK: for.body.lr.ph:
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT: [[ADD28:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD28]]
; CHECK-NEXT: [[ADD829:%.*]] = or i64 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD829]]
; CHECK-NEXT: [[ADD1330:%.*]] = or i64 [[MUL]], 3
; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1330]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT: [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
; CHECK-NEXT: [[INC]] = add nsw i64 [[I_033]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK: for.cond.for.end_crit_edge:
; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
;
; STORE-LABEL: @add_red(
; STORE-NEXT: entry:
; STORE-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT: br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE: for.body.lr.ph:
; STORE-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT: br label [[FOR_BODY:%.*]]
; STORE: for.body:
; STORE-NEXT: [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT: [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
; STORE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT: [[ADD28:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD28]]
; STORE-NEXT: [[ADD829:%.*]] = or i64 [[MUL]], 2
; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD829]]
; STORE-NEXT: [[ADD1330:%.*]] = or i64 [[MUL]], 3
; STORE-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1330]]
; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; STORE-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
; STORE-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
; STORE-NEXT: [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
; STORE-NEXT: [[INC]] = add nsw i64 [[I_033]], 1
; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; STORE: for.cond.for.end_crit_edge:
; STORE-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
; STORE-NEXT: br label [[FOR_END]]
; STORE: for.end:
; STORE-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; STORE-NEXT: ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
  %mul = shl nsw i64 %i.033, 2
  %arrayidx = getelementptr inbounds float, float* %A, i64 %mul
  %1 = load float, float* %arrayidx, align 4
  %mul2 = fmul float %1, 7.000000e+00
  %add28 = or i64 %mul, 1
  %arrayidx4 = getelementptr inbounds float, float* %A, i64 %add28
  %2 = load float, float* %arrayidx4, align 4
  %mul5 = fmul float %2, 7.000000e+00
  %add6 = fadd fast float %mul2, %mul5
  %add829 = or i64 %mul, 2
  %arrayidx9 = getelementptr inbounds float, float* %A, i64 %add829
  %3 = load float, float* %arrayidx9, align 4
  %mul10 = fmul float %3, 7.000000e+00
  %add11 = fadd fast float %add6, %mul10
  %add1330 = or i64 %mul, 3
  %arrayidx14 = getelementptr inbounds float, float* %A, i64 %add1330
  %4 = load float, float* %arrayidx14, align 4
  %mul15 = fmul float %4, 7.000000e+00
  %add16 = fadd fast float %add11, %mul15
  %add17 = fadd fast float %sum.032, %add16
  %inc = add nsw i64 %i.033, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add17 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum *= B[0]*A[i*4  ] +
;            B[1]*A[i*4+1] +
;            B[2]*A[i*4+2] +
;            B[3]*A[i*4+3];
;   }
;   return sum;
; }
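;
; The per-iteration dot product B[0..3]*A[4i..4i+3] is still vectorized into
; a <4 x float> fmul plus @llvm.vector.reduce.fadd.v4f32 under both prefixes;
; only the outer multiply into the accumulator stays scalar.
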
define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
; CHECK-LABEL: @mul_red(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK: for.body.lr.ph:
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT: [[ADD35:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
; CHECK-NEXT: [[ADD1136:%.*]] = or i64 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1136]]
; CHECK-NEXT: [[ADD1737:%.*]] = or i64 [[MUL]], 3
; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1737]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT: [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
; CHECK-NEXT: [[INC]] = add nsw i64 [[I_040]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK: for.cond.for.end_crit_edge:
; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
;
; STORE-LABEL: @mul_red(
; STORE-NEXT: entry:
; STORE-NEXT: [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT: br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE: for.body.lr.ph:
; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; STORE-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; STORE-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
; STORE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; STORE-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT: br label [[FOR_BODY:%.*]]
; STORE: for.body:
; STORE-NEXT: [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT: [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT: [[ADD35:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
; STORE-NEXT: [[ADD1136:%.*]] = or i64 [[MUL]], 2
; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1136]]
; STORE-NEXT: [[ADD1737:%.*]] = or i64 [[MUL]], 3
; STORE-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1737]]
; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; STORE-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; STORE-NEXT: [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
; STORE-NEXT: [[INC]] = add nsw i64 [[I_040]], 1
; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; STORE: for.cond.for.end_crit_edge:
; STORE-NEXT: [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
; STORE-NEXT: br label [[FOR_END]]
; STORE: for.end:
; STORE-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; STORE-NEXT: ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp38 = icmp sgt i32 %n, 0
  br i1 %cmp38, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
  %mul = shl nsw i64 %i.040, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul float %0, %5
  %add35 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add35
  %6 = load float, float* %arrayidx6, align 4
  %mul7 = fmul float %1, %6
  %add8 = fadd fast float %mul3, %mul7
  %add1136 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1136
  %7 = load float, float* %arrayidx12, align 4
  %mul13 = fmul float %2, %7
  %add14 = fadd fast float %add8, %mul13
  %add1737 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1737
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul float %3, %8
  %add20 = fadd fast float %add14, %mul19
  %mul21 = fmul float %sum.039, %add20
  %inc = add nsw i64 %i.040, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %mul21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*6  ] +
;            B[1]*A[i*6+1] +
;            B[2]*A[i*6+2] +
;            B[3]*A[i*6+3] +
;            B[4]*A[i*6+4] +
;            B[5]*A[i*6+5] +
;            B[6]*A[i*6+6] +
;            B[7]*A[i*6+7] +
;            B[8]*A[i*6+8];
;   }
;   return sum;
; }
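;
; A nine-element reduction: the first eight lanes become an <8 x float> fmul
; plus @llvm.vector.reduce.fadd.v8f32, while the ninth product is computed
; scalar and added to the reduction result afterwards.
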
define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
; CHECK-LABEL: @long_red(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK: for.body.lr.ph:
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, float* [[B]], i64 4
; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, float* [[B]], i64 5
; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, float* [[B]], i64 6
; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, float* [[B]], i64 7
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <8 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, float* [[B]], i64 8
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX45]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT: [[ADD80:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD80]]
; CHECK-NEXT: [[ADD11:%.*]] = add nsw i64 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD11]]
; CHECK-NEXT: [[ADD17:%.*]] = add nsw i64 [[MUL]], 3
; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD17]]
; CHECK-NEXT: [[ADD23:%.*]] = add nsw i64 [[MUL]], 4
; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD23]]
; CHECK-NEXT: [[ADD29:%.*]] = add nsw i64 [[MUL]], 5
; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD29]]
; CHECK-NEXT: [[ADD35:%.*]] = add nsw i64 [[MUL]], 6
; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
; CHECK-NEXT: [[ADD41:%.*]] = add nsw i64 [[MUL]], 7
; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD41]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX2]] to <8 x float>*
; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <8 x float> [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
; CHECK-NEXT: [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP6]])
; CHECK-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
; CHECK-NEXT: [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]]
; CHECK-NEXT: [[INC]] = add nsw i64 [[I_083]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP3]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK: for.cond.for.end_crit_edge:
; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
;
; STORE-LABEL: @long_red(
; STORE-NEXT: entry:
; STORE-NEXT: [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT: br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE: for.body.lr.ph:
; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; STORE-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; STORE-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, float* [[B]], i64 4
; STORE-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, float* [[B]], i64 5
; STORE-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, float* [[B]], i64 6
; STORE-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, float* [[B]], i64 7
; STORE-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <8 x float>*
; STORE-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; STORE-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, float* [[B]], i64 8
; STORE-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX45]], align 4
; STORE-NEXT: [[TMP3:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT: br label [[FOR_BODY:%.*]]
; STORE: for.body:
; STORE-NEXT: [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT: [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
; STORE-NEXT: [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT: [[ADD80:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD80]]
; STORE-NEXT: [[ADD11:%.*]] = add nsw i64 [[MUL]], 2
; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD11]]
; STORE-NEXT: [[ADD17:%.*]] = add nsw i64 [[MUL]], 3
; STORE-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD17]]
; STORE-NEXT: [[ADD23:%.*]] = add nsw i64 [[MUL]], 4
; STORE-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD23]]
; STORE-NEXT: [[ADD29:%.*]] = add nsw i64 [[MUL]], 5
; STORE-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD29]]
; STORE-NEXT: [[ADD35:%.*]] = add nsw i64 [[MUL]], 6
; STORE-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
; STORE-NEXT: [[ADD41:%.*]] = add nsw i64 [[MUL]], 7
; STORE-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD41]]
; STORE-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX2]] to <8 x float>*
; STORE-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
; STORE-NEXT: [[TMP6:%.*]] = fmul fast <8 x float> [[TMP1]], [[TMP5]]
; STORE-NEXT: [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
; STORE-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
; STORE-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
; STORE-NEXT: [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
; STORE-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP6]])
; STORE-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
; STORE-NEXT: [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]]
; STORE-NEXT: [[INC]] = add nsw i64 [[I_083]], 1
; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP3]]
; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; STORE: for.cond.for.end_crit_edge:
; STORE-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
; STORE-NEXT: br label [[FOR_END]]
; STORE: for.end:
; STORE-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; STORE-NEXT: ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp81 = icmp sgt i32 %n, 0
  br i1 %cmp81, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %arrayidx21 = getelementptr inbounds float, float* %B, i64 4
  %4 = load float, float* %arrayidx21, align 4
  %arrayidx27 = getelementptr inbounds float, float* %B, i64 5
  %5 = load float, float* %arrayidx27, align 4
  %arrayidx33 = getelementptr inbounds float, float* %B, i64 6
  %6 = load float, float* %arrayidx33, align 4
  %arrayidx39 = getelementptr inbounds float, float* %B, i64 7
  %7 = load float, float* %arrayidx39, align 4
  %arrayidx45 = getelementptr inbounds float, float* %B, i64 8
  %8 = load float, float* %arrayidx45, align 4
  %9 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
  %mul = mul nsw i64 %i.083, 6
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %10 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %10
  %add80 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add80
  %11 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %1, %11
  %add8 = fadd fast float %mul3, %mul7
  %add11 = add nsw i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add11
  %12 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %2, %12
  %add14 = fadd fast float %add8, %mul13
  %add17 = add nsw i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add17
  %13 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %3, %13
  %add20 = fadd fast float %add14, %mul19
  %add23 = add nsw i64 %mul, 4
  %arrayidx24 = getelementptr inbounds float, float* %A, i64 %add23
  %14 = load float, float* %arrayidx24, align 4
  %mul25 = fmul fast float %4, %14
  %add26 = fadd fast float %add20, %mul25
  %add29 = add nsw i64 %mul, 5
  %arrayidx30 = getelementptr inbounds float, float* %A, i64 %add29
  %15 = load float, float* %arrayidx30, align 4
  %mul31 = fmul fast float %5, %15
  %add32 = fadd fast float %add26, %mul31
  %add35 = add nsw i64 %mul, 6
  %arrayidx36 = getelementptr inbounds float, float* %A, i64 %add35
  %16 = load float, float* %arrayidx36, align 4
  %mul37 = fmul fast float %6, %16
  %add38 = fadd fast float %add32, %mul37
  %add41 = add nsw i64 %mul, 7
  %arrayidx42 = getelementptr inbounds float, float* %A, i64 %add41
  %17 = load float, float* %arrayidx42, align 4
  %mul43 = fmul fast float %7, %17
  %add44 = fadd fast float %add38, %mul43
  %add47 = add nsw i64 %mul, 8
  %arrayidx48 = getelementptr inbounds float, float* %A, i64 %add47
  %18 = load float, float* %arrayidx48, align 4
  %mul49 = fmul fast float %8, %18
  %add50 = fadd fast float %add44, %mul49
  %add51 = fadd fast float %sum.082, %add50
  %inc = add nsw i64 %i.083, 1
  %exitcond = icmp eq i64 %inc, %9
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add51 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*4  ];
;     sum += B[1]*A[i*4+1];
;     sum += B[2]*A[i*4+2];
;     sum += B[3]*A[i*4+3];
;   }
;   return sum;
; }
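;
; The products are chained onto the loop-carried sum one at a time in the
; source, but the expected IR still forms a <4 x float> reduction and folds
; the accumulator in with one extra fadd ([[OP_EXTRA]]).
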
define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
; CHECK-LABEL: @chain_red(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK: for.body.lr.ph:
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT: [[ADD638:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD638]]
; CHECK-NEXT: [[ADD1239:%.*]] = or i64 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1239]]
; CHECK-NEXT: [[ADD1840:%.*]] = or i64 [[MUL]], 3
; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1840]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT: [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]]
; CHECK-NEXT: [[INC]] = add nsw i64 [[I_043]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK: for.cond.for.end_crit_edge:
; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[OP_EXTRA]] to i32
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
;
; STORE-LABEL: @chain_red(
; STORE-NEXT: entry:
; STORE-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT: br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE: for.body.lr.ph:
; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; STORE-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; STORE-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; STORE-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
; STORE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; STORE-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT: br label [[FOR_BODY:%.*]]
; STORE: for.body:
; STORE-NEXT: [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT: [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT: [[ADD638:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD638]]
; STORE-NEXT: [[ADD1239:%.*]] = or i64 [[MUL]], 2
; STORE-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1239]]
; STORE-NEXT: [[ADD1840:%.*]] = or i64 [[MUL]], 3
; STORE-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1840]]
; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; STORE-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; STORE-NEXT: [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]]
; STORE-NEXT: [[INC]] = add nsw i64 [[I_043]], 1
; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; STORE: for.cond.for.end_crit_edge:
; STORE-NEXT: [[PHITMP:%.*]] = fptosi float [[OP_EXTRA]] to i32
; STORE-NEXT: br label [[FOR_END]]
; STORE: for.end:
; STORE-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; STORE-NEXT: ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp41 = icmp sgt i32 %n, 0
  br i1 %cmp41, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx10 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx10, align 4
  %arrayidx16 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx16, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
  %mul = shl nsw i64 %i.043, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %5
  %add = fadd fast float %sum.042, %mul3
  %add638 = or i64 %mul, 1
  %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add638
  %6 = load float, float* %arrayidx7, align 4
  %mul8 = fmul fast float %1, %6
  %add9 = fadd fast float %add, %mul8
  %add1239 = or i64 %mul, 2
  %arrayidx13 = getelementptr inbounds float, float* %A, i64 %add1239
  %7 = load float, float* %arrayidx13, align 4
  %mul14 = fmul fast float %2, %7
  %add15 = fadd fast float %add9, %mul14
  %add1840 = or i64 %mul, 3
  %arrayidx19 = getelementptr inbounds float, float* %A, i64 %add1840
  %8 = load float, float* %arrayidx19, align 4
  %mul20 = fmul fast float %3, %8
  %add21 = fadd fast float %add15, %mul20
  %inc = add nsw i64 %i.043, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; void foo(const float *arg_A, unsigned arg_B, float *array) {
;   for (uint32_t i = 0; i < 6; ++i) {
;     const float *ptr = arg_A + i;
;     float w0 = array[i * 4 + 0];
;     float w1 = array[i * 4 + 1];
;     float w2 = array[i * 4 + 2];
;     float w3 = array[i * 4 + 3];
;
;     for (unsigned j = 0; j < arg_B; ++j) {
;       const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
;       const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
;       const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
;       const float x4 = x3 + (-4.0f * w2) + w3;
;
;       w0 = x1;
;       w1 = x2;
;       w2 = x3;
;       w3 = x4;
;     }
;
;     array[i * 4 + 0] = w0;
;     array[i * 4 + 1] = w1;
;     array[i * 4 + 2] = w2;
;     array[i * 4 + 3] = w3;
;   }
; }
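;
; Here the inner loop is a loop-carried recurrence over w0..w3 rather than a
; reduction, so no vector code is expected: the CHECK and STORE bodies below
; are the same scalar IR.
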
define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2
; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; CHECK-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; CHECK: for.body16.lr.ph:
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
; CHECK-NEXT: br label [[FOR_BODY16:%.*]]
; CHECK: for.cond.cleanup15:
; CHECK-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
; CHECK-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
; CHECK-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
; CHECK-NEXT: store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; CHECK-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; CHECK: for.body16:
; CHECK-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
; CHECK-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; CHECK-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
; CHECK-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
; CHECK-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
; CHECK-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
; CHECK-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; CHECK-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
; CHECK-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
; CHECK-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
; CHECK-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; CHECK-NEXT: [[INC]] = add nuw i32 [[J_098]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
; STORE-LABEL: @foo(
; STORE-NEXT: entry:
; STORE-NEXT: [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
; STORE-NEXT: br label [[FOR_BODY:%.*]]
; STORE: for.cond.cleanup:
; STORE-NEXT: ret void
; STORE: for.body:
; STORE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; STORE-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; STORE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
; STORE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
; STORE-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
; STORE-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; STORE-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2
; STORE-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
; STORE-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
; STORE-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3
; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
; STORE-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; STORE-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; STORE: for.body16.lr.ph:
; STORE-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
; STORE-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
; STORE-NEXT: br label [[FOR_BODY16:%.*]]
; STORE: for.cond.cleanup15:
; STORE-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
; STORE-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
; STORE-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
; STORE-NEXT: store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4
; STORE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; STORE-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; STORE-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; STORE: for.body16:
; STORE-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
; STORE-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; STORE-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
; STORE-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
; STORE-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
; STORE-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
; STORE-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
; STORE-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; STORE-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
; STORE-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
; STORE-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
; STORE-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
; STORE-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
; STORE-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
; STORE-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
; STORE-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
; STORE-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; STORE-NEXT: [[INC]] = add nuw i32 [[J_098]], 1
; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
entry:
  %cmp1495 = icmp eq i32 %arg_B, 0
  br label %for.body

for.cond.cleanup: ; preds = %for.cond.cleanup15
  ret void

for.body: ; preds = %for.cond.cleanup15, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
  %0 = shl i64 %indvars.iv, 2
  %arrayidx = getelementptr inbounds float, float* %array, i64 %0
  %1 = load float, float* %arrayidx, align 4
  %2 = or i64 %0, 1
  %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
  %3 = load float, float* %arrayidx4, align 4
  %4 = or i64 %0, 2
  %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
  %5 = load float, float* %arrayidx8, align 4
  %6 = or i64 %0, 3
  %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
  %7 = load float, float* %arrayidx12, align 4
  br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph

for.body16.lr.ph: ; preds = %for.body
  %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
  %8 = load float, float* %add.ptr, align 4
  br label %for.body16

for.cond.cleanup15: ; preds = %for.body16, %for.body
  %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
  %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
  %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
  %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
  store float %w0.0.lcssa, float* %arrayidx, align 4
  store float %w1.0.lcssa, float* %arrayidx4, align 4
  store float %w2.0.lcssa, float* %arrayidx8, align 4
  store float %w3.0.lcssa, float* %arrayidx12, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond109 = icmp eq i64 %indvars.iv.next, 6
  br i1 %exitcond109, label %for.cond.cleanup, label %for.body

for.body16: ; preds = %for.body16, %for.body16.lr.ph
  %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
  %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
  %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
  %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
  %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
  %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
  %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
  %sub92 = fadd fast float %mul17, %mul18.neg
  %sub19 = fadd fast float %sub92, %8
  %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
  %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
  %mul23 = fmul fast float %w1.099, 0x4002666660000000
  %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
  %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
  %add2293 = fadd fast float %mul27.neg, %mul25
  %add24 = fadd fast float %add2293, %mul23
  %sub2694 = fadd fast float %add24, %mul21.neg
  %sub28 = fadd fast float %sub2694, %mul20
  %inc = add nuw i32 %j.098, 1
  %exitcond = icmp eq i32 %inc, %arg_B
  br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
}

; void foo(double * restrict A, double * restrict B, double * restrict C,
;          int n) {
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
;   }
; }
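;
; A two-element dot product stored to C[i]. Only the STORE run (with
; -slp-vectorize-hor-store) is expected to form the <2 x double> load and
; fmul; the default run keeps both multiplies scalar.
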
define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
; CHECK-LABEL: @store_red_double(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK: for.body.lr.ph:
; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[B:%.*]], align 8
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX4]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[MUL3:%.*]] = fmul fast double [[TMP0]], [[TMP3]]
; CHECK-NEXT: [[ADD16:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[ADD16]]
; CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX6]], align 8
; CHECK-NEXT: [[MUL7:%.*]] = fmul fast double [[TMP1]], [[TMP4]]
; CHECK-NEXT: [[ADD8:%.*]] = fadd fast double [[MUL3]], [[MUL7]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
; CHECK-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8
; CHECK-NEXT: [[INC]] = add nsw i64 [[I_018]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
; STORE-LABEL: @store_red_double(
; STORE-NEXT: entry:
; STORE-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE: for.body.lr.ph:
; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
; STORE-NEXT: [[TMP0:%.*]] = bitcast double* [[B]] to <2 x double>*
; STORE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; STORE-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT: br label [[FOR_BODY:%.*]]
; STORE: for.body:
; STORE-NEXT: [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT: [[ADD16:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[ADD16]]
; STORE-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>*
; STORE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
; STORE-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP4]]
; STORE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; STORE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; STORE-NEXT: [[ADD8:%.*]] = fadd fast double [[TMP6]], [[TMP7]]
; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
; STORE-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8
; STORE-NEXT: [[INC]] = add nsw i64 [[I_018]], 1
; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; STORE: for.end:
; STORE-NEXT: ret void
;
entry:
  %cmp17 = icmp sgt i32 %n, 0
  br i1 %cmp17, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load double, double* %B, align 8
  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
  %1 = load double, double* %arrayidx4, align 8
  %2 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %mul = shl nsw i64 %i.018, 2
  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
  %3 = load double, double* %arrayidx2, align 8
  %mul3 = fmul fast double %0, %3
  %add16 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
  %4 = load double, double* %arrayidx6, align 8
  %mul7 = fmul fast double %1, %4
  %add8 = fadd fast double %mul3, %mul7
  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
  store double %add8, double* %arrayidx9, align 8
  %inc = add nsw i64 %i.018, 1
  %exitcond = icmp eq i64 %inc, %2
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] +
;            B[1] *A[i*4+1] +
;            B[2] *A[i*4+2] +
;            B[3] *A[i*4+3];
;   }
;   return 0;
; }
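;
; The same pattern with four floats: the STORE run reduces the <4 x float>
; products with @llvm.vector.reduce.fadd.v4f32 and stores the result through
; C, while the default run stays scalar.
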
define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
; CHECK-LABEL: @store_red(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK: for.body.lr.ph:
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[B]], align 4
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[ADD34:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]]
; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX6]], align 4
; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[TMP3]], [[TMP4]]
; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float [[MUL3]], [[MUL7]]
; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[ADD1135:%.*]] = or i64 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]]
; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[ADD14:%.*]] = fadd fast float [[ADD8]], [[MUL13]]
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX15]], align 4
; CHECK-NEXT: [[ADD1736:%.*]] = or i64 [[MUL]], 3
; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]]
; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX18]], align 4
; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP7]], [[TMP8]]
; CHECK-NEXT: [[ADD20:%.*]] = fadd fast float [[ADD14]], [[MUL19]]
; CHECK-NEXT: store float [[ADD20]], float* [[C_ADDR_038]], align 4
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
; CHECK-NEXT: [[INC]] = add nsw i64 [[I_039]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret i32 0
;
; STORE-LABEL: @store_red(
; STORE-NEXT: entry:
; STORE-NEXT: [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT: br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE: for.body.lr.ph:
; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; STORE-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; STORE-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT: br label [[FOR_BODY:%.*]]
; STORE: for.body:
; STORE-NEXT: [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT: [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT: [[ADD34:%.*]] = or i64 [[MUL]], 1
; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]]
; STORE-NEXT: [[ADD1135:%.*]] = or i64 [[MUL]], 2
; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]]
; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[B]] to <4 x float>*
; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; STORE-NEXT: [[ADD1736:%.*]] = or i64 [[MUL]], 3
; STORE-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]]
; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; STORE-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]]
; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; STORE-NEXT: store float [[TMP6]], float* [[C_ADDR_038]], align 4
; STORE-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
; STORE-NEXT: [[INC]] = add nsw i64 [[I_039]], 1
; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; STORE: for.end:
; STORE-NEXT: ret i32 0
;
entry:
  %cmp37 = icmp sgt i32 %n, 0
  br i1 %cmp37, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  %1 = load float, float* %B, align 4
  %mul = shl nsw i64 %i.039, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %2 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %1, %2
  %3 = load float, float* %arrayidx4, align 4
  %add34 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add34
  %4 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %3, %4
  %add8 = fadd fast float %mul3, %mul7
  %5 = load float, float* %arrayidx9, align 4
  %add1135 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1135
  %6 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %5, %6
  %add14 = fadd fast float %add8, %mul13
  %7 = load float, float* %arrayidx15, align 4
  %add1736 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1736
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %7, %8
  %add20 = fadd fast float %add14, %mul19
  store float %add20, float* %C.addr.038, align 4
  %incdec.ptr = getelementptr inbounds float, float* %C.addr.038, i64 1
  %inc = add nsw i64 %i.039, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret i32 0
}

@arr_i32 = global [32 x i32] zeroinitializer, align 16
@arr_float = global [32 x float] zeroinitializer, align 16
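
; Reductions over the global @arr_float whose result is only stored through
; %res: the default run is expected to keep the scalar fadd chain, while the
; STORE run collapses it into a single @llvm.vector.reduce.fadd call.
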
define void @float_red_example4(float* %res) {
; CHECK-LABEL: @float_red_example4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
; CHECK-NEXT: store float [[ADD_2]], float* [[RES:%.*]], align 16
; CHECK-NEXT: ret void
1090 ; STORE-LABEL: @float_red_example4(
1091 ; STORE-NEXT: entry:
1092 ; STORE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16
1093 ; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]])
1094 ; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
1095 ; STORE-NEXT: ret void
1098 %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
1099 %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
1100 %add = fadd fast float %1, %0
1101 %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
1102 %add.1 = fadd fast float %2, %add
1103 %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
1104 %add.2 = fadd fast float %3, %add.1
1105 store float %add.2, float* %res, align 16
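
; Hedged C sketch (assumed source shape) of @float_red_example4; the wider
; @float_red_example8/16 variants below unroll the same pattern further:
;   *res = arr_float[0] + arr_float[1] + arr_float[2] + arr_float[3];
; The sum only feeds a store, so the plain run keeps the scalar fadd chain
; while the STORE run (with -slp-vectorize-hor-store) forms a vector reduction.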
define void @float_red_example8(float* %res) {
; CHECK-LABEL: @float_red_example8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
; CHECK-NEXT: store float [[ADD_6]], float* [[RES:%.*]], align 16
; CHECK-NEXT: ret void
;
; STORE-LABEL: @float_red_example8(
; STORE-NEXT: entry:
; STORE-NEXT: [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16
; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
; STORE-NEXT: ret void
;
entry:
%0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
%1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
%add = fadd fast float %1, %0
%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
%add.1 = fadd fast float %2, %add
%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
%add.2 = fadd fast float %3, %add.1
%4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
%add.3 = fadd fast float %4, %add.2
%5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
%add.4 = fadd fast float %5, %add.3
%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
%add.5 = fadd fast float %6, %add.4
%7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
%add.6 = fadd fast float %7, %add.5
store float %add.6, float* %res, align 16
ret void
}
define void @float_red_example16(float* %res) {
; CHECK-LABEL: @float_red_example16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float [[TMP8]], [[ADD_6]]
; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float [[TMP9]], [[ADD_7]]
; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float [[TMP10]], [[ADD_8]]
; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float [[TMP11]], [[ADD_9]]
; CHECK-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float [[TMP12]], [[ADD_10]]
; CHECK-NEXT: [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float [[TMP13]], [[ADD_11]]
; CHECK-NEXT: [[TMP14:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float [[TMP14]], [[ADD_12]]
; CHECK-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float [[TMP15]], [[ADD_13]]
; CHECK-NEXT: store float [[ADD_14]], float* [[RES:%.*]], align 16
; CHECK-NEXT: ret void
;
; STORE-LABEL: @float_red_example16(
; STORE-NEXT: entry:
; STORE-NEXT: [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16
; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP0]])
; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
; STORE-NEXT: ret void
;
entry:
%0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
%1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
%add = fadd fast float %1, %0
%2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
%add.1 = fadd fast float %2, %add
%3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
%add.2 = fadd fast float %3, %add.1
%4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
%add.3 = fadd fast float %4, %add.2
%5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
%add.4 = fadd fast float %5, %add.3
%6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
%add.5 = fadd fast float %6, %add.4
%7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
%add.6 = fadd fast float %7, %add.5
%8 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
%add.7 = fadd fast float %8, %add.6
%9 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
%add.8 = fadd fast float %9, %add.7
%10 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
%add.9 = fadd fast float %10, %add.8
%11 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
%add.10 = fadd fast float %11, %add.9
%12 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
%add.11 = fadd fast float %12, %add.10
%13 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
%add.12 = fadd fast float %13, %add.11
%14 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
%add.13 = fadd fast float %14, %add.12
%15 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
%add.14 = fadd fast float %15, %add.13
store float %add.14, float* %res, align 16
ret void
}
define void @i32_red_example4(i32* %res) {
; CHECK-LABEL: @i32_red_example4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT: store i32 [[ADD_2]], i32* [[RES:%.*]], align 16
; CHECK-NEXT: ret void
;
; STORE-LABEL: @i32_red_example4(
; STORE-NEXT: entry:
; STORE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT: ret void
;
entry:
%0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
%1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
%add = add nsw i32 %1, %0
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
%add.1 = add nsw i32 %2, %add
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
%add.2 = add nsw i32 %3, %add.1
store i32 %add.2, i32* %res, align 16
ret void
}
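
; Hedged C sketch (assumed) of the integer counterpart; @i32_red_example8/16/32
; below unroll the same pattern to wider widths:
;   *res = arr_i32[0] + arr_i32[1] + arr_i32[2] + arr_i32[3];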
define void @i32_red_example8(i32* %res) {
; CHECK-LABEL: @i32_red_example8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
; CHECK-NEXT: store i32 [[ADD_6]], i32* [[RES:%.*]], align 16
; CHECK-NEXT: ret void
;
; STORE-LABEL: @i32_red_example8(
; STORE-NEXT: entry:
; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT: ret void
;
entry:
%0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
%1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
%add = add nsw i32 %1, %0
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
%add.1 = add nsw i32 %2, %add
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
%add.2 = add nsw i32 %3, %add.1
%4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
%add.3 = add nsw i32 %4, %add.2
%5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
%add.4 = add nsw i32 %5, %add.3
%6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
%add.5 = add nsw i32 %6, %add.4
%7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
%add.6 = add nsw i32 %7, %add.5
store i32 %add.6, i32* %res, align 16
ret void
}
define void @i32_red_example16(i32* %res) {
; CHECK-LABEL: @i32_red_example16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
; CHECK-NEXT: [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
; CHECK-NEXT: [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
; CHECK-NEXT: [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
; CHECK-NEXT: [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
; CHECK-NEXT: [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
; CHECK-NEXT: [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
; CHECK-NEXT: [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
; CHECK-NEXT: [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
; CHECK-NEXT: store i32 [[ADD_14]], i32* [[RES:%.*]], align 16
; CHECK-NEXT: ret void
;
; STORE-LABEL: @i32_red_example16(
; STORE-NEXT: entry:
; STORE-NEXT: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]])
; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT: ret void
;
entry:
%0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
%1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
%add = add nsw i32 %1, %0
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
%add.1 = add nsw i32 %2, %add
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
%add.2 = add nsw i32 %3, %add.1
%4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
%add.3 = add nsw i32 %4, %add.2
%5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
%add.4 = add nsw i32 %5, %add.3
%6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
%add.5 = add nsw i32 %6, %add.4
%7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
%add.6 = add nsw i32 %7, %add.5
%8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
%add.7 = add nsw i32 %8, %add.6
%9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
%add.8 = add nsw i32 %9, %add.7
%10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
%add.9 = add nsw i32 %10, %add.8
%11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
%add.10 = add nsw i32 %11, %add.9
%12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
%add.11 = add nsw i32 %12, %add.10
%13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
%add.12 = add nsw i32 %13, %add.11
%14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
%add.13 = add nsw i32 %14, %add.12
%15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
%add.14 = add nsw i32 %15, %add.13
store i32 %add.14, i32* %res, align 16
ret void
}
define void @i32_red_example32(i32* %res) {
; CHECK-LABEL: @i32_red_example32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
; CHECK-NEXT: [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
; CHECK-NEXT: [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
; CHECK-NEXT: [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
; CHECK-NEXT: [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
; CHECK-NEXT: [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
; CHECK-NEXT: [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
; CHECK-NEXT: [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
; CHECK-NEXT: [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
; CHECK-NEXT: [[ADD_15:%.*]] = add nsw i32 [[TMP16]], [[ADD_14]]
; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
; CHECK-NEXT: [[ADD_16:%.*]] = add nsw i32 [[TMP17]], [[ADD_15]]
; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
; CHECK-NEXT: [[ADD_17:%.*]] = add nsw i32 [[TMP18]], [[ADD_16]]
; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
; CHECK-NEXT: [[ADD_18:%.*]] = add nsw i32 [[TMP19]], [[ADD_17]]
; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
; CHECK-NEXT: [[ADD_19:%.*]] = add nsw i32 [[TMP20]], [[ADD_18]]
; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
; CHECK-NEXT: [[ADD_20:%.*]] = add nsw i32 [[TMP21]], [[ADD_19]]
; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
; CHECK-NEXT: [[ADD_21:%.*]] = add nsw i32 [[TMP22]], [[ADD_20]]
; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
; CHECK-NEXT: [[ADD_22:%.*]] = add nsw i32 [[TMP23]], [[ADD_21]]
; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
; CHECK-NEXT: [[ADD_23:%.*]] = add nsw i32 [[TMP24]], [[ADD_22]]
; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
; CHECK-NEXT: [[ADD_24:%.*]] = add nsw i32 [[TMP25]], [[ADD_23]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
; CHECK-NEXT: [[ADD_25:%.*]] = add nsw i32 [[TMP26]], [[ADD_24]]
; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
; CHECK-NEXT: [[ADD_26:%.*]] = add nsw i32 [[TMP27]], [[ADD_25]]
; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
; CHECK-NEXT: [[ADD_27:%.*]] = add nsw i32 [[TMP28]], [[ADD_26]]
; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
; CHECK-NEXT: [[ADD_28:%.*]] = add nsw i32 [[TMP29]], [[ADD_27]]
; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
; CHECK-NEXT: [[ADD_29:%.*]] = add nsw i32 [[TMP30]], [[ADD_28]]
; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
; CHECK-NEXT: [[ADD_30:%.*]] = add nsw i32 [[TMP31]], [[ADD_29]]
; CHECK-NEXT: store i32 [[ADD_30]], i32* [[RES:%.*]], align 16
; CHECK-NEXT: ret void
;
; STORE-LABEL: @i32_red_example32(
; STORE-NEXT: entry:
; STORE-NEXT: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]])
; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT: ret void
;
entry:
%0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
%1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
%add = add nsw i32 %1, %0
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
%add.1 = add nsw i32 %2, %add
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
%add.2 = add nsw i32 %3, %add.1
%4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
%add.3 = add nsw i32 %4, %add.2
%5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
%add.4 = add nsw i32 %5, %add.3
%6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
%add.5 = add nsw i32 %6, %add.4
%7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
%add.6 = add nsw i32 %7, %add.5
%8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
%add.7 = add nsw i32 %8, %add.6
%9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
%add.8 = add nsw i32 %9, %add.7
%10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
%add.9 = add nsw i32 %10, %add.8
%11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
%add.10 = add nsw i32 %11, %add.9
%12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
%add.11 = add nsw i32 %12, %add.10
%13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
%add.12 = add nsw i32 %13, %add.11
%14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
%add.13 = add nsw i32 %14, %add.12
%15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
%add.14 = add nsw i32 %15, %add.13
%16 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
%add.15 = add nsw i32 %16, %add.14
%17 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
%add.16 = add nsw i32 %17, %add.15
%18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
%add.17 = add nsw i32 %18, %add.16
%19 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
%add.18 = add nsw i32 %19, %add.17
%20 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
%add.19 = add nsw i32 %20, %add.18
%21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
%add.20 = add nsw i32 %21, %add.19
%22 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
%add.21 = add nsw i32 %22, %add.20
%23 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
%add.22 = add nsw i32 %23, %add.21
%24 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
%add.23 = add nsw i32 %24, %add.22
%25 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
%add.24 = add nsw i32 %25, %add.23
%26 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
%add.25 = add nsw i32 %26, %add.24
%27 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
%add.26 = add nsw i32 %27, %add.25
%28 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
%add.27 = add nsw i32 %28, %add.26
%29 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
%add.28 = add nsw i32 %29, %add.27
%30 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
%add.29 = add nsw i32 %30, %add.28
%31 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
%add.30 = add nsw i32 %31, %add.29
store i32 %add.30, i32* %res, align 16
ret void
}
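
; Our observation from the STORE checks above: the reduction is formed even at
; <32 x i32>, wider than a single AVX register, on the assumption that the
; backend later legalizes the wide load and reduce call by splitting.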
declare i32 @foobar(i32)

define void @i32_red_call(i32 %val) {
; CHECK-LABEL: @i32_red_call(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
; CHECK-NEXT: ret void
;
; STORE-LABEL: @i32_red_call(
; STORE-NEXT: entry:
; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; STORE-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
; STORE-NEXT: ret void
;
entry:
%0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
%1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
%add = add nsw i32 %1, %0
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
%add.1 = add nsw i32 %2, %add
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
%add.2 = add nsw i32 %3, %add.1
%4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
%add.3 = add nsw i32 %4, %add.2
%5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
%add.4 = add nsw i32 %5, %add.3
%6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
%add.5 = add nsw i32 %6, %add.4
%7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
%add.6 = add nsw i32 %7, %add.5
%res = call i32 @foobar(i32 %add.6)
ret void
}
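
; Hedged C sketch (assumed) of the pattern above; here the sum feeds a call
; rather than a store, and both runs vectorize it:
;   foobar(arr_i32[0] + arr_i32[1] + ... + arr_i32[7]);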
define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 {
; CHECK-LABEL: @i32_red_invoke(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
; CHECK: exception:
; CHECK-NEXT: [[CLEANUP:%.*]] = landingpad i8
; CHECK-NEXT: cleanup
; CHECK-NEXT: br label [[NORMAL]]
; CHECK: normal:
; CHECK-NEXT: ret void
;
; STORE-LABEL: @i32_red_invoke(
; STORE-NEXT: entry:
; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; STORE-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
; STORE-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
; STORE: exception:
; STORE-NEXT: [[CLEANUP:%.*]] = landingpad i8
; STORE-NEXT: cleanup
; STORE-NEXT: br label [[NORMAL]]
; STORE: normal:
; STORE-NEXT: ret void
;
entry:
%0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
%1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
%add = add nsw i32 %1, %0
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
%add.1 = add nsw i32 %2, %add
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
%add.2 = add nsw i32 %3, %add.1
%4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
%add.3 = add nsw i32 %4, %add.2
%5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
%add.4 = add nsw i32 %5, %add.3
%6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
%add.5 = add nsw i32 %6, %add.4
%7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
%add.6 = add nsw i32 %7, %add.5
%res = invoke i32 @foobar(i32 %add.6) to label %normal unwind label %exception

exception:
%cleanup = landingpad i8 cleanup
br label %normal

normal:
ret void
}
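
; Note (our reading of the checks above): an invoke user behaves like the call
; user in the previous test; the 8-wide add chain is still reduced in both runs.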
; Test case from PR47670. The reduction result is used as an incoming value in a phi.
define i32 @reduction_result_used_in_phi(i32* nocapture readonly %data, i1 zeroext %b) {
; CHECK-LABEL: @reduction_result_used_in_phi(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; CHECK: bb:
; CHECK-NEXT: [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
; CHECK-NEXT: [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
; CHECK-NEXT: [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
; CHECK-NEXT: ret i32 [[SUM_1]]
;
; STORE-LABEL: @reduction_result_used_in_phi(
; STORE-NEXT: entry:
; STORE-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; STORE: bb:
; STORE-NEXT: [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
; STORE-NEXT: [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
; STORE-NEXT: [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
; STORE-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
; STORE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; STORE-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; STORE-NEXT: br label [[EXIT]]
; STORE: exit:
; STORE-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
; STORE-NEXT: ret i32 [[SUM_1]]
;
entry:
br i1 %b, label %bb, label %exit

bb:
%l.0 = load i32, i32* %data, align 4
%idx.1 = getelementptr inbounds i32, i32* %data, i64 1
%l.1 = load i32, i32* %idx.1, align 4
%add.1 = add i32 %l.1, %l.0
%idx.2 = getelementptr inbounds i32, i32* %data, i64 2
%l.2 = load i32, i32* %idx.2, align 4
%add.2 = add i32 %l.2, %add.1
%idx.3 = getelementptr inbounds i32, i32* %data, i64 3
%l.3 = load i32, i32* %idx.3, align 4
%add.3 = add i32 %l.3, %add.2
br label %exit

exit:
%sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb ]
ret i32 %sum.1
}
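
; Hedged C sketch (assumed) of the PR47670 shape:
;   int sum = 0;
;   if (b)
;     sum = data[0] + data[1] + data[2] + data[3];
;   return sum;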
define i32 @reduction_result_used_in_phi_loop(i32* nocapture readonly %data, i1 zeroext %b) {
; CHECK-LABEL: @reduction_result_used_in_phi_loop(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; CHECK: bb:
; CHECK-NEXT: [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
; CHECK-NEXT: [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
; CHECK-NEXT: [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
; CHECK-NEXT: ret i32 [[SUM_1]]
;
; STORE-LABEL: @reduction_result_used_in_phi_loop(
; STORE-NEXT: entry:
; STORE-NEXT: br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; STORE: bb:
; STORE-NEXT: [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
; STORE-NEXT: [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
; STORE-NEXT: [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
; STORE-NEXT: [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
; STORE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; STORE-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; STORE-NEXT: br label [[EXIT]]
; STORE: exit:
; STORE-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
; STORE-NEXT: ret i32 [[SUM_1]]
;
entry:
br i1 %b, label %bb, label %exit

bb:
%l.0 = load i32, i32* %data, align 4
%idx.1 = getelementptr inbounds i32, i32* %data, i64 1
%l.1 = load i32, i32* %idx.1, align 4
%add.1 = add i32 %l.1, %l.0
%idx.2 = getelementptr inbounds i32, i32* %data, i64 2
%l.2 = load i32, i32* %idx.2, align 4
%add.2 = add i32 %l.2, %add.1
%idx.3 = getelementptr inbounds i32, i32* %data, i64 3
%l.3 = load i32, i32* %idx.3, align 4
%add.3 = add i32 %l.3, %add.2
br label %exit

exit:
%sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb ]
ret i32 %sum.1
}
; Make sure we do not crash or infinite-loop on ill-formed IR.

define void @unreachable_block() {
; CHECK-LABEL: @unreachable_block(
; CHECK-NEXT: bb.0:
; CHECK-NEXT: br label [[BB_1:%.*]]
; CHECK: dead:
; CHECK-NEXT: [[T0:%.*]] = add i16 [[T0]], undef
; CHECK-NEXT: br label [[BB_1]]
; CHECK: bb.1:
; CHECK-NEXT: [[T1:%.*]] = phi i16 [ undef, [[BB_0:%.*]] ], [ [[T0]], [[DEAD:%.*]] ]
; CHECK-NEXT: ret void
;
; STORE-LABEL: @unreachable_block(
; STORE-NEXT: bb.0:
; STORE-NEXT: br label [[BB_1:%.*]]
; STORE: dead:
; STORE-NEXT: [[T0:%.*]] = add i16 [[T0]], undef
; STORE-NEXT: br label [[BB_1]]
; STORE: bb.1:
; STORE-NEXT: [[T1:%.*]] = phi i16 [ undef, [[BB_0:%.*]] ], [ [[T0]], [[DEAD:%.*]] ]
; STORE-NEXT: ret void
;
bb.0:
br label %bb.1

dead:
%t0 = add i16 %t0, undef ; unreachable IR may depend on itself
br label %bb.1

bb.1:
%t1 = phi i16 [ undef, %bb.0 ], [ %t0, %dead ]
ret void
}
; The FMF on the reduction should match those of the incoming instructions.

define float @fadd_v4f32_fmf(float* %p) {
; CHECK-LABEL: @fadd_v4f32_fmf(
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
; CHECK-NEXT: ret float [[TMP3]]
;
; STORE-LABEL: @fadd_v4f32_fmf(
; STORE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
; STORE-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
; STORE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; STORE-NEXT: [[TMP3:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
; STORE-NEXT: ret float [[TMP3]]
;
%p1 = getelementptr inbounds float, float* %p, i64 1
%p2 = getelementptr inbounds float, float* %p, i64 2
%p3 = getelementptr inbounds float, float* %p, i64 3
%t0 = load float, float* %p, align 4
%t1 = load float, float* %p1, align 4
%t2 = load float, float* %p2, align 4
%t3 = load float, float* %p3, align 4
%add1 = fadd reassoc nsz float %t1, %t0
%add2 = fadd reassoc nsz float %t2, %add1
%add3 = fadd reassoc nsz float %t3, %add2
ret float %add3
}
; The minimal FMF required to form an fadd reduction are "reassoc nsz".
; Only the FMF common to all operations in the reduction propagate to the result.
; In this example, "contract", "nnan", and "arcp" are dropped, but "ninf" transfers along with the required flags.
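
; Worked example (our summary of the flag sets on the fadds below):
;   {ninf, reassoc, nsz, nnan}
; & {ninf, reassoc, nsz, nnan, arcp}
; & {ninf, reassoc, nsz, contract}
; = {ninf, reassoc, nsz}
; so the vector reduce call is expected to carry "reassoc ninf nsz".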
define float @fadd_v4f32_fmf_intersect(float* %p) {
; CHECK-LABEL: @fadd_v4f32_fmf_intersect(
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = call reassoc ninf nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
; CHECK-NEXT: ret float [[TMP3]]
;
; STORE-LABEL: @fadd_v4f32_fmf_intersect(
; STORE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
; STORE-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
; STORE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; STORE-NEXT: [[TMP3:%.*]] = call reassoc ninf nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
; STORE-NEXT: ret float [[TMP3]]
;
%p1 = getelementptr inbounds float, float* %p, i64 1
%p2 = getelementptr inbounds float, float* %p, i64 2
%p3 = getelementptr inbounds float, float* %p, i64 3
%t0 = load float, float* %p, align 4
%t1 = load float, float* %p1, align 4
%t2 = load float, float* %p2, align 4
%t3 = load float, float* %p3, align 4
%add1 = fadd ninf reassoc nsz nnan float %t1, %t0
%add2 = fadd ninf reassoc nsz nnan arcp float %t2, %add1
%add3 = fadd ninf reassoc nsz contract float %t3, %add2
ret float %add3
}
declare i32 @__gxx_personality_v0(...)