; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=true -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
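
; The RUN lines cover the interesting flag combinations: the loop is left
; unvectorized when ordered reductions are disallowed and the hints do not
; allow reordering; it is vectorized with in-loop (strict) reductions when
; ordered reductions are forced (or enabled by default, as in the last RUN
; line) and reordering is not allowed; and allowing reordering always selects
; the unordered form.
;
; The first test below is a plain strict reduction, roughly:
;   float sum = 0.0;
;   for (int i = 0; i < n; ++i)
;     sum += a[i];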
define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_strict
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
; CHECK-ORDERED: %[[LOAD:.*]] = load <8 x float>, ptr
; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD]])
; CHECK-ORDERED: for.end
; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-ORDERED: ret float %[[PHI]]

; CHECK-UNORDERED-LABEL: @fadd_strict
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ]
; CHECK-UNORDERED: %[[LOAD_VEC:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[FADD_VEC]] = fadd <8 x float> %[[LOAD_VEC]], %[[VEC_PHI]]
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[FADD_VEC]])
; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: %[[LOAD:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}}
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RES]]

; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %add = fadd float %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret float %add
}

; Same as above but where fadd has a fast-math flag.
define float @fadd_strict_fmf(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_strict_fmf
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX:%.*]], %vector.body ]
; CHECK-ORDERED: [[LOAD_VEC:%.*]] = load <8 x float>, ptr
; CHECK-ORDERED: [[RDX]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[LOAD_VEC]])
; CHECK-ORDERED: for.end:
; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX]], %middle.block ]
; CHECK-ORDERED: ret float [[RES]]

; CHECK-UNORDERED-LABEL: @fadd_strict_fmf
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FADD_VEC:%.*]], %vector.body ]
; CHECK-UNORDERED: [[LOAD_VEC:%.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: [[FADD_VEC]] = fadd nnan <8 x float> [[LOAD_VEC]], [[VEC_PHI]]
; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block:
; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[FADD_VEC]])
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED: [[LOAD:%.*]] = load float, ptr
; CHECK-UNORDERED: [[FADD:%.*]] = fadd nnan float [[LOAD]], {{.*}}
; CHECK-UNORDERED: for.end:
; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[FADD]], %for.body ], [ [[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float [[RES]]

; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_fmf
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %add = fadd nnan float %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret float %add
}

define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_strict_unroll
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
; CHECK-ORDERED: %[[LOAD1:.*]] = load <8 x float>, ptr
; CHECK-ORDERED: %[[LOAD2:.*]] = load <8 x float>, ptr
; CHECK-ORDERED: %[[LOAD3:.*]] = load <8 x float>, ptr
; CHECK-ORDERED: %[[LOAD4:.*]] = load <8 x float>, ptr
; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
; CHECK-ORDERED: for.end
; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
; CHECK-ORDERED: ret float %[[PHI]]

; CHECK-UNORDERED-LABEL: @fadd_strict_unroll
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <8 x float> [ splat (float -0.000000e+00), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi <8 x float> [ splat (float -0.000000e+00), %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi <8 x float> [ splat (float -0.000000e+00), %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <8 x float> %[[VEC_LOAD1]], %[[VEC_PHI1]]
; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_LOAD2]], %[[VEC_PHI2]]
; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <8 x float> %[[VEC_LOAD3]], %[[VEC_PHI3]]
; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_LOAD4]], %[[VEC_PHI4]]
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]]
; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]]
; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]]
; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]])
; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: %[[LOAD:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}}
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RES]]

; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %add = fadd float %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret float %add
}

; An additional test for unrolling where we need the last value of the reduction, i.e.:
; float sum = 0, sum2;
; for(int i=0; i<N; ++i) {
;   sum += a[i];
; }
; sum2 = sum + 42;
; *b = sum2;
; return sum;
define float @fadd_strict_unroll_last_val(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_strict_unroll_last_val
; CHECK-ORDERED: vector.body
; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
; CHECK-ORDERED: %[[LOAD1:.*]] = load <8 x float>, ptr
; CHECK-ORDERED: %[[LOAD2:.*]] = load <8 x float>, ptr
; CHECK-ORDERED: %[[LOAD3:.*]] = load <8 x float>, ptr
; CHECK-ORDERED: %[[LOAD4:.*]] = load <8 x float>, ptr
; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
; CHECK-ORDERED: for.body
; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ {{.*}}, %scalar.ph ]
; CHECK-ORDERED: %[[LOAD5:.*]] = load float, ptr
; CHECK-ORDERED: %[[FADD]] = fadd float %[[SUM_PHI]], %[[LOAD5]]
; CHECK-ORDERED: for.cond.cleanup
; CHECK-ORDERED: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX4]], %middle.block ]
; CHECK-ORDERED: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01
; CHECK-ORDERED: store float %[[FADD_42]], ptr %b
; CHECK-ORDERED: for.end
; CHECK-ORDERED: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ]
; CHECK-ORDERED: ret float %[[SUM_LCSSA]]

; CHECK-UNORDERED-LABEL: @fadd_strict_unroll_last_val
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <8 x float> [ splat (float -0.000000e+00), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi <8 x float> [ splat (float -0.000000e+00), %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi <8 x float> [ splat (float -0.000000e+00), %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <8 x float> %[[VEC_PHI1]], %[[VEC_LOAD1]]
; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_PHI2]], %[[VEC_LOAD2]]
; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <8 x float> %[[VEC_PHI3]], %[[VEC_LOAD3]]
; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_PHI4]], %[[VEC_LOAD4]]
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]]
; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]]
; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]]
; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]])
; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: %[[LOAD:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD:.*]] = fadd float {{.*}}, %[[LOAD]]
; CHECK-UNORDERED: for.cond.cleanup
; CHECK-UNORDERED: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01
; CHECK-UNORDERED: store float %[[FADD_42]], ptr %b
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ]
; CHECK-UNORDERED: ret float %[[SUM_LCSSA]]

; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll_last_val
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  %cmp = icmp sgt i64 %n, 0
  br i1 %cmp, label %for.body, label %for.end

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum = phi float [ 0.000000e+00, %entry ], [ %fadd, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %fadd = fadd float %sum, %0
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1

for.cond.cleanup:
  %fadd.lcssa = phi float [ %fadd, %for.body ]
  %fadd2 = fadd float %fadd.lcssa, 4.200000e+01
  store float %fadd2, ptr %b, align 4
  br label %for.end

for.end:
  %sum.lcssa = phi float [ %fadd.lcssa, %for.cond.cleanup ], [ 0.000000e+00, %entry ]
  ret float %sum.lcssa
}
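
; Strict reductions over interleaved elements, roughly:
;   float s0 = a[0], s1 = a[1];
;   for (int i = 0; i < n; i += 2) {
;     s0 += b[i];
;     s1 += b[i + 1];
;   }
;   a[0] = s0;
;   a[1] = s1;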
define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_strict_interleave
; CHECK-ORDERED: entry
; CHECK-ORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, ptr %a, i64 1
; CHECK-ORDERED: %[[LOAD1:.*]] = load float, ptr %a
; CHECK-ORDERED: %[[LOAD2:.*]] = load float, ptr %[[ARRAYIDX]]
; CHECK-ORDERED: vector.body
; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
; CHECK-ORDERED: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
; CHECK-ORDERED: %[[WIDE_LOAD:.*]] = load <8 x float>, ptr
; CHECK-ORDERED: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-ORDERED: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-ORDERED: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[STRIDED2]])
; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[STRIDED1]])
; CHECK-ORDERED: for.end
; CHECK-ORDERED: ret void

; CHECK-UNORDERED-LABEL: @fadd_strict_interleave
; CHECK-UNORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, ptr %a, i64 1
; CHECK-UNORDERED: %[[LOADA1:.*]] = load float, ptr %a
; CHECK-UNORDERED: %[[LOADA2:.*]] = load float, ptr %[[ARRAYIDX]]
; CHECK-UNORDERED: vector.ph
; CHECK-UNORDERED: %[[INS2:.*]] = insertelement <4 x float> splat (float -0.000000e+00), float %[[LOADA2]], i32 0
; CHECK-UNORDERED: %[[INS1:.*]] = insertelement <4 x float> splat (float -0.000000e+00), float %[[LOADA1]], i32 0
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <4 x float> [ %[[INS2]], %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <4 x float> [ %[[INS1]], %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
; CHECK-UNORDERED: %[[WIDE_LOAD:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-UNORDERED: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <4 x float> %[[STRIDED1:.*]], %[[VEC_PHI1]]
; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[STRIDED2:.*]], %[[VEC_PHI2]]
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]])
; CHECK-UNORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD1]])
; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], {{.*}}
; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float %[[LOAD2]], {{.*}}
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[SUM1:.*]] = phi float [ %[[FADD1]], %for.body ], [ %[[RDX1]], %middle.block ]
; CHECK-UNORDERED: %[[SUM2:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX2]], %middle.block ]
; CHECK-UNORDERED: store float %[[SUM1]]
; CHECK-UNORDERED: store float %[[SUM2]]
; CHECK-UNORDERED: ret void

; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  %arrayidxa = getelementptr inbounds float, ptr %a, i64 1
  %a1 = load float, ptr %a, align 4
  %a2 = load float, ptr %arrayidxa, align 4
  br label %for.body

for.body:
  %add.phi1 = phi float [ %a2, %entry ], [ %add2, %for.body ]
  %add.phi2 = phi float [ %a1, %entry ], [ %add1, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidxb1 = getelementptr inbounds float, ptr %b, i64 %iv
  %0 = load float, ptr %arrayidxb1, align 4
  %add1 = fadd float %0, %add.phi2
  %or = or disjoint i64 %iv, 1
  %arrayidxb2 = getelementptr inbounds float, ptr %b, i64 %or
  %1 = load float, ptr %arrayidxb2, align 4
  %add2 = fadd float %1, %add.phi1
  %iv.next = add nuw nsw i64 %iv, 2
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2

for.end:
  store float %add1, ptr %a, align 4
  store float %add2, ptr %arrayidxa, align 4
  ret void
}
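
; A strict reduction of a sum of two loads, guarded by a[1] > 0.5; roughly:
;   for (int i = 0; i < n; ++i)
;     res += a[i] + b[i];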
define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_of_sum
; CHECK-ORDERED: vector.body
; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, ptr
; CHECK-ORDERED: %[[LOAD2:.*]] = load <4 x float>, ptr
; CHECK-ORDERED: %[[ADD:.*]] = fadd <4 x float> %[[LOAD1]], %[[LOAD2]]
; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[ADD]])
; CHECK-ORDERED: for.end.loopexit
; CHECK-ORDERED: %[[EXIT_PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-ORDERED: for.end
; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
; CHECK-ORDERED: ret float %[[PHI]]

; CHECK-UNORDERED-LABEL: @fadd_of_sum
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <4 x float>, ptr
; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <4 x float>, ptr
; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <4 x float> %[[VEC_LOAD1]], %[[VEC_LOAD2]]
; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[VEC_PHI]], %[[VEC_FADD1]]
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]])
; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr
; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], %[[LOAD2]]
; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[FADD1]]
; CHECK-UNORDERED: for.end.loopexit
; CHECK-UNORDERED: %[[EXIT:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT]], %for.end.loopexit ]
; CHECK-UNORDERED: ret float %[[SUM]]

; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  %arrayidx = getelementptr inbounds float, ptr %a, i64 1
  %0 = load float, ptr %arrayidx, align 4
  %cmp1 = fcmp ogt float %0, 5.000000e-01
  br i1 %cmp1, label %for.body, label %for.end

for.body: ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %res.014 = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
  %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %iv
  %1 = load float, ptr %arrayidx2, align 4
  %arrayidx4 = getelementptr inbounds float, ptr %b, i64 %iv
  %2 = load float, ptr %arrayidx4, align 4
  %add = fadd float %1, %2
  %rdx = fadd float %res.014, %add
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2

for.end: ; preds = %for.body, %entry
  %res = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
  ret float %res
}
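
; A conditional strict reduction, roughly:
;   for (int i = 0; i < n; ++i)
;     res += (b[i] != 0.0f) ? a[i] : 3.0f;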
define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_conditional
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue6 ]
; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, ptr
; CHECK-ORDERED: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer
; CHECK-ORDERED: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0
; CHECK-ORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
; CHECK-ORDERED: pred.load.continue6
; CHECK-ORDERED: %[[PHI1:.*]] = phi <4 x float> [ %[[PHI0:.*]], %pred.load.continue4 ], [ %[[INS_ELT:.*]], %pred.load.if5 ]
; CHECK-ORDERED: %[[XOR:.*]] = xor <4 x i1> %[[FCMP1]], splat (i1 true)
; CHECK-ORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> splat (float 3.000000e+00), <4 x float> %[[PHI1]]
; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[PHI]], <4 x float> %[[PRED]])
; CHECK-ORDERED: for.body
; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
; CHECK-ORDERED: %[[LOAD2:.*]] = load float, ptr
; CHECK-ORDERED: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00
; CHECK-ORDERED: br i1 %[[FCMP2]], label %if.then, label %for.inc
; CHECK-ORDERED: if.then
; CHECK-ORDERED: %[[LOAD3:.*]] = load float, ptr
; CHECK-ORDERED: br label %for.inc
; CHECK-ORDERED: for.inc
; CHECK-ORDERED: %[[PHI2:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ]
; CHECK-ORDERED: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI2]]
; CHECK-ORDERED: for.end
; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
; CHECK-ORDERED: ret float %[[RDX_PHI]]

; CHECK-UNORDERED-LABEL: @fadd_conditional
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[PHI:.*]] = phi <4 x float> [ <float 1.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD:.*]], %pred.load.continue6 ]
; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, ptr
; CHECK-UNORDERED: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer
; CHECK-UNORDERED: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0
; CHECK-UNORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
; CHECK-UNORDERED: pred.load.continue6
; CHECK-UNORDERED: %[[XOR:.*]] = xor <4 x i1> %[[FCMP1]], splat (i1 true)
; CHECK-UNORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> splat (float 3.000000e+00), <4 x float> %[[PRED_PHI:.*]]
; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[PHI]], %[[PRED]]
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]])
; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00
; CHECK-UNORDERED: br i1 %[[FCMP2]], label %if.then, label %for.inc
; CHECK-UNORDERED: if.then
; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, ptr
; CHECK-UNORDERED: for.inc
; CHECK-UNORDERED: %[[PHI:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ]
; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI]]
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RDX_PHI]]

; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body: ; preds = %entry, %for.inc
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
  %res = phi float [ 1.000000e+00, %entry ], [ %fadd, %for.inc ]
  %arrayidx = getelementptr inbounds float, ptr %b, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %tobool = fcmp une float %0, 0.000000e+00
  br i1 %tobool, label %if.then, label %for.inc

if.then: ; preds = %for.body
  %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %iv
  %1 = load float, ptr %arrayidx2, align 4
  br label %for.inc

for.inc:
  %phi = phi float [ %1, %if.then ], [ 3.000000e+00, %for.body ]
  %fadd = fadd float %res, %phi
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2

for.end:
  %rdx = phi float [ %fadd, %for.inc ]
  ret float %rdx
}

; Test to check that masking is correct, using the "llvm.loop.vectorize.predicate.enable" attribute
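; For the ordered case, inactive lanes are replaced with -0.0 (the identity
; for fadd) via a select before the in-loop reduction call, so masked-out
; elements do not contribute to the running sum.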
define float @fadd_predicated(ptr noalias nocapture %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_predicated
; CHECK-ORDERED: vector.ph
; CHECK-ORDERED: %[[TRIP_MINUS_ONE:.*]] = sub i64 %n, 1
; CHECK-ORDERED: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i64 0
; CHECK-ORDERED: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-ORDERED: vector.body
; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue2 ]
; CHECK-ORDERED: pred.load.continue2
; CHECK-ORDERED: %[[PHI:.*]] = phi <2 x float> [ %[[PHI0:.*]], %pred.load.continue ], [ %[[INS_ELT:.*]], %pred.load.if1 ]
; CHECK-ORDERED: %[[MASK:.*]] = select <2 x i1> %0, <2 x float> %[[PHI]], <2 x float> splat (float -0.000000e+00)
; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v2f32(float %[[RDX_PHI]], <2 x float> %[[MASK]])
; CHECK-ORDERED: for.end:
; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-ORDERED: ret float %[[RES_PHI]]

; CHECK-UNORDERED-LABEL: @fadd_predicated
; CHECK-UNORDERED: vector.ph
; CHECK-UNORDERED: %[[TRIP_MINUS_ONE:.*]] = sub i64 %n, 1
; CHECK-UNORDERED: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i64 0
; CHECK-UNORDERED: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <2 x float> [ <float 0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[FADD:.*]], %pred.load.continue2 ]
; CHECK-UNORDERED: %[[ICMP:.*]] = icmp ule <2 x i64> %vec.ind, %[[SPLAT]]
; CHECK-UNORDERED: pred.load.continue2
; CHECK-UNORDERED: %[[FADD]] = fadd <2 x float> %[[RDX_PHI]], {{.*}}
; CHECK-UNORDERED: %[[MASK:.*]] = select <2 x i1> %[[ICMP]], <2 x float> %[[FADD]], <2 x float> %[[RDX_PHI]]
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %[[MASK]])
; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: %[[LOAD:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[LOAD]]
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[SUM]]

; CHECK-NOT-VECTORIZED-LABEL: @fadd_predicated
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body: ; preds = %entry, %for.body
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %sum.02 = phi float [ %l7, %for.body ], [ 0.000000e+00, %entry ]
  %l2 = getelementptr inbounds float, ptr %a, i64 %iv
  %l3 = load float, ptr %l2, align 4
  %l7 = fadd float %sum.02, %l3
  %iv.next = add i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %n
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3

for.end: ; preds = %for.body
  %sum.0.lcssa = phi float [ %l7, %for.body ]
  ret float %sum.0.lcssa
}

; Negative test - loop contains multiple fadds which we cannot safely reorder
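; Roughly: sum += a[i]; sum += b[i]; giving two chained fadds feeding a single
; reduction phi.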
define float @fadd_multiple(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_multiple
; CHECK-ORDERED-NOT: vector.body

; CHECK-UNORDERED-LABEL: @fadd_multiple
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[PHI:.*]] = phi <8 x float> [ splat (float -0.000000e+00), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RET]]

; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body: ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum = phi float [ -0.000000e+00, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %add = fadd float %sum, %0
  %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
  %1 = load float, ptr %arrayidx2, align 4
  %add3 = fadd float %add, %1
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end: ; preds = %for.body
  %rdx = phi float [ %add3, %for.body ]
  ret float %rdx
}

; Negative test - loop contains two fadds and only one fadd has the fast flag,
; which we cannot safely reorder.
define float @fadd_multiple_one_flag(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_multiple_one_flag
; CHECK-ORDERED-NOT: vector.body

; CHECK-UNORDERED-LABEL: @fadd_multiple_one_flag
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[PHI:.*]] = phi <8 x float> [ splat (float -0.000000e+00), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd fast <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD2]] = fadd fast float %[[FADD1]], %[[LOAD2]]
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RET]]

; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_one_flag
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body: ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum = phi float [ -0.000000e+00, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %add = fadd float %sum, %0
  %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
  %1 = load float, ptr %arrayidx2, align 4
  %add3 = fadd fast float %add, %1
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end: ; preds = %for.body
  %rdx = phi float [ %add3, %for.body ]
  ret float %rdx
}

; Tests with both a floating point reduction & induction, e.g.
;
;float fp_iv_rdx_loop(float *values, float init, float * __restrict__ A, int N) {
;  float fp_inc = 2.0;
;  float x = init;
;  float sum = 0.0;
;  for (int i=0; i < N; ++i) {
;    A[i] = x;
;    x += fp_inc;
;    sum += values[i];
;  }
;  return sum;
;}

; Strict reduction could be performed in-loop, but ordered FP induction variables are not supported
; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
; with the -hints-allow-reordering flag set to true.
define float @induction_and_reduction(ptr nocapture readonly %values, float %init, ptr noalias nocapture %A, i64 %N) {
; CHECK-ORDERED-LABEL: @induction_and_reduction
; CHECK-ORDERED-NOT: vector.body

; CHECK-UNORDERED-LABEL: @induction_and_reduction
; CHECK-UNORDERED-NOT: vector.body

; CHECK-NOT-VECTORIZED-LABEL: @induction_and_reduction
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.015 = phi float [ 0.000000e+00, %entry ], [ %add3, %for.body ]
  %x.014 = phi float [ %init, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %A, i64 %iv
  store float %x.014, ptr %arrayidx, align 4
  %add = fadd float %x.014, 2.000000e+00
  %arrayidx2 = getelementptr inbounds float, ptr %values, i64 %iv
  %0 = load float, ptr %arrayidx2, align 4
  %add3 = fadd float %sum.015, %0
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret float %add3
}

; As above, but with the FP induction being unordered (fast) the loop can be vectorized with strict reductions
define float @fast_induction_and_reduction(ptr nocapture readonly %values, float %init, ptr noalias nocapture %A, i64 %N) {
; CHECK-ORDERED-LABEL: @fast_induction_and_reduction
; CHECK-ORDERED: vector.ph
; CHECK-ORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
; CHECK-ORDERED: vector.body
; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
; CHECK-ORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, ptr
; CHECK-ORDERED: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]])
; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], splat (float 8.000000e+00)
; CHECK-ORDERED: for.body
; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
; CHECK-ORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
; CHECK-ORDERED: store float %[[IND_SUM_PHI]], ptr
; CHECK-ORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
; CHECK-ORDERED: %[[LOAD2:.*]] = load float, ptr
; CHECK-ORDERED: %[[FADD2]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
; CHECK-ORDERED: for.end
; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[FADD1]], %middle.block ]
; CHECK-ORDERED: ret float %[[RES_PHI]]

; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction
; CHECK-UNORDERED: vector.ph
; CHECK-UNORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ]
; CHECK-UNORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, ptr
; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[RDX_PHI]], %[[LOAD1]]
; CHECK-UNORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], splat (float 8.000000e+00)
; CHECK-UNORDERED: middle.block:
; CHECK-UNORDERED: %[[VEC_RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]])
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD:.*]], %for.body ]
; CHECK-UNORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
; CHECK-UNORDERED: store float %[[IND_SUM_PHI]], ptr
; CHECK-UNORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[VEC_RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RES_PHI]]

; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.015 = phi float [ 0.000000e+00, %entry ], [ %add3, %for.body ]
  %x.014 = phi fast float [ %init, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %A, i64 %iv
  store float %x.014, ptr %arrayidx, align 4
  %add = fadd fast float %x.014, 2.000000e+00
  %arrayidx2 = getelementptr inbounds float, ptr %values, i64 %iv
  %0 = load float, ptr %arrayidx2, align 4
  %add3 = fadd float %sum.015, %0
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2

for.end:
  ret float %add3
}

; The FP induction is fast, but here we can't vectorize as only one of the reductions is an FAdd that can be performed in-loop
; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
; with the -hints-allow-reordering flag set to true.
define float @fast_induction_unordered_reduction(ptr nocapture readonly %values, float %init, ptr noalias nocapture %A, ptr noalias nocapture %B, i64 %N) {

; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction
; CHECK-ORDERED-NOT: vector.body

; CHECK-UNORDERED-LABEL: @fast_induction_unordered_reduction
; CHECK-UNORDERED-NOT: vector.body

; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_unordered_reduction
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum2.023 = phi float [ 3.000000e+00, %entry ], [ %mul, %for.body ]
  %sum.022 = phi float [ 0.000000e+00, %entry ], [ %add3, %for.body ]
  %x.021 = phi float [ %init, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %A, i64 %iv
  store float %x.021, ptr %arrayidx, align 4
  %add = fadd fast float %x.021, 2.000000e+00
  %arrayidx2 = getelementptr inbounds float, ptr %values, i64 %iv
  %0 = load float, ptr %arrayidx2, align 4
  %add3 = fadd float %sum.022, %0
  %mul = fmul float %sum2.023, %0
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  %add6 = fadd float %add3, %mul
  ret float %add6
}

; Test reductions for a VF of 1 and a UF > 1.
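; With a VF of 1 the "vector" body is just four unrolled scalar iterations:
; the ordered form must keep a single serial fadd chain, whereas the unordered
; form keeps four independent partial sums that are only combined in
; middle.block.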
define float @fadd_scalar_vf(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_scalar_vf
; CHECK-ORDERED: vector.body
; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, {{.*}} ], [ %[[FADD4:.*]], %vector.body ]
; CHECK-ORDERED: %[[LOAD1:.*]] = load float, ptr
; CHECK-ORDERED: %[[LOAD2:.*]] = load float, ptr
; CHECK-ORDERED: %[[LOAD3:.*]] = load float, ptr
; CHECK-ORDERED: %[[LOAD4:.*]] = load float, ptr
; CHECK-ORDERED: %[[FADD1:.*]] = fadd float %[[VEC_PHI]], %[[LOAD1]]
; CHECK-ORDERED: %[[FADD2:.*]] = fadd float %[[FADD1]], %[[LOAD2]]
; CHECK-ORDERED: %[[FADD3:.*]] = fadd float %[[FADD2]], %[[LOAD3]]
; CHECK-ORDERED: %[[FADD4]] = fadd float %[[FADD3]], %[[LOAD4]]
; CHECK-ORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-ORDERED: scalar.ph
; CHECK-ORDERED: %[[MERGE_RDX:.*]] = phi float [ %[[FADD4]], %middle.block ], [ 0.000000e+00, %entry ]
; CHECK-ORDERED: for.body
; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ]
; CHECK-ORDERED: %[[LOAD5:.*]] = load float, ptr
; CHECK-ORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]]
; CHECK-ORDERED: for.end
; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[FADD4]], %middle.block ]
; CHECK-ORDERED: ret float %[[RES_PHI]]

; CHECK-UNORDERED-LABEL: @fadd_scalar_vf
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD1:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD3:.*]], %vector.body ]
; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD4:.*]], %vector.body ]
; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, ptr
; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, ptr
; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, ptr
; CHECK-UNORDERED: %[[LOAD4:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD1]] = fadd float %[[LOAD1]], %[[VEC_PHI1]]
; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[LOAD2]], %[[VEC_PHI2]]
; CHECK-UNORDERED: %[[FADD3]] = fadd float %[[LOAD3]], %[[VEC_PHI3]]
; CHECK-UNORDERED: %[[FADD4]] = fadd float %[[LOAD4]], %[[VEC_PHI4]]
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd float %[[FADD2]], %[[FADD1]]
; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd float %[[FADD3]], %[[BIN_RDX1]]
; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd float %[[FADD4]], %[[BIN_RDX2]]
; CHECK-UNORDERED: scalar.ph
; CHECK-UNORDERED: %[[MERGE_RDX:.*]] = phi float [ %[[BIN_RDX3]], %middle.block ], [ 0.000000e+00, %entry ]
; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ]
; CHECK-UNORDERED: %[[LOAD5:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]]
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[BIN_RDX3]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RES_PHI]]

; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %add = fadd float %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !4

for.end:
  ret float %add
}

; Same as above but where fadd has a fast-math flag.
define float @fadd_scalar_vf_fmf(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_scalar_vf_fmf
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD4:%.*]], %vector.body ]
; CHECK-ORDERED: [[LOAD1:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD2:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD3:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD4:%.*]] = load float, ptr
; CHECK-ORDERED: [[FADD1:%.*]] = fadd nnan float [[VEC_PHI]], [[LOAD1]]
; CHECK-ORDERED: [[FADD2:%.*]] = fadd nnan float [[FADD1]], [[LOAD2]]
; CHECK-ORDERED: [[FADD3:%.*]] = fadd nnan float [[FADD2]], [[LOAD3]]
; CHECK-ORDERED: [[FADD4]] = fadd nnan float [[FADD3]], [[LOAD4]]
; CHECK-ORDERED-NOT: @llvm.vector.reduce.fadd
; CHECK-ORDERED: scalar.ph:
; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ [[FADD4]], %middle.block ], [ 0.000000e+00, %entry ]
; CHECK-ORDERED: for.body:
; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[FADD5:%.*]], %for.body ]
; CHECK-ORDERED: [[LOAD5:%.*]] = load float, ptr
; CHECK-ORDERED: [[FADD5]] = fadd nnan float [[LOAD5]], [[SUM_07]]
; CHECK-ORDERED: for.end:
; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[FADD5]], %for.body ], [ [[FADD4]], %middle.block ]
; CHECK-ORDERED: ret float [[RES]]

; CHECK-UNORDERED-LABEL: @fadd_scalar_vf_fmf
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD1:%.*]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD2:%.*]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD3:%.*]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI4:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD4:%.*]], %vector.body ]
; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, ptr
; CHECK-UNORDERED: [[FADD1]] = fadd nnan float [[LOAD1]], [[VEC_PHI1]]
; CHECK-UNORDERED: [[FADD2]] = fadd nnan float [[LOAD2]], [[VEC_PHI2]]
; CHECK-UNORDERED: [[FADD3]] = fadd nnan float [[LOAD3]], [[VEC_PHI3]]
; CHECK-UNORDERED: [[FADD4]] = fadd nnan float [[LOAD4]], [[VEC_PHI4]]
; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block:
; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan float [[FADD2]], [[FADD1]]
; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan float [[FADD3]], [[BIN_RDX1]]
; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd nnan float [[FADD4]], [[BIN_RDX2]]
; CHECK-UNORDERED: scalar.ph:
; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ [[BIN_RDX3]], %middle.block ], [ 0.000000e+00, %entry ]
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[FADD5:%.*]], %for.body ]
; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, ptr
; CHECK-UNORDERED: [[FADD5]] = fadd nnan float [[LOAD5]], [[SUM_07]]
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[FADD5]], %for.body ], [ [[BIN_RDX3]], %middle.block ]
; CHECK-UNORDERED: ret float [[RES]]

; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf_fmf
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %add = fadd nnan float %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !4

for.end:
  ret float %add
}

; Test case where the reduction step is a first-order recurrence.
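; Roughly:
;   double red = 0.0, prev = 0.0;
;   for (int i = 0; i + 1 != 0; ++i) {
;     red += prev;
;     prev = (double)i;
;   }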
define double @reduction_increment_by_first_order_recurrence() {
; CHECK-ORDERED-LABEL: @reduction_increment_by_first_order_recurrence(
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED: [[RED:%.*]] = phi double [ 0.000000e+00, %vector.ph ], [ [[RED_NEXT:%.*]], %vector.body ]
; CHECK-ORDERED: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[FOR_NEXT:%.*]], %vector.body ]
; CHECK-ORDERED: [[FOR_NEXT]] = sitofp <4 x i32> %vec.ind to <4 x double>
; CHECK-ORDERED: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[FOR_NEXT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-ORDERED: [[RED_NEXT]] = call double @llvm.vector.reduce.fadd.v4f64(double [[RED]], <4 x double> [[TMP1]])
; CHECK-ORDERED: scalar.ph:
; CHECK-ORDERED: = phi double [ [[RED_NEXT]], %middle.block ], [ 0.000000e+00, %entry ]

; CHECK-UNORDERED-LABEL: @reduction_increment_by_first_order_recurrence(
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED: [[RED:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %vector.ph ], [ [[RED_NEXT:%.*]], %vector.body ]
; CHECK-UNORDERED: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[FOR_NEXT:%.*]], %vector.body ]
; CHECK-UNORDERED: [[FOR_NEXT]] = sitofp <4 x i32> %vec.ind to <4 x double>
; CHECK-UNORDERED: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[FOR_NEXT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-UNORDERED: [[RED_NEXT]] = fadd <4 x double> [[TMP1]], [[RED]]
; CHECK-UNORDERED: middle.block:
; CHECK-UNORDERED: [[RDX:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[RED_NEXT]])
; CHECK-UNORDERED: scalar.ph:
; CHECK-UNORDERED: [[BC_MERGE_RDX:%.*]] = phi double [ [[RDX]], %middle.block ], [ 0.000000e+00, %entry ]

; CHECK-NOT-VECTORIZED-LABEL: @reduction_increment_by_first_order_recurrence(
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %loop

loop:
  %red = phi double [ 0.0, %entry ], [ %red.next, %loop ]
  %for = phi double [ 0.0, %entry ], [ %for.next, %loop ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %red.next = fadd double %for, %red
  %for.next = sitofp i32 %iv to double
  %iv.next = add nsw i32 %iv, 1
  %ec = icmp eq i32 %iv.next, 0
  br i1 %ec, label %exit, label %loop, !llvm.loop !13

exit:
  %res = phi double [ %red.next, %loop ]
  ret double %res
}

; We should not mark the fadd as an ordered reduction here as there are
; more than 2 uses of the instruction.
define float @fadd_multiple_use(i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_multiple_use
; CHECK-ORDERED-NOT: vector.body

; CHECK-UNORDERED-LABEL: @fadd_multiple_use
; CHECK-UNORDERED-NOT: vector.body

; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_use
; CHECK-NOT-VECTORIZED-NOT: vector.body

entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next2, %bb2 ]
  %red = phi float [ 0.0, %entry ], [ %fadd, %bb2 ]
  %phi1 = phi i64 [ 0, %entry ], [ %iv.next, %bb2 ]
  %fadd = fadd float %red, 1.000000e+00
  %iv.next = add nsw i64 %phi1, 1
  %cmp = icmp ult i64 %iv, %n
  br i1 %cmp, label %bb2, label %bb1

bb1:
  %phi2 = phi float [ %fadd, %for.body ]
  ret float %phi2

bb2:
  %iv.next2 = add nuw nsw i64 %iv, 1
  br i1 false, label %for.end, label %for.body

for.end:
  %phi3 = phi float [ %fadd, %bb2 ]
  ret float %phi3
}

; Test case where the loop has a call to the llvm.fmuladd intrinsic.
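; For the ordered case the intrinsic must be split into a wide fmul followed
; by an in-loop ordered fadd reduction, as checked below; the unordered case
; can keep a single vector fmuladd call.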
define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fmuladd_strict
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr
; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr
; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr
; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr
; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr
; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr
; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr
; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <8 x float>, ptr
; CHECK-ORDERED: [[FMUL:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[FMUL]])
; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX]], <8 x float> [[FMUL1]])
; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX1]], <8 x float> [[FMUL2]])
; CHECK-ORDERED: [[RDX3]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX2]], <8 x float> [[FMUL3]])
; CHECK-ORDERED: for.body:
; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
; CHECK-ORDERED: [[LOAD:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD1:%.*]] = load float, ptr
; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
; CHECK-ORDERED: for.end
; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX3]], %middle.block ]

; CHECK-UNORDERED-LABEL: @fmuladd_strict
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: [[FMULADD]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]])
; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block:
; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float>
; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float>
; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float>
; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]])
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
; CHECK-UNORDERED: [[LOAD:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, ptr
; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]])
; CHECK-UNORDERED: for.end:
; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float [[RES]]

; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict
; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
  %1 = load float, ptr %arrayidx2, align 4
  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret float %muladd
}

; Test reductions for a VF of 1 and a UF > 1 where the loop has a call to the llvm.fmuladd intrinsic.
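;
; Loop hint !4 on this loop forces vectorize.width 1 with interleave.count 4,
; so the vector body holds four scalar copies per iteration. The unordered
; lowering keeps four independent partial sums, roughly (hypothetical C, not
; part of the test):
;
;   s0 = fmaf(a[i], b[i], s0);      s1 = fmaf(a[i+1], b[i+1], s1);
;   s2 = fmaf(a[i+2], b[i+2], s2);  s3 = fmaf(a[i+3], b[i+3], s3);
;   /* middle.block: sum = s3 + (s2 + (s1 + s0)) */
;
; whereas the ordered lowering chains all four products into one running sum.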
define float @fmuladd_scalar_vf(ptr %a, ptr %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fmuladd_scalar_vf
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD3:%.*]], %vector.body ]
; CHECK-ORDERED: [[LOAD:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD1:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD2:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD3:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD4:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD5:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD6:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD7:%.*]] = load float, ptr
; CHECK-ORDERED: [[FMUL:%.*]] = fmul float [[LOAD]], [[LOAD4]]
; CHECK-ORDERED: [[FMUL1:%.*]] = fmul float [[LOAD1]], [[LOAD5]]
; CHECK-ORDERED: [[FMUL2:%.*]] = fmul float [[LOAD2]], [[LOAD6]]
; CHECK-ORDERED: [[FMUL3:%.*]] = fmul float [[LOAD3]], [[LOAD7]]
; CHECK-ORDERED: [[FADD:%.*]] = fadd float [[VEC_PHI]], [[FMUL]]
; CHECK-ORDERED: [[FADD1:%.*]] = fadd float [[FADD]], [[FMUL1]]
; CHECK-ORDERED: [[FADD2:%.*]] = fadd float [[FADD1]], [[FMUL2]]
; CHECK-ORDERED: [[FADD3]] = fadd float [[FADD2]], [[FMUL3]]
; CHECK-ORDERED-NOT: llvm.vector.reduce.fadd
; CHECK-ORDERED: scalar.ph
; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ [[FADD3]], %middle.block ], [ 0.000000e+00, %entry ]
; CHECK-ORDERED: for.body
; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
; CHECK-ORDERED: [[LOAD8:%.*]] = load float, ptr
; CHECK-ORDERED: [[LOAD9:%.*]] = load float, ptr
; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]])
; CHECK-ORDERED: for.end
; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[FADD3]], %middle.block ]
; CHECK-ORDERED: ret float [[RES]]

; CHECK-UNORDERED-LABEL: @fmuladd_scalar_vf
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
; CHECK-UNORDERED: [[LOAD:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD6:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD7:%.*]] = load float, ptr
; CHECK-UNORDERED: [[FMULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD4]], float [[VEC_PHI]])
; CHECK-UNORDERED: [[FMULADD1]] = tail call float @llvm.fmuladd.f32(float [[LOAD1]], float [[LOAD5]], float [[VEC_PHI1]])
; CHECK-UNORDERED: [[FMULADD2]] = tail call float @llvm.fmuladd.f32(float [[LOAD2]], float [[LOAD6]], float [[VEC_PHI2]])
; CHECK-UNORDERED: [[FMULADD3]] = tail call float @llvm.fmuladd.f32(float [[LOAD3]], float [[LOAD7]], float [[VEC_PHI3]])
; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block:
; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd float [[FMULADD1]], [[FMULADD]]
; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd float [[FMULADD2]], [[BIN_RDX]]
; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd float [[FMULADD3]], [[BIN_RDX1]]
; CHECK-UNORDERED: scalar.ph:
; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ [[BIN_RDX2]], %middle.block ], [ 0.000000e+00, %entry ]
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
; CHECK-UNORDERED: [[LOAD8:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD9:%.*]] = load float, ptr
; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]])
; CHECK-UNORDERED: for.end:
; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[BIN_RDX2]], %middle.block ]
; CHECK-UNORDERED: ret float [[RES]]

; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_scalar_vf
; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
  %1 = load float, ptr %arrayidx2, align 4
  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !4

for.end:
  ret float %muladd
}

; Test case where the reduction phi is one of the mul operands of the fmuladd.
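;
; Rough C shape (hypothetical, not part of the test):
;
;   sum = fmaf(sum, a[i], b[i]);   /* i.e. sum = sum * a[i] + b[i] */
;
; The phi is multiplied rather than only added, so this is not an fadd
; reduction and no variant vectorizes it.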
define float @fmuladd_phi_is_mul_operand(ptr %a, ptr %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fmuladd_phi_is_mul_operand
; CHECK-ORDERED-NOT: vector.body

; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_mul_operand
; CHECK-UNORDERED-NOT: vector.body

; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_mul_operand
; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
  %1 = load float, ptr %arrayidx2, align 4
  %muladd = tail call float @llvm.fmuladd.f32(float %sum.07, float %0, float %1)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret float %muladd
}

; Test case where the reduction phi is two operands of the fmuladd.
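;
; Rough C shape (hypothetical, not part of the test):
;
;   sum = fmaf(sum, a[i], sum);    /* i.e. sum = sum * a[i] + sum */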
define float @fmuladd_phi_is_two_operands(ptr %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fmuladd_phi_is_two_operands
; CHECK-ORDERED-NOT: vector.body

; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_two_operands
; CHECK-UNORDERED-NOT: vector.body

; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_two_operands
; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %muladd = tail call float @llvm.fmuladd.f32(float %sum.07, float %0, float %sum.07)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret float %muladd
}

; Test case with multiple calls to llvm.fmuladd, which are not safe to
; reorder, so the loop is only vectorized in the unordered (fast) case.
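;
; Rough C shape (hypothetical, not part of the test):
;
;   t   = fmaf(a[i], b[i], sum);
;   sum = fmaf(a[i], b[i], t);     /* second fmuladd chains onto the first */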
define float @fmuladd_multiple(ptr %a, ptr %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fmuladd_multiple
; CHECK-ORDERED-NOT: vector.body:

; CHECK-UNORDERED-LABEL: @fmuladd_multiple
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: [[FMULADD:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]])
; CHECK-UNORDERED: [[FMULADD2]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[FMULADD]])
; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block:
; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float>
; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float>
; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float>
; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]])
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD2:%.*]], %for.body ]
; CHECK-UNORDERED: [[LOAD:%.*]] = load float, ptr
; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, ptr
; CHECK-UNORDERED: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]])
; CHECK-UNORDERED: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[MULADD]])
; CHECK-UNORDERED: for.end:
; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD2]], %for.body ], [ [[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float [[RES]]

; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_multiple
; CHECK-NOT-VECTORIZED-NOT: vector.body:
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd2, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
  %1 = load float, ptr %arrayidx2, align 4
  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
  %muladd2 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %muladd)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret float %muladd2
}

; Same as above but the first fmuladd is one of the mul operands of the second fmuladd.
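;
; Rough C shape (hypothetical, not part of the test):
;
;   t   = fmaf(a[i], b[i], sum);
;   sum = fmaf(a[i], t, b[i]);     /* t feeds a mul operand */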
define float @multiple_fmuladds_mul_operand(ptr %a, ptr %b, i64 %n) {
; CHECK-ORDERED-LABEL: @multiple_fmuladds_mul_operand
; CHECK-ORDERED-NOT: vector.body

; CHECK-UNORDERED-LABEL: @multiple_fmuladds_mul_operand
; CHECK-UNORDERED-NOT: vector.body

; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_mul_operand
; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd2, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
  %1 = load float, ptr %arrayidx2, align 4
  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
  %muladd2 = tail call float @llvm.fmuladd.f32(float %0, float %muladd, float %1)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret float %muladd2
}

; Same as above but the first fmuladd is two of the operands of the second fmuladd.
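;
; Rough C shape (hypothetical, not part of the test):
;
;   t   = fmaf(a[i], b[i], sum);
;   sum = fmaf(a[i], t, t);        /* t feeds a mul operand and the addend */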
define float @multiple_fmuladds_two_operands(ptr %a, ptr %b, i64 %n) {
; CHECK-ORDERED-LABEL: @multiple_fmuladds_two_operands
; CHECK-ORDERED-NOT: vector.body

; CHECK-UNORDERED-LABEL: @multiple_fmuladds_two_operands
; CHECK-UNORDERED-NOT: vector.body

; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_two_operands
; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd2, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
  %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
  %1 = load float, ptr %arrayidx2, align 4
  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
  %muladd2 = tail call float @llvm.fmuladd.f32(float %0, float %muladd, float %muladd)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret float %muladd2
}

declare float @llvm.fmuladd.f32(float, float, float)

; Test case with invariant store where fadd is strict.
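;
; Rough C equivalent (hypothetical source, not part of the test); the store
; target is loop-invariant, so only the final value is observable and the
; vectorized store lands in middle.block:
;
;   void reduction_store_to_invariant_address(float *dst, const float *src) {
;     float sum = 0.0f;
;     dst[42] = sum;
;     for (long i = 0; i < 1000; i++) {
;       sum += src[i];
;       dst[42] = sum;
;     }
;   }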
define void @reduction_store_to_invariant_address(ptr %dst, ptr readonly %src) {
; CHECK-ORDERED-LABEL: @reduction_store_to_invariant_address(
; CHECK-ORDERED: entry
; CHECK-ORDERED: %[[DEST_PTR:.*]] = getelementptr inbounds float, ptr %dst, i64 42
; CHECK-ORDERED: vector.body
; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
; CHECK-ORDERED: %[[LOAD_VEC:.*]] = load <8 x float>, ptr
; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD_VEC]])
; CHECK-ORDERED: middle.block
; CHECK-ORDERED: store float %[[RDX]], ptr %[[DEST_PTR]]
; CHECK-ORDERED: for.body
; CHECK-ORDERED: %[[LOAD:.*]] = load float, ptr
; CHECK-ORDERED: %[[FADD:.*]] = fadd float %{{.*}}, %[[LOAD]]
; CHECK-ORDERED: store float %[[FADD]], ptr %[[DEST_PTR]]

; CHECK-UNORDERED-LABEL: @reduction_store_to_invariant_address(
; CHECK-UNORDERED: entry
; CHECK-UNORDERED: %[[DEST_PTR:.*]] = getelementptr inbounds float, ptr %dst, i64 42
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ]
; CHECK-UNORDERED: %[[LOAD_VEC:.*]] = load <8 x float>, ptr
; CHECK-UNORDERED: %[[FADD_VEC]] = fadd <8 x float> %[[VEC_PHI]], %[[LOAD_VEC]]
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[FADD_VEC]])
; CHECK-UNORDERED: store float %[[RDX]], ptr %[[DEST_PTR]]
; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: %[[LOAD:.*]] = load float, ptr
; CHECK-UNORDERED: %[[FADD:.*]] = fadd float {{.*}}, %[[LOAD]]
; CHECK-UNORDERED: store float %[[FADD]], ptr %[[DEST_PTR]]

; CHECK-NOT-VECTORIZED-LABEL: @reduction_store_to_invariant_address(
; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
  %arrayidx = getelementptr inbounds float, ptr %dst, i64 42
  store float 0.000000e+00, ptr %arrayidx, align 4
  br label %for.body

for.body:
  %0 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx1 = getelementptr inbounds float, ptr %src, i64 %indvars.iv
  %1 = load float, ptr %arrayidx1, align 4
  %add = fadd float %0, %1
  store float %add, ptr %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1000
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !0

for.cond.cleanup:
  ret void
}

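; The loop metadata below attaches forced vectorization hints to the tests
; above: each distinct node combines a width, an interleave count, and the
; enable flag. For example, !0 requests VF 8 with IC 1, !1 requests VF 8 with
; IC 4, and !4 requests VF 1 with IC 4 (the fmuladd_scalar_vf case).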
!0 = distinct !{!0, !5, !9, !11}
!1 = distinct !{!1, !5, !10, !11}
!2 = distinct !{!2, !6, !9, !11}
!3 = distinct !{!3, !7, !9, !11, !12}
!4 = distinct !{!4, !8, !10, !11}
!5 = !{!"llvm.loop.vectorize.width", i32 8}
!6 = !{!"llvm.loop.vectorize.width", i32 4}
!7 = !{!"llvm.loop.vectorize.width", i32 2}
!8 = !{!"llvm.loop.vectorize.width", i32 1}
!9 = !{!"llvm.loop.interleave.count", i32 1}
!10 = !{!"llvm.loop.interleave.count", i32 4}
!11 = !{!"llvm.loop.vectorize.enable", i1 true}
!12 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!13 = distinct !{!13, !6, !9, !11}