1 ; RUN: opt < %s -passes=loop-vectorize,dce -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -S \
2 ; RUN: | FileCheck %s --check-prefix=CHECK-VECTOR
3 ; RUN: opt < %s -passes=loop-vectorize,dce -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=1 -force-vector-interleave=0 -S \
4 ; RUN: | FileCheck %s --check-prefix=CHECK-SCALAR
6 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
7 target triple = "x86_64-apple-macosx10.8.0"
9 ; We don't unroll this loop because it has a small constant trip count
10 ; that is not profitable for generating a scalar epilogue
12 ; CHECK-VECTOR-LABEL: @foo_trip_count_8(
13 ; CHECK-VECTOR: load <4 x i32>
14 ; CHECK-VECTOR-NOT: load <4 x i32>
15 ; CHECK-VECTOR: store <4 x i32>
16 ; CHECK-VECTOR-NOT: store <4 x i32>
19 ; CHECK-SCALAR-LABEL: @foo_trip_count_8(
20 ; CHECK-SCALAR: load i32, ptr
21 ; CHECK-SCALAR-NOT: load i32, ptr
22 ; CHECK-SCALAR: store i32
23 ; CHECK-SCALAR-NOT: store i32
25 define void @foo_trip_count_8(ptr nocapture %A) nounwind uwtable ssp {
29 for.body: ; preds = %for.body, %entry
30 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
31 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
32 %1 = load i32, ptr %0, align 4
33 %2 = add nsw i32 %1, 6
34 store i32 %2, ptr %0, align 4
35 %indvars.iv.next = add i64 %indvars.iv, 1
36 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
37 %exitcond = icmp eq i32 %lftr.wideiv, 8
38 br i1 %exitcond, label %for.end, label %for.body
40 for.end: ; preds = %for.body
44 ; We should unroll this loop 4 times since TC being a multiple of VF means
45 ; that the epilogue loop may not need to run, making it profitable for
46 ; the vector loop to run even once
48 ; CHECK-VECTOR-LABEL: @foo_trip_count_16(
49 ; CHECK-VECTOR: load <4 x i32>
50 ; CHECK-VECTOR: load <4 x i32>
51 ; CHECK-VECTOR: load <4 x i32>
52 ; CHECK-VECTOR: load <4 x i32>
53 ; CHECK-VECTOR-NOT: load <4 x i32>
54 ; CHECK-VECTOR: store <4 x i32>
55 ; CHECK-VECTOR: store <4 x i32>
56 ; CHECK-VECTOR: store <4 x i32>
57 ; CHECK-VECTOR: store <4 x i32>
58 ; CHECK-VECTOR-NOT: store <4 x i32>
61 ; CHECK-SCALAR-LABEL: @foo_trip_count_16(
62 ; CHECK-SCALAR: load i32, ptr
63 ; CHECK-SCALAR-NOT: load i32, ptr
64 ; CHECK-SCALAR: store i32
65 ; CHECK-SCALAR-NOT: store i32
67 define void @foo_trip_count_16(ptr nocapture %A) nounwind uwtable ssp {
71 for.body: ; preds = %for.body, %entry
72 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
73 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
74 %1 = load i32, ptr %0, align 4
75 %2 = add nsw i32 %1, 6
76 store i32 %2, ptr %0, align 4
77 %indvars.iv.next = add i64 %indvars.iv, 1
78 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
79 %exitcond = icmp eq i32 %lftr.wideiv, 16
80 br i1 %exitcond, label %for.end, label %for.body
82 for.end: ; preds = %for.body
86 ; We should unroll this loop four times since unrolling it twice
87 ; will produce the same epilogue TC of 1, making larger unroll count
90 ; CHECK-VECTOR-LABEL: @foo_trip_count_17(
91 ; CHECK-VECTOR: load <4 x i32>
92 ; CHECK-VECTOR: load <4 x i32>
93 ; CHECK-VECTOR: load <4 x i32>
94 ; CHECK-VECTOR: load <4 x i32>
95 ; CHECK-VECTOR-NOT: load <4 x i32>
96 ; CHECK-VECTOR: store <4 x i32>
97 ; CHECK-VECTOR: store <4 x i32>
98 ; CHECK-VECTOR: store <4 x i32>
99 ; CHECK-VECTOR: store <4 x i32>
100 ; CHECK-VECTOR-NOT: store <4 x i32>
103 ; CHECK-SCALAR-LABEL: @foo_trip_count_17(
104 ; CHECK-SCALAR: load i32, ptr
105 ; CHECK-SCALAR-NOT: load i32, ptr
106 ; CHECK-SCALAR: store i32
107 ; CHECK-SCALAR-NOT: store i32
109 define void @foo_trip_count_17(ptr nocapture %A) nounwind uwtable ssp {
113 for.body: ; preds = %for.body, %entry
114 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
115 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
116 %1 = load i32, ptr %0, align 4
117 %2 = add nsw i32 %1, 6
118 store i32 %2, ptr %0, align 4
119 %indvars.iv.next = add i64 %indvars.iv, 1
120 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
121 %exitcond = icmp eq i32 %lftr.wideiv, 17
122 br i1 %exitcond, label %for.end, label %for.body
124 for.end: ; preds = %for.body
128 ; We should unroll this loop twice since unrolling four times will
129 ; create an epilogue loop of TC 8, while unrolling it twice will
130 ; eliminate the epilogue loop altogether
132 ; CHECK-VECTOR-LABEL: @foo_trip_count_24(
133 ; CHECK-VECTOR: load <4 x i32>
134 ; CHECK-VECTOR: load <4 x i32>
135 ; CHECK-VECTOR-NOT: load <4 x i32>
136 ; CHECK-VECTOR: store <4 x i32>
137 ; CHECK-VECTOR: store <4 x i32>
138 ; CHECK-VECTOR-NOT: store <4 x i32>
141 ; CHECK-SCALAR-LABEL: @foo_trip_count_24(
142 ; CHECK-SCALAR: load i32, ptr
143 ; CHECK-SCALAR-NOT: load i32, ptr
144 ; CHECK-SCALAR: store i32
145 ; CHECK-SCALAR-NOT: store i32
147 define void @foo_trip_count_24(ptr nocapture %A) nounwind uwtable ssp {
151 for.body: ; preds = %for.body, %entry
152 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
153 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
154 %1 = load i32, ptr %0, align 4
155 %2 = add nsw i32 %1, 6
156 store i32 %2, ptr %0, align 4
157 %indvars.iv.next = add i64 %indvars.iv, 1
158 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
159 %exitcond = icmp eq i32 %lftr.wideiv, 24
160 br i1 %exitcond, label %for.end, label %for.body
162 for.end: ; preds = %for.body
166 ; We should unroll this loop twice since TC not being a multiple of VF may require
167 ; the epilogue loop to run, making it profitable when the vector loop runs
170 ; CHECK-VECTOR-LABEL: @foo_trip_count_25(
171 ; CHECK-VECTOR: load <4 x i32>
172 ; CHECK-VECTOR: load <4 x i32>
173 ; CHECK-VECTOR-NOT: load <4 x i32>
174 ; CHECK-VECTOR: store <4 x i32>
175 ; CHECK-VECTOR: store <4 x i32>
176 ; CHECK-VECTOR-NOT: store <4 x i32>
179 ; CHECK-SCALAR-LABEL: @foo_trip_count_25(
180 ; CHECK-SCALAR: load i32, ptr
181 ; CHECK-SCALAR-NOT: load i32, ptr
182 ; CHECK-SCALAR: store i32
183 ; CHECK-SCALAR-NOT: store i32
185 define void @foo_trip_count_25(ptr nocapture %A) nounwind uwtable ssp {
189 for.body: ; preds = %for.body, %entry
190 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
191 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
192 %1 = load i32, ptr %0, align 4
193 %2 = add nsw i32 %1, 6
194 store i32 %2, ptr %0, align 4
195 %indvars.iv.next = add i64 %indvars.iv, 1
196 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
197 %exitcond = icmp eq i32 %lftr.wideiv, 25
198 br i1 %exitcond, label %for.end, label %for.body
200 for.end: ; preds = %for.body
204 ; We should unroll this loop 4 times since TC not being a multiple of VF may require
205 ; the epilogue loop to run, making it profitable when the vector loop runs
208 ; CHECK-VECTOR-LABEL: @foo_trip_count_33(
209 ; CHECK-VECTOR: load <4 x i32>
210 ; CHECK-VECTOR: load <4 x i32>
211 ; CHECK-VECTOR: load <4 x i32>
212 ; CHECK-VECTOR: load <4 x i32>
213 ; CHECK-VECTOR-NOT: load <4 x i32>
214 ; CHECK-VECTOR: store <4 x i32>
215 ; CHECK-VECTOR: store <4 x i32>
216 ; CHECK-VECTOR: store <4 x i32>
217 ; CHECK-VECTOR: store <4 x i32>
218 ; CHECK-VECTOR-NOT: store <4 x i32>
221 ; CHECK-SCALAR-LABEL: @foo_trip_count_33(
222 ; CHECK-SCALAR: load i32, ptr
223 ; CHECK-SCALAR-NOT: load i32, ptr
224 ; CHECK-SCALAR: store i32
225 ; CHECK-SCALAR-NOT: store i32
227 define void @foo_trip_count_33(ptr nocapture %A) nounwind uwtable ssp {
231 for.body: ; preds = %for.body, %entry
232 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
233 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
234 %1 = load i32, ptr %0, align 4
235 %2 = add nsw i32 %1, 6
236 store i32 %2, ptr %0, align 4
237 %indvars.iv.next = add i64 %indvars.iv, 1
238 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
239 %exitcond = icmp eq i32 %lftr.wideiv, 33
240 br i1 %exitcond, label %for.end, label %for.body
242 for.end: ; preds = %for.body
246 ; We should unroll this loop 4 times since TC not being a multiple of VF may require
247 ; the epilogue loop to run, making it profitable when the vector loop runs
248 ; at least twice. The IC is restricted to 4 since that is the maximum supported
251 ; CHECK-VECTOR-LABEL: @foo_trip_count_101(
252 ; CHECK-VECTOR: load <4 x i32>
253 ; CHECK-VECTOR: load <4 x i32>
254 ; CHECK-VECTOR: load <4 x i32>
255 ; CHECK-VECTOR: load <4 x i32>
256 ; CHECK-VECTOR-NOT: load <4 x i32>
257 ; CHECK-VECTOR: store <4 x i32>
258 ; CHECK-VECTOR: store <4 x i32>
259 ; CHECK-VECTOR: store <4 x i32>
260 ; CHECK-VECTOR: store <4 x i32>
261 ; CHECK-VECTOR-NOT: store <4 x i32>
264 ; CHECK-SCALAR-LABEL: @foo_trip_count_101(
265 ; CHECK-SCALAR: load i32, ptr
266 ; CHECK-SCALAR-NOT: load i32, ptr
267 ; CHECK-SCALAR: store i32
268 ; CHECK-SCALAR-NOT: store i32
270 define void @foo_trip_count_101(ptr nocapture %A) nounwind uwtable ssp {
274 for.body: ; preds = %for.body, %entry
275 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
276 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
277 %1 = load i32, ptr %0, align 4
278 %2 = add nsw i32 %1, 6
279 store i32 %2, ptr %0, align 4
280 %indvars.iv.next = add i64 %indvars.iv, 1
281 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
282 %exitcond = icmp eq i32 %lftr.wideiv, 101
283 br i1 %exitcond, label %for.end, label %for.body
285 for.end: ; preds = %for.body
289 ; But this is a good small loop to unroll as we don't know of a bound on its
292 ; CHECK-VECTOR-LABEL: @bar(
293 ; CHECK-VECTOR: store <4 x i32>
294 ; CHECK-VECTOR: store <4 x i32>
297 ; For x86, loop unroll in loop vectorizer is disabled when VF==1.
299 ; CHECK-SCALAR-LABEL: @bar(
300 ; CHECK-SCALAR: store i32
301 ; CHECK-SCALAR-NOT: store i32
303 define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
304 %1 = icmp sgt i32 %n, 0
305 br i1 %1, label %.lr.ph, label %._crit_edge
307 .lr.ph: ; preds = %0, %.lr.ph
308 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
309 %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
310 %3 = load i32, ptr %2, align 4
311 %4 = add nsw i32 %3, 6
312 store i32 %4, ptr %2, align 4
313 %indvars.iv.next = add i64 %indvars.iv, 1
314 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
315 %exitcond = icmp eq i32 %lftr.wideiv, %n
316 br i1 %exitcond, label %._crit_edge, label %.lr.ph
318 ._crit_edge: ; preds = %.lr.ph, %0
322 ; Also unroll if we need a runtime check but it was going to be added for
323 ; vectorization anyway.
324 ; CHECK-VECTOR-LABEL: @runtime_chk(
325 ; CHECK-VECTOR: store <4 x float>
326 ; CHECK-VECTOR: store <4 x float>
328 ; But not if the unrolling would introduce the runtime check.
329 ; CHECK-SCALAR-LABEL: @runtime_chk(
330 ; CHECK-SCALAR: store float
331 ; CHECK-SCALAR-NOT: store float
332 define void @runtime_chk(ptr %A, ptr %B, float %N) {
337 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
338 %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv
339 %0 = load float, ptr %arrayidx, align 4
340 %mul = fmul float %0, %N
341 %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
342 store float %mul, ptr %arrayidx2, align 4
343 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
344 %exitcond = icmp eq i64 %indvars.iv.next, 256
345 br i1 %exitcond, label %for.end, label %for.body