1 ; RUN: opt < %s -passes=loop-vectorize,dce -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -S \
2 ; RUN: | FileCheck %s --check-prefix=CHECK-VECTOR
3 ; RUN: opt < %s -passes=loop-vectorize,dce -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=1 -force-vector-interleave=0 -S \
4 ; RUN: | FileCheck %s --check-prefix=CHECK-SCALAR
6 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
7 target triple = "x86_64-apple-macosx10.8.0"
9 ; We don't unroll this loop because it has a small constant trip count
10 ; that is not profitable for generating a scalar epilogue
12 ; CHECK-VECTOR-LABEL: @foo_trip_count_8(
13 ; CHECK-VECTOR: load <4 x i32>
14 ; CHECK-VECTOR-NOT: load <4 x i32>
15 ; CHECK-VECTOR: store <4 x i32>
16 ; CHECK-VECTOR-NOT: store <4 x i32>
19 ; CHECK-SCALAR-LABEL: @foo_trip_count_8(
20 ; CHECK-SCALAR: load i32, ptr
21 ; CHECK-SCALAR-NOT: load i32, ptr
22 ; CHECK-SCALAR: store i32
23 ; CHECK-SCALAR-NOT: store i32
25 define void @foo_trip_count_8(ptr nocapture %A) nounwind uwtable ssp { ; Adds 6 in place to the i32 elements %A[0] .. %A[7].
29 for.body: ; preds = %for.body, %entry
30 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; 64-bit induction variable, 0 on loop entry
31 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv ; address of %A[iv]
32 %1 = load i32, ptr %0, align 4
33 %2 = add nsw i32 %1, 6
34 store i32 %2, ptr %0, align 4 ; write the sum back to the slot it was loaded from
35 %indvars.iv.next = add i64 %indvars.iv, 1
36 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; the exit test looks only at the low 32 bits
37 %exitcond = icmp eq i32 %lftr.wideiv, 8 ; constant trip count: leave once the incremented iv equals 8
38 br i1 %exitcond, label %for.end, label %for.body
40 for.end: ; preds = %for.body
44 ; TODO: We should unroll this loop 4 times since TC being a multiple of VF means
45 ; that the epilogue loop may not need to run, making it profitable for
46 ; the vector loop to run even once
48 ; CHECK-VECTOR-LABEL: @foo_trip_count_16(
49 ; CHECK-VECTOR: load <4 x i32>
50 ; CHECK-VECTOR-NOT: load <4 x i32>
51 ; CHECK-VECTOR: store <4 x i32>
52 ; CHECK-VECTOR-NOT: store <4 x i32>
55 ; CHECK-SCALAR-LABEL: @foo_trip_count_16(
56 ; CHECK-SCALAR: load i32, ptr
57 ; CHECK-SCALAR-NOT: load i32, ptr
58 ; CHECK-SCALAR: store i32
59 ; CHECK-SCALAR-NOT: store i32
61 define void @foo_trip_count_16(ptr nocapture %A) nounwind uwtable ssp { ; Adds 6 in place to the i32 elements %A[0] .. %A[15].
65 for.body: ; preds = %for.body, %entry
66 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; 64-bit induction variable, 0 on loop entry
67 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv ; address of %A[iv]
68 %1 = load i32, ptr %0, align 4
69 %2 = add nsw i32 %1, 6
70 store i32 %2, ptr %0, align 4 ; write the sum back to the slot it was loaded from
71 %indvars.iv.next = add i64 %indvars.iv, 1
72 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; the exit test looks only at the low 32 bits
73 %exitcond = icmp eq i32 %lftr.wideiv, 16 ; constant trip count: leave once the incremented iv equals 16
74 br i1 %exitcond, label %for.end, label %for.body
76 for.end: ; preds = %for.body
80 ; TODO: We should unroll this loop twice since TC not being a multiple of VF may require
81 ; the epilogue loop to run, making it profitable when the vector loop runs at least twice.
84 ; CHECK-VECTOR-LABEL: @foo_trip_count_17(
85 ; CHECK-VECTOR: load <4 x i32>
86 ; CHECK-VECTOR-NOT: load <4 x i32>
87 ; CHECK-VECTOR: store <4 x i32>
88 ; CHECK-VECTOR-NOT: store <4 x i32>
91 ; CHECK-SCALAR-LABEL: @foo_trip_count_17(
92 ; CHECK-SCALAR: load i32, ptr
93 ; CHECK-SCALAR-NOT: load i32, ptr
94 ; CHECK-SCALAR: store i32
95 ; CHECK-SCALAR-NOT: store i32
97 define void @foo_trip_count_17(ptr nocapture %A) nounwind uwtable ssp { ; Adds 6 in place to the i32 elements %A[0] .. %A[16].
101 for.body: ; preds = %for.body, %entry
102 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; 64-bit induction variable, 0 on loop entry
103 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv ; address of %A[iv]
104 %1 = load i32, ptr %0, align 4
105 %2 = add nsw i32 %1, 6
106 store i32 %2, ptr %0, align 4 ; write the sum back to the slot it was loaded from
107 %indvars.iv.next = add i64 %indvars.iv, 1
108 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; the exit test looks only at the low 32 bits
109 %exitcond = icmp eq i32 %lftr.wideiv, 17 ; constant trip count: leave once the incremented iv equals 17
110 br i1 %exitcond, label %for.end, label %for.body
112 for.end: ; preds = %for.body
116 ; TODO: We should unroll this loop 4 times since TC being a multiple of VF means
117 ; that the epilogue loop may not need to run, making it profitable for
118 ; the vector loop to run even once. The IC is restricted to 4 since
119 ; that is the maximum supported for the target.
121 ; CHECK-VECTOR-LABEL: @foo_trip_count_24(
122 ; CHECK-VECTOR: load <4 x i32>
123 ; CHECK-VECTOR-NOT: load <4 x i32>
124 ; CHECK-VECTOR: store <4 x i32>
125 ; CHECK-VECTOR-NOT: store <4 x i32>
128 ; CHECK-SCALAR-LABEL: @foo_trip_count_24(
129 ; CHECK-SCALAR: load i32, ptr
130 ; CHECK-SCALAR-NOT: load i32, ptr
131 ; CHECK-SCALAR: store i32
132 ; CHECK-SCALAR-NOT: store i32
134 define void @foo_trip_count_24(ptr nocapture %A) nounwind uwtable ssp { ; Adds 6 in place to the i32 elements %A[0] .. %A[23].
138 for.body: ; preds = %for.body, %entry
139 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; 64-bit induction variable, 0 on loop entry
140 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv ; address of %A[iv]
141 %1 = load i32, ptr %0, align 4
142 %2 = add nsw i32 %1, 6
143 store i32 %2, ptr %0, align 4 ; write the sum back to the slot it was loaded from
144 %indvars.iv.next = add i64 %indvars.iv, 1
145 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; the exit test looks only at the low 32 bits
146 %exitcond = icmp eq i32 %lftr.wideiv, 24 ; constant trip count: leave once the incremented iv equals 24
147 br i1 %exitcond, label %for.end, label %for.body
149 for.end: ; preds = %for.body
153 ; TODO: We should unroll this loop twice since TC not being a multiple of VF may require
154 ; the epilogue loop to run, making it profitable when the vector loop runs at least twice.
157 ; CHECK-VECTOR-LABEL: @foo_trip_count_25(
158 ; CHECK-VECTOR: load <4 x i32>
159 ; CHECK-VECTOR-NOT: load <4 x i32>
160 ; CHECK-VECTOR: store <4 x i32>
161 ; CHECK-VECTOR-NOT: store <4 x i32>
164 ; CHECK-SCALAR-LABEL: @foo_trip_count_25(
165 ; CHECK-SCALAR: load i32, ptr
166 ; CHECK-SCALAR-NOT: load i32, ptr
167 ; CHECK-SCALAR: store i32
168 ; CHECK-SCALAR-NOT: store i32
170 define void @foo_trip_count_25(ptr nocapture %A) nounwind uwtable ssp { ; Adds 6 in place to the i32 elements %A[0] .. %A[24].
174 for.body: ; preds = %for.body, %entry
175 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; 64-bit induction variable, 0 on loop entry
176 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv ; address of %A[iv]
177 %1 = load i32, ptr %0, align 4
178 %2 = add nsw i32 %1, 6
179 store i32 %2, ptr %0, align 4 ; write the sum back to the slot it was loaded from
180 %indvars.iv.next = add i64 %indvars.iv, 1
181 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; the exit test looks only at the low 32 bits
182 %exitcond = icmp eq i32 %lftr.wideiv, 25 ; constant trip count: leave once the incremented iv equals 25
183 br i1 %exitcond, label %for.end, label %for.body
185 for.end: ; preds = %for.body
189 ; TODO: We should unroll this loop 4 times since TC not being a multiple of VF may require
190 ; the epilogue loop to run, making it profitable when the vector loop runs at least twice.
193 ; CHECK-VECTOR-LABEL: @foo_trip_count_33(
194 ; CHECK-VECTOR: load <4 x i32>
195 ; CHECK-VECTOR-NOT: load <4 x i32>
196 ; CHECK-VECTOR: store <4 x i32>
197 ; CHECK-VECTOR-NOT: store <4 x i32>
200 ; CHECK-SCALAR-LABEL: @foo_trip_count_33(
201 ; CHECK-SCALAR: load i32, ptr
202 ; CHECK-SCALAR-NOT: load i32, ptr
203 ; CHECK-SCALAR: store i32
204 ; CHECK-SCALAR-NOT: store i32
206 define void @foo_trip_count_33(ptr nocapture %A) nounwind uwtable ssp { ; Adds 6 in place to the i32 elements %A[0] .. %A[32].
210 for.body: ; preds = %for.body, %entry
211 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; 64-bit induction variable, 0 on loop entry
212 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv ; address of %A[iv]
213 %1 = load i32, ptr %0, align 4
214 %2 = add nsw i32 %1, 6
215 store i32 %2, ptr %0, align 4 ; write the sum back to the slot it was loaded from
216 %indvars.iv.next = add i64 %indvars.iv, 1
217 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; the exit test looks only at the low 32 bits
218 %exitcond = icmp eq i32 %lftr.wideiv, 33 ; constant trip count: leave once the incremented iv equals 33
219 br i1 %exitcond, label %for.end, label %for.body
221 for.end: ; preds = %for.body
225 ; TODO: We should unroll this loop 4 times since TC not being a multiple of VF may require
226 ; the epilogue loop to run, making it profitable when the vector loop runs
227 ; at least twice. The IC is restricted to 4 since that is the maximum supported for the target.
230 ; CHECK-VECTOR-LABEL: @foo_trip_count_101(
231 ; CHECK-VECTOR: load <4 x i32>
232 ; CHECK-VECTOR-NOT: load <4 x i32>
233 ; CHECK-VECTOR: store <4 x i32>
234 ; CHECK-VECTOR-NOT: store <4 x i32>
237 ; CHECK-SCALAR-LABEL: @foo_trip_count_101(
238 ; CHECK-SCALAR: load i32, ptr
239 ; CHECK-SCALAR-NOT: load i32, ptr
240 ; CHECK-SCALAR: store i32
241 ; CHECK-SCALAR-NOT: store i32
243 define void @foo_trip_count_101(ptr nocapture %A) nounwind uwtable ssp { ; Adds 6 in place to the i32 elements %A[0] .. %A[100].
247 for.body: ; preds = %for.body, %entry
248 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; 64-bit induction variable, 0 on loop entry
249 %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv ; address of %A[iv]
250 %1 = load i32, ptr %0, align 4
251 %2 = add nsw i32 %1, 6
252 store i32 %2, ptr %0, align 4 ; write the sum back to the slot it was loaded from
253 %indvars.iv.next = add i64 %indvars.iv, 1
254 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; the exit test looks only at the low 32 bits
255 %exitcond = icmp eq i32 %lftr.wideiv, 101 ; constant trip count: leave once the incremented iv equals 101
256 br i1 %exitcond, label %for.end, label %for.body
258 for.end: ; preds = %for.body
262 ; But this is a good small loop to unroll as we don't know of a bound on its trip count.
265 ; CHECK-VECTOR-LABEL: @bar(
266 ; CHECK-VECTOR: store <4 x i32>
267 ; CHECK-VECTOR: store <4 x i32>
270 ; For x86, loop unroll in loop vectorizer is disabled when VF==1.
272 ; CHECK-SCALAR-LABEL: @bar(
273 ; CHECK-SCALAR: store i32
274 ; CHECK-SCALAR-NOT: store i32
276 define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp { ; Adds 6 in place to i32 elements of %A; the trip count comes from the runtime value %n.
277 %1 = icmp sgt i32 %n, 0 ; guard: the loop is entered only when %n is positive
278 br i1 %1, label %.lr.ph, label %._crit_edge
280 .lr.ph: ; preds = %0, %.lr.ph
281 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] ; 64-bit induction variable, 0 when coming from the entry block
282 %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv ; address of %A[iv]
283 %3 = load i32, ptr %2, align 4
284 %4 = add nsw i32 %3, 6
285 store i32 %4, ptr %2, align 4 ; write the sum back to the slot it was loaded from
286 %indvars.iv.next = add i64 %indvars.iv, 1
287 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; narrowed to i32 so it can be compared with %n
288 %exitcond = icmp eq i32 %lftr.wideiv, %n ; leave once the incremented iv equals %n (not a compile-time constant)
289 br i1 %exitcond, label %._crit_edge, label %.lr.ph
291 ._crit_edge: ; preds = %.lr.ph, %0
295 ; Also unroll if we need a runtime check but it was going to be added for
296 ; vectorization anyways.
297 ; CHECK-VECTOR-LABEL: @runtime_chk(
298 ; CHECK-VECTOR: store <4 x float>
299 ; CHECK-VECTOR: store <4 x float>
301 ; But not if the unrolling would introduce the runtime check.
302 ; CHECK-SCALAR-LABEL: @runtime_chk(
303 ; CHECK-SCALAR: store float
304 ; CHECK-SCALAR-NOT: store float
305 define void @runtime_chk(ptr %A, ptr %B, float %N) {
310 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
311 %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv
312 %0 = load float, ptr %arrayidx, align 4
313 %mul = fmul float %0, %N
314 %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
315 store float %mul, ptr %arrayidx2, align 4
316 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
317 %exitcond = icmp eq i64 %indvars.iv.next, 256
318 br i1 %exitcond, label %for.end, label %for.body