1 ; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
2 ; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
3 ; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
6 ; Test that the loop vectorizer can tell whether SIMD is safe to use with
7 ; respect to the IEEE 754 standard.
8 ; On Linux, we only want the vectorizer to work when the -ffast-math flag is
9 ; set, because NEON is not IEEE compliant.
10 ; Darwin, on the other hand, doesn't support subnormals, so all optimizations
11 ; are allowed, even without -ffast-math.
13 ; Integer loops are always vectorizable
14 ; CHECK: Checking a loop in "sumi"
15 ; CHECK: We can vectorize this loop!
; sumi: element-wise integer product, C[i] = A[i] * B[i] for i in [0, N).
; All three pointer arguments are noalias; A and B are only read.
16 define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
; Bypass the loop entirely when N is zero.
18 %cmp5 = icmp eq i32 %N, 0
19 br i1 %cmp5, label %for.end, label %for.body.preheader
21 for.body.preheader: ; preds = %entry
24 for.body: ; preds = %for.body.preheader, %for.body
; %i.06 is the induction variable: starts at 0 and is bumped by 1 per trip.
25 %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
26 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
27 %0 = load i32, i32* %arrayidx, align 4
28 %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
29 %1 = load i32, i32* %arrayidx1, align 4
; Pure integer arithmetic: no floating-point operation appears in this loop.
30 %mul = mul nsw i32 %1, %0
31 %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
32 store i32 %mul, i32* %arrayidx2, align 4
33 %inc = add nuw nsw i32 %i.06, 1
; Leave the loop once the incremented index equals N.
34 %exitcond = icmp eq i32 %inc, %N
35 br i1 %exitcond, label %for.end.loopexit, label %for.body
37 for.end.loopexit: ; preds = %for.body
40 for.end: ; preds = %for.end.loopexit, %entry
44 ; Floating-point loops need fast-math to be vectorizable
45 ; LINUX: Checking a loop in "sumf"
46 ; LINUX: Potentially unsafe FP op prevents vectorization
47 ; DARWIN: Checking a loop in "sumf"
48 ; DARWIN: We can vectorize this loop!
; sumf: element-wise float product, C[i] = A[i] * B[i] for i in [0, N).
; The fmul below carries no fast-math flags.
49 define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
; Bypass the loop entirely when N is zero.
51 %cmp5 = icmp eq i32 %N, 0
52 br i1 %cmp5, label %for.end, label %for.body.preheader
54 for.body.preheader: ; preds = %entry
57 for.body: ; preds = %for.body.preheader, %for.body
; %i.06 is the induction variable: starts at 0 and is bumped by 1 per trip.
58 %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
59 %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
60 %0 = load float, float* %arrayidx, align 4
61 %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
62 %1 = load float, float* %arrayidx1, align 4
; Strict (non-fast) floating-point multiply: the only FP operation in the loop.
63 %mul = fmul float %0, %1
64 %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
65 store float %mul, float* %arrayidx2, align 4
66 %inc = add nuw nsw i32 %i.06, 1
; Leave the loop once the incremented index equals N.
67 %exitcond = icmp eq i32 %inc, %N
68 br i1 %exitcond, label %for.end.loopexit, label %for.body
70 for.end.loopexit: ; preds = %for.body
73 for.end: ; preds = %for.end.loopexit, %entry
77 ; Integer loops are always vectorizable
78 ; CHECK: Checking a loop in "redi"
79 ; CHECK: We can vectorize this loop!
; redi: integer reduction that accumulates a[i] * b[i] over i in [0, N).
; Both pointer arguments are noalias and only read.
80 define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
; Bypass the loop entirely when N is zero.
82 %cmp5 = icmp eq i32 %N, 0
83 br i1 %cmp5, label %for.end, label %for.body.preheader
85 for.body.preheader: ; preds = %entry
88 for.body: ; preds = %for.body.preheader, %for.body
; %i.07 is the induction variable; %Red.06 is the running accumulator, whose
; incoming value from the preheader is undef.
89 %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
90 %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
91 %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
92 %0 = load i32, i32* %arrayidx, align 4
93 %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
94 %1 = load i32, i32* %arrayidx1, align 4
; Integer multiply-accumulate: the product is added to the previous accumulator.
95 %mul = mul nsw i32 %1, %0
96 %add = add nsw i32 %mul, %Red.06
97 %inc = add nuw nsw i32 %i.07, 1
; Leave the loop once the incremented index equals N.
98 %exitcond = icmp eq i32 %inc, %N
99 br i1 %exitcond, label %for.end.loopexit, label %for.body
101 for.end.loopexit: ; preds = %for.body
; Single-entry phi that carries the final accumulator value out of the loop.
102 %add.lcssa = phi i32 [ %add, %for.body ]
105 for.end: ; preds = %for.end.loopexit, %entry
; Merge the zero-trip path (undef) with the value produced by the loop.
106 %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
110 ; Floating-point loops need fast-math to be vectorizable
111 ; LINUX: Checking a loop in "redf"
112 ; LINUX: Potentially unsafe FP op prevents vectorization
113 ; DARWIN: Checking a loop in "redf"
114 ; DARWIN: We can vectorize this loop!
; redf: float reduction that accumulates a[i] * b[i] over i in [0, N) and
; returns the accumulated value. Neither the fmul nor the fadd carries
; fast-math flags.
115 define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
; Bypass the loop entirely when N is zero.
117 %cmp5 = icmp eq i32 %N, 0
118 br i1 %cmp5, label %for.end, label %for.body.preheader
120 for.body.preheader: ; preds = %entry
123 for.body: ; preds = %for.body.preheader, %for.body
; %i.07 is the induction variable; %Red.06 is the running accumulator, whose
; incoming value from the preheader is undef.
124 %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
125 %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
126 %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
127 %0 = load float, float* %arrayidx, align 4
128 %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
129 %1 = load float, float* %arrayidx1, align 4
; Strict (non-fast) floating-point multiply followed by a strict accumulate.
130 %mul = fmul float %0, %1
131 %add = fadd float %Red.06, %mul
132 %inc = add nuw nsw i32 %i.07, 1
; Leave the loop once the incremented index equals N.
133 %exitcond = icmp eq i32 %inc, %N
134 br i1 %exitcond, label %for.end.loopexit, label %for.body
136 for.end.loopexit: ; preds = %for.body
; Single-entry phi that carries the final accumulator value out of the loop.
137 %add.lcssa = phi float [ %add, %for.body ]
140 for.end: ; preds = %for.end.loopexit, %entry
; Merge the zero-trip path (undef) with the value produced by the loop.
141 %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
142 ret float %Red.0.lcssa
145 ; Make sure calls that turn into builtins are also covered
146 ; LINUX: Checking a loop in "fabs"
147 ; LINUX: Potentially unsafe FP op prevents vectorization
148 ; DARWIN: Checking a loop in "fabs"
149 ; DARWIN: We can vectorize this loop!
; fabs: C[i] = A[i] * fabsf(B[i]) for i in [0, N).
; The fabsf call uses attribute group #1 and has no fast-math flags; the fmul
; has no fast-math flags either.
150 define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
; Bypass the loop when N is zero; otherwise enter the loop body directly.
152 %cmp10 = icmp eq i32 %N, 0
153 br i1 %cmp10, label %for.end, label %for.body
155 for.body: ; preds = %entry, %for.body
; %i.011 is the induction variable: starts at 0 and is bumped by 1 per trip.
156 %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
157 %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
158 %0 = load float, float* %arrayidx, align 4
159 %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
160 %1 = load float, float* %arrayidx1, align 4
; Call to the externally declared fabsf on the element loaded from B.
161 %fabsf = tail call float @fabsf(float %1) #1
162 %conv3 = fmul float %0, %fabsf
163 %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
164 store float %conv3, float* %arrayidx4, align 4
165 %inc = add nuw nsw i32 %i.011, 1
; Leave the loop once the incremented index equals N.
166 %exitcond = icmp eq i32 %inc, %N
167 br i1 %exitcond, label %for.end, label %for.body
169 for.end: ; preds = %for.body, %entry
173 ; Integer loops are always vectorizable
174 ; CHECK: Checking a loop in "sumi_fast"
175 ; CHECK: We can vectorize this loop!
; sumi_fast: element-wise integer product, C[i] = A[i] * B[i] for i in [0, N).
; All three pointer arguments are noalias; A and B are only read.
176 define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
; Bypass the loop entirely when N is zero.
178 %cmp5 = icmp eq i32 %N, 0
179 br i1 %cmp5, label %for.end, label %for.body.preheader
181 for.body.preheader: ; preds = %entry
184 for.body: ; preds = %for.body.preheader, %for.body
; %i.06 is the induction variable: starts at 0 and is bumped by 1 per trip.
185 %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
186 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
187 %0 = load i32, i32* %arrayidx, align 4
188 %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
189 %1 = load i32, i32* %arrayidx1, align 4
; Pure integer arithmetic: no floating-point operation appears in this loop.
190 %mul = mul nsw i32 %1, %0
191 %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
192 store i32 %mul, i32* %arrayidx2, align 4
193 %inc = add nuw nsw i32 %i.06, 1
; Leave the loop once the incremented index equals N.
194 %exitcond = icmp eq i32 %inc, %N
195 br i1 %exitcond, label %for.end.loopexit, label %for.body
197 for.end.loopexit: ; preds = %for.body
200 for.end: ; preds = %for.end.loopexit, %entry
204 ; Floating-point loops are vectorizable with fast-math
205 ; CHECK: Checking a loop in "sumf_fast"
206 ; CHECK: We can vectorize this loop!
; sumf_fast: element-wise float product, C[i] = A[i] * B[i] for i in [0, N).
; The fmul below carries the `fast` flag.
207 define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
; Bypass the loop entirely when N is zero.
209 %cmp5 = icmp eq i32 %N, 0
210 br i1 %cmp5, label %for.end, label %for.body.preheader
212 for.body.preheader: ; preds = %entry
215 for.body: ; preds = %for.body.preheader, %for.body
; %i.06 is the induction variable: starts at 0 and is bumped by 1 per trip.
216 %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
217 %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
218 %0 = load float, float* %arrayidx, align 4
219 %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
220 %1 = load float, float* %arrayidx1, align 4
; Fast-math multiply: the only FP operation in the loop.
221 %mul = fmul fast float %1, %0
222 %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
223 store float %mul, float* %arrayidx2, align 4
224 %inc = add nuw nsw i32 %i.06, 1
; Leave the loop once the incremented index equals N.
225 %exitcond = icmp eq i32 %inc, %N
226 br i1 %exitcond, label %for.end.loopexit, label %for.body
228 for.end.loopexit: ; preds = %for.body
231 for.end: ; preds = %for.end.loopexit, %entry
235 ; Integer loops are always vectorizable
236 ; CHECK: Checking a loop in "redi_fast"
237 ; CHECK: We can vectorize this loop!
; redi_fast: integer reduction that accumulates a[i] * b[i] over i in [0, N).
; Both pointer arguments are noalias and only read.
238 define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
; Bypass the loop entirely when N is zero.
240 %cmp5 = icmp eq i32 %N, 0
241 br i1 %cmp5, label %for.end, label %for.body.preheader
243 for.body.preheader: ; preds = %entry
246 for.body: ; preds = %for.body.preheader, %for.body
; %i.07 is the induction variable; %Red.06 is the running accumulator, whose
; incoming value from the preheader is undef.
247 %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
248 %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
249 %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
250 %0 = load i32, i32* %arrayidx, align 4
251 %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
252 %1 = load i32, i32* %arrayidx1, align 4
; Integer multiply-accumulate: the product is added to the previous accumulator.
253 %mul = mul nsw i32 %1, %0
254 %add = add nsw i32 %mul, %Red.06
255 %inc = add nuw nsw i32 %i.07, 1
; Leave the loop once the incremented index equals N.
256 %exitcond = icmp eq i32 %inc, %N
257 br i1 %exitcond, label %for.end.loopexit, label %for.body
259 for.end.loopexit: ; preds = %for.body
; Single-entry phi that carries the final accumulator value out of the loop.
260 %add.lcssa = phi i32 [ %add, %for.body ]
263 for.end: ; preds = %for.end.loopexit, %entry
; Merge the zero-trip path (undef) with the value produced by the loop.
264 %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
268 ; Floating-point loops are vectorizable with fast-math
269 ; CHECK: Checking a loop in "redf_fast"
270 ; CHECK: We can vectorize this loop!
; redf_fast: float reduction that accumulates a[i] * b[i] over i in [0, N) and
; returns the accumulated value. Both the fmul and the fadd carry the `fast`
; flag.
271 define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
; Bypass the loop entirely when N is zero.
273 %cmp5 = icmp eq i32 %N, 0
274 br i1 %cmp5, label %for.end, label %for.body.preheader
276 for.body.preheader: ; preds = %entry
279 for.body: ; preds = %for.body.preheader, %for.body
; %i.07 is the induction variable; %Red.06 is the running accumulator, whose
; incoming value from the preheader is undef.
280 %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
281 %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
282 %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
283 %0 = load float, float* %arrayidx, align 4
284 %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
285 %1 = load float, float* %arrayidx1, align 4
; Fast-math multiply followed by a fast-math accumulate.
286 %mul = fmul fast float %1, %0
287 %add = fadd fast float %mul, %Red.06
288 %inc = add nuw nsw i32 %i.07, 1
; Leave the loop once the incremented index equals N.
289 %exitcond = icmp eq i32 %inc, %N
290 br i1 %exitcond, label %for.end.loopexit, label %for.body
292 for.end.loopexit: ; preds = %for.body
; Single-entry phi that carries the final accumulator value out of the loop.
293 %add.lcssa = phi float [ %add, %for.body ]
296 for.end: ; preds = %for.end.loopexit, %entry
; Merge the zero-trip path (undef) with the value produced by the loop.
297 %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
298 ret float %Red.0.lcssa
301 ; Make sure calls that turn into builtins are also covered
302 ; CHECK: Checking a loop in "fabs_fast"
303 ; CHECK: We can vectorize this loop!
; fabs_fast: C[i] = fabsf(B[i]) * A[i] for i in [0, N).
; The fabsf call is marked `fast` and uses attribute group #2; the fmul is
; marked `fast` as well.
304 define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
; Bypass the loop when N is zero; otherwise enter the loop body directly.
306 %cmp10 = icmp eq i32 %N, 0
307 br i1 %cmp10, label %for.end, label %for.body
309 for.body: ; preds = %entry, %for.body
; %i.011 is the induction variable: starts at 0 and is bumped by 1 per trip.
310 %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
311 %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
312 %0 = load float, float* %arrayidx, align 4
313 %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
314 %1 = load float, float* %arrayidx1, align 4
; Fast-math call to the externally declared fabsf on the element loaded from B.
315 %fabsf = tail call fast float @fabsf(float %1) #2
316 %conv3 = fmul fast float %fabsf, %0
317 %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
318 store float %conv3, float* %arrayidx4, align 4
319 %inc = add nuw nsw i32 %i.011, 1
; Leave the loop once the incremented index equals N.
320 %exitcond = icmp eq i32 %inc, %N
321 br i1 %exitcond, label %for.end, label %for.body
323 for.end: ; preds = %for.body, %entry
; External declaration of fabsf, the callee used by @fabs and @fabs_fast.
327 declare float @fabsf(float)
; Group #1 is attached to the fabsf call in @fabs. It sets "unsafe-fp-math",
; "no-infs-fp-math" and "no-nans-fp-math" to "false".
329 attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
; Group #2 is attached to the fabsf call in @fabs_fast. It sets the same three
; floating-point options to "true"; every other entry matches group #1.
330 attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }