1 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MAIN
2 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
3 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8,+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP64
4 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
5 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
7 ; CHECK-LABEL: skip_call
8 ; CHECK-NOT: call void @llvm.set.loop.iterations
9 ; CHECK-NOT: call i32 @llvm.loop.decrement
; Loop whose body makes a real (non-intrinsic) call to the external @bar and
; sums the returned values. The leading icmp/br sends control straight to
; %while.end when %n is 0; otherwise %i.08 counts up from 0 by 1 until it
; equals %n.
11 define i32 @skip_call(i32 %n) {
13 %cmp6 = icmp eq i32 %n, 0
14 br i1 %cmp6, label %while.end, label %while.body.preheader
20 %i.08 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
21 %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
; The callee is the variadic declaration of @bar, bitcast to a zero-argument
; function type.
22 %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #2
23 %add = add nsw i32 %call, %res.07
24 %inc1 = add nuw i32 %i.08, 1
25 %exitcond = icmp eq i32 %inc1, %n
26 br i1 %exitcond, label %while.end.loopexit, label %while.body
; Result is 0 when arriving from %entry, otherwise the accumulated sum.
32 %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
36 ; CHECK-LABEL: test_target_specific
37 ; CHECK: call void @llvm.set.loop.iterations.i32(i32 50)
38 ; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ 50, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
39 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
40 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
41 ; CHECK: br i1 [[CMP]], label %loop, label %exit
; Loop that calls the ARM-specific intrinsic llvm.arm.smlad on i32 values
; loaded from %a and %b, passing %acc as the third operand and feeding the
; result back into %acc. %count starts at 0 and advances by 2 until it equals
; 100, i.e. 50 trips through the loop.
43 define i32 @test_target_specific(i32* %a, i32* %b) {
47 %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
48 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
49 %addr.a = getelementptr i32, i32* %a, i32 %count
50 %addr.b = getelementptr i32, i32* %b, i32 %count
51 %load.a = load i32, i32* %addr.a
52 %load.b = load i32, i32* %addr.b
53 %res = call i32 @llvm.arm.smlad(i32 %load.a, i32 %load.b, i32 %acc)
; Step of 2 (not 1) with an exit bound of 100.
54 %count.next = add nuw i32 %count, 2
55 %cmp = icmp ne i32 %count.next, 100
56 br i1 %cmp, label %loop, label %exit
61 ; CHECK-LABEL: test_fabs_f16
62 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
63 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
64 ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
65 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
; Loop that loads a half from %a, applies llvm.fabs.f16 and stores the result
; to the same index of %b. %count runs from 0 in steps of 1 until it equals
; 100.
66 define void @test_fabs_f16(half* %a, half* %b) {
70 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
71 %addr.a = getelementptr half, half* %a, i32 %count
72 %load.a = load half, half* %addr.a
73 %abs = call half @llvm.fabs.f16(half %load.a)
74 %addr.b = getelementptr half, half* %b, i32 %count
75 store half %abs, half *%addr.b
76 %count.next = add nuw i32 %count, 1
77 %cmp = icmp ne i32 %count.next, 100
78 br i1 %cmp, label %loop, label %exit
83 ; CHECK-LABEL: test_fabs
84 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
85 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
86 ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
87 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
; Loop that loads a float from %a, applies llvm.fabs.f32 and adds the result
; to the running value %acc (initially 0.0). %count runs from 0 in steps of 1
; until it equals 100.
89 define float @test_fabs(float* %a) {
93 %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
94 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
95 %addr.a = getelementptr float, float* %a, i32 %count
96 %load.a = load float, float* %addr.a
97 %abs = call float @llvm.fabs.f32(float %load.a)
98 %res = fadd float %abs, %acc
99 %count.next = add nuw i32 %count, 1
100 %cmp = icmp ne i32 %count.next, 100
101 br i1 %cmp, label %loop, label %exit
106 ; CHECK-LABEL: test_fabs_64
107 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
108 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
109 ; CHECK-FP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
110 ; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
111 ; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
; Double-precision counterpart of the fabs store loop: loads a double from
; %a, applies llvm.fabs.f64 and stores the result to the same index of %b.
; %count runs from 0 in steps of 1 until it equals 100.
112 define void @test_fabs_64(double* %a, double* %b) {
116 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
117 %addr.a = getelementptr double, double* %a, i32 %count
118 %load.a = load double, double* %addr.a
119 %abs = call double @llvm.fabs.f64(double %load.a)
120 %addr.b = getelementptr double, double* %b, i32 %count
121 store double %abs, double *%addr.b
122 %count.next = add nuw i32 %count, 1
123 %cmp = icmp ne i32 %count.next, 100
124 br i1 %cmp, label %loop, label %exit
129 ; CHECK-LABEL: test_fabs_vec
130 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
131 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
132 ; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
133 ; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
134 ; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
135 ; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
; Vector form of the fabs accumulation loop: loads a <4 x float> from %a,
; applies llvm.fabs.v4f32 and adds the result to %acc (initially
; zeroinitializer). %count runs from 0 in steps of 1 until it equals 100.
136 define <4 x float> @test_fabs_vec(<4 x float>* %a) {
140 %acc = phi <4 x float> [ zeroinitializer, %entry ], [ %res, %loop ]
141 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
142 %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
143 %load.a = load <4 x float>, <4 x float>* %addr.a
144 %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %load.a)
145 %res = fadd <4 x float> %abs, %acc
146 %count.next = add nuw i32 %count, 1
147 %cmp = icmp ne i32 %count.next, 100
148 br i1 %cmp, label %loop, label %exit
153 ; CHECK-LABEL: test_log
154 ; CHECK-NOT: call void @llvm.set.loop.iterations
155 ; CHECK-NOT: llvm.loop.decrement
; Loop that loads a float from %a, applies llvm.log.f32 and adds the result
; to the running value %acc (initially 0.0). %count runs from 0 in steps of 1
; until it equals 100.
156 define float @test_log(float* %a) {
160 %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
161 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
162 %addr.a = getelementptr float, float* %a, i32 %count
163 %load.a = load float, float* %addr.a
; Note: the value is named %abs but it is the result of llvm.log.f32.
164 %abs = call float @llvm.log.f32(float %load.a)
165 %res = fadd float %abs, %acc
166 %count.next = add nuw i32 %count, 1
167 %cmp = icmp ne i32 %count.next, 100
168 br i1 %cmp, label %loop, label %exit
173 ; CHECK-LABEL: test_sqrt_16
174 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
175 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
176 ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
177 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
178 ; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
; Loop that loads a half from %a, applies llvm.sqrt.f16 and stores the result
; to the same index of %b. %count runs from 0 in steps of 1 until it equals
; 100.
179 define void @test_sqrt_16(half* %a, half* %b) {
183 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
184 %addr.a = getelementptr half, half* %a, i32 %count
185 %load.a = load half, half* %addr.a
186 %sqrt = call half @llvm.sqrt.f16(half %load.a)
187 %addr.b = getelementptr half, half* %b, i32 %count
188 store half %sqrt, half *%addr.b
189 %count.next = add nuw i32 %count, 1
190 %cmp = icmp ne i32 %count.next, 100
191 br i1 %cmp, label %loop, label %exit
195 ; CHECK-LABEL: test_sqrt
196 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
197 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
198 ; CHECK-FP: call void @llvm.set.loop.iterations
199 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
200 ; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
201 ; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
202 ; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
203 ; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
; Loop that loads a float from %a, applies llvm.sqrt.f32 and stores the
; result to the same index of %b. %count runs from 0 in steps of 1 until it
; equals 100.
204 define void @test_sqrt(float* %a, float* %b) {
208 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
209 %addr.a = getelementptr float, float* %a, i32 %count
210 %load.a = load float, float* %addr.a
211 %sqrt = call float @llvm.sqrt.f32(float %load.a)
212 %addr.b = getelementptr float, float* %b, i32 %count
213 store float %sqrt, float* %addr.b
214 %count.next = add nuw i32 %count, 1
215 %cmp = icmp ne i32 %count.next, 100
216 br i1 %cmp, label %loop, label %exit
221 ; CHECK-LABEL: test_sqrt_64
222 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
223 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
224 ; CHECK-FP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
225 ; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
226 ; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
; Double-precision counterpart of the sqrt store loop: loads a double from
; %a, applies llvm.sqrt.f64 and stores the result to the same index of %b.
; %count runs from 0 in steps of 1 until it equals 100.
227 define void @test_sqrt_64(double* %a, double* %b) {
231 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
232 %addr.a = getelementptr double, double* %a, i32 %count
233 %load.a = load double, double* %addr.a
234 %sqrt = call double @llvm.sqrt.f64(double %load.a)
235 %addr.b = getelementptr double, double* %b, i32 %count
236 store double %sqrt, double *%addr.b
237 %count.next = add nuw i32 %count, 1
238 %cmp = icmp ne i32 %count.next, 100
239 br i1 %cmp, label %loop, label %exit
244 ; CHECK-LABEL: test_sqrt_vec
245 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
246 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
247 ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
248 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
; Vector form of the sqrt store loop: loads a <4 x float> from %a, applies
; llvm.sqrt.v4f32 and stores the result to the same index of %b. %count runs
; from 0 in steps of 1 until it equals 100.
249 define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) {
253 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
254 %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
255 %load.a = load <4 x float>, <4 x float>* %addr.a
256 %sqrt = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %load.a)
257 %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
258 store <4 x float> %sqrt, <4 x float>* %addr.b
259 %count.next = add nuw i32 %count, 1
260 %cmp = icmp ne i32 %count.next, 100
261 br i1 %cmp, label %loop, label %exit
266 ; CHECK-LABEL: test_overflow
267 ; CHECK: call void @llvm.set.loop.iterations
; Loop that calls llvm.sadd.with.overflow.i32 on i32 values loaded from %a
; and %b and extracts element 0 (the i32 member of the returned {i32, i1}
; pair) as %res. %count runs from 0 in steps of 1 until it equals 100.
; NOTE(review): %acc is carried by its phi but is not an operand of any
; instruction shown in this loop.
268 define i32 @test_overflow(i32* %a, i32* %b) {
272 %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
273 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
274 %addr.a = getelementptr i32, i32* %a, i32 %count
275 %addr.b = getelementptr i32, i32* %b, i32 %count
276 %load.a = load i32, i32* %addr.a
277 %load.b = load i32, i32* %addr.b
278 %sadd = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %load.a, i32 %load.b)
279 %res = extractvalue {i32, i1} %sadd, 0
280 %count.next = add nuw i32 %count, 1
281 %cmp = icmp ne i32 %count.next, 100
282 br i1 %cmp, label %loop, label %exit
287 ; TODO: We should be able to generate a qadd/sub
288 ; CHECK-LABEL: test_sat
289 ; CHECK: call void @llvm.set.loop.iterations.i32(i32 100)
; Loop that calls llvm.sadd.sat.i32 on i32 values loaded from %a and %b.
; %count runs from 0 in steps of 1 until it equals 100.
; NOTE(review): %acc is carried by its phi but is not an operand of any
; instruction shown in this loop.
290 define i32 @test_sat(i32* %a, i32* %b) {
294 %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
295 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
296 %addr.a = getelementptr i32, i32* %a, i32 %count
297 %addr.b = getelementptr i32, i32* %b, i32 %count
298 %load.a = load i32, i32* %addr.a
299 %load.b = load i32, i32* %addr.b
300 %res = call i32 @llvm.sadd.sat.i32(i32 %load.a, i32 %load.b)
301 %count.next = add nuw i32 %count, 1
302 %cmp = icmp ne i32 %count.next, 100
303 br i1 %cmp, label %loop, label %exit
308 ; CHECK-LABEL: test_masked_i32
309 ; CHECK-NOT: call void @llvm.set.loop.iterations
310 ; CHECK-MVEFP: call void @llvm.set.loop.iterations
311 ; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
312 ; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
313 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
314 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
315 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
; Loop built from masked vector memory intrinsics on <4 x i32>: two
; llvm.masked.load calls (from %a and %b, using %mask and %passthru), an
; integer vector add, and an llvm.masked.store of the sum to %c. %count runs
; from 0 in steps of 1 until it equals 100.
316 define arm_aapcs_vfpcc void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) {
320 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
321 %addr.a = getelementptr <4 x i32>, <4 x i32>* %a, i32 %count
322 %addr.b = getelementptr <4 x i32>, <4 x i32>* %b, i32 %count
323 %addr.c = getelementptr <4 x i32>, <4 x i32>* %c, i32 %count
; The i32 4 operand is the alignment argument of the masked load/store
; intrinsics.
324 %load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
325 %load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
326 %res = add <4 x i32> %load.a, %load.b
327 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %res, <4 x i32>* %addr.c, i32 4, <4 x i1> %mask)
328 %count.next = add nuw i32 %count, 1
329 %cmp = icmp ne i32 %count.next, 100
330 br i1 %cmp, label %loop, label %exit
335 ; CHECK-LABEL: test_masked_f32
336 ; CHECK-NOT: call void @llvm.set.loop.iterations
337 ; CHECK-MVEFP: call void @llvm.set.loop.iterations
338 ; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
339 ; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
340 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
341 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
342 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
; Floating-point counterpart of the masked load/store loop: two
; llvm.masked.load calls on <4 x float> (from %a and %b, using %mask and
; %passthru), a vector fadd, and an llvm.masked.store of the sum to %c.
; %count runs from 0 in steps of 1 until it equals 100.
343 define arm_aapcs_vfpcc void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* %c, <4 x float> %passthru) {
347 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
348 %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
349 %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
350 %addr.c = getelementptr <4 x float>, <4 x float>* %c, i32 %count
351 %load.a = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
352 %load.b = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
353 %res = fadd <4 x float> %load.a, %load.b
354 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %res, <4 x float>* %addr.c, i32 4, <4 x i1> %mask)
355 %count.next = add nuw i32 %count, 1
356 %cmp = icmp ne i32 %count.next, 100
357 br i1 %cmp, label %loop, label %exit
362 ; CHECK-LABEL: test_gather_scatter
363 ; CHECK-NOT: call void @llvm.set.loop.iterations
364 ; CHECK-MVEFP: call void @llvm.set.loop.iterations
365 ; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
366 ; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
367 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
368 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
369 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
; Loop built from llvm.masked.gather / llvm.masked.scatter on vectors of
; float pointers. The pointer vectors %a, %b and %c are used unchanged on
; every iteration; %count only drives the exit test, running from 0 in steps
; of 1 until it equals 100.
370 define arm_aapcs_vfpcc void @test_gather_scatter(<4 x i1> %mask, <4 x float*> %a, <4 x float*> %b, <4 x float*> %c, <4 x float> %passthru) {
374 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
375 %load.a = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
376 %load.b = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
377 %res = fadd <4 x float> %load.a, %load.b
378 call void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float> %res, <4 x float*> %c, i32 4, <4 x i1> %mask)
379 %count.next = add nuw i32 %count, 1
380 %cmp = icmp ne i32 %count.next, 100
381 br i1 %cmp, label %loop, label %exit
; Declarations for the external function and the intrinsics called by the
; test functions in this file.
; @bar is an ordinary variadic external function; attribute group #1 is
; referenced here but its definition is not among these declarations.
386 declare i32 @bar(...) local_unnamed_addr #1
; ARM target-specific intrinsic.
387 declare i32 @llvm.arm.smlad(i32, i32, i32)
; Scalar and vector floating-point intrinsics (fabs, log, sqrt).
388 declare half @llvm.fabs.f16(half)
389 declare float @llvm.fabs.f32(float)
390 declare double @llvm.fabs.f64(double)
391 declare float @llvm.log.f32(float)
392 declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
393 declare half @llvm.sqrt.f16(half)
394 declare float @llvm.sqrt.f32(float)
395 declare double @llvm.sqrt.f64(double)
396 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
; Saturating and overflow-reporting signed integer addition.
397 declare i32 @llvm.sadd.sat.i32(i32, i32)
398 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
; Masked vector load/store and gather/scatter intrinsics.
399 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
400 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
401 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
402 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
403 declare <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
404 declare void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)