1 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MAIN
2 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
3 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fp-armv8,+fullfp16 -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP64
4 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
5 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
6 ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+lob,+mve.fp -disable-arm-loloops=false %s -o - | FileCheck %s --check-prefix=CHECK-LLC
8 ; CHECK-LABEL: skip_call
9 ; CHECK-NOT: call void @llvm.set.loop.iterations
10 ; CHECK-NOT: call i32 @llvm.loop.decrement
; Counted loop (trip count %n) whose body tail-calls the external function
; @bar and sums the returned values. The checks directly above this function
; require that neither the set.loop.iterations nor the loop.decrement
; intrinsic is emitted for it.
12 define i32 @skip_call(i32 %n) {
; Guard: a zero trip count branches straight to while.end.
14 %cmp6 = icmp eq i32 %n, 0
15 br i1 %cmp6, label %while.end, label %while.body.preheader
; %i.08 is the induction variable, %res.07 the running sum.
21 %i.08 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
22 %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
; Call through a bitcast of the variadic declaration of @bar.
23 %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #2
24 %add = add nsw i32 %call, %res.07
25 %inc1 = add nuw i32 %i.08, 1
26 %exitcond = icmp eq i32 %inc1, %n
27 br i1 %exitcond, label %while.end.loopexit, label %while.body
; Merges 0 (incoming from %entry) with the accumulated sum from the loop exit.
33 %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
37 ; CHECK-LABEL: test_target_specific
38 ; CHECK: call void @llvm.set.loop.iterations.i32(i32 50)
39 ; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ 50, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
40 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
41 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
42 ; CHECK: br i1 [[CMP]], label %loop, label %exit
44 ; CHECK-LLC-LABEL: test_target_specific:
45 ; CHECK-LLC: mov.w lr, #50
46 ; CHECK-LLC: dls lr, lr
47 ; CHECK-LLC-NOT: mov lr,
48 ; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9_]+]]:
49 ; CHECK-LLC: le lr, [[LOOP_HEADER]]
; Loop that accumulates through the ARM-specific @llvm.arm.smlad intrinsic.
; The counter advances by 2 and the loop exits when it reaches 100; the checks
; above this function expect a set.loop.iterations count of 50 from opt and a
; dls/le pair on lr from llc.
53 define i32 @test_target_specific(i32* %a, i32* %b) {
57 %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
58 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
59 %addr.a = getelementptr i32, i32* %a, i32 %count
60 %addr.b = getelementptr i32, i32* %b, i32 %count
61 %load.a = load i32, i32* %addr.a
62 %load.b = load i32, i32* %addr.b
; %acc is passed as the third operand, so %res carries the running value.
63 %res = call i32 @llvm.arm.smlad(i32 %load.a, i32 %load.b, i32 %acc)
; Step of 2 with an exit bound of 100.
64 %count.next = add nuw i32 %count, 2
65 %cmp = icmp ne i32 %count.next, 100
66 br i1 %cmp, label %loop, label %exit
71 ; CHECK-LABEL: test_fabs_f16
72 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
73 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
74 ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
75 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
; Loop that loads a half via %a, applies @llvm.fabs.f16 and stores the result
; via %b; the counter runs from 0 in steps of 1 until it equals 100.
; Per the checks above this function: no set.loop.iterations is expected for
; the opt invocation without -mattr or the one with +mve, while the +fullfp16
; and +mve.fp invocations expect set.loop.iterations with a count of 100.
76 define void @test_fabs_f16(half* %a, half* %b) {
80 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
81 %addr.a = getelementptr half, half* %a, i32 %count
82 %load.a = load half, half* %addr.a
83 %abs = call half @llvm.fabs.f16(half %load.a)
84 %addr.b = getelementptr half, half* %b, i32 %count
85 store half %abs, half *%addr.b
86 %count.next = add nuw i32 %count, 1
87 %cmp = icmp ne i32 %count.next, 100
88 br i1 %cmp, label %loop, label %exit
93 ; CHECK-LABEL: test_fabs
94 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
95 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
96 ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
97 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
99 ; CHECK-LLC-LABEL: test_fabs:
100 ; CHECK-LLC: mov.w lr, #100
101 ; CHECK-LLC: dls lr, lr
102 ; CHECK-LLC-NOT: mov lr,
103 ; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9_]+]]:
105 ; CHECK-LLC: le lr, [[LOOP_HEADER]]
; Loop that loads a float via %a, applies @llvm.fabs.f32 and adds the result
; into the float accumulator %acc; the counter runs from 0 in steps of 1 until
; it equals 100.
; Per the checks above this function: no set.loop.iterations for the opt
; invocation without -mattr or the one with +mve; a count of 100 for the
; +fullfp16 and +mve.fp invocations; and the llc output is expected to contain
; mov.w lr, #100 followed by a dls/le pair on lr.
109 define float @test_fabs(float* %a) {
113 %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
114 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
115 %addr.a = getelementptr float, float* %a, i32 %count
116 %load.a = load float, float* %addr.a
117 %abs = call float @llvm.fabs.f32(float %load.a)
118 %res = fadd float %abs, %acc
119 %count.next = add nuw i32 %count, 1
120 %cmp = icmp ne i32 %count.next, 100
121 br i1 %cmp, label %loop, label %exit
126 ; CHECK-LABEL: test_fabs_64
127 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
128 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
129 ; CHECK-FP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
130 ; CHECK-FP64: void @llvm.set.loop.iterations.i32(i32 100)
131 ; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
; Loop that loads a double via %a, applies @llvm.fabs.f64 and stores the
; result via %b; the counter runs from 0 in steps of 1 until it equals 100.
; Per the checks above this function: only the opt invocation with
; +fp-armv8,+fullfp16 has a positive check for set.loop.iterations (count
; 100); the invocation without -mattr and the +mve, +fullfp16 and +mve.fp
; invocations all have negative checks.
132 define void @test_fabs_64(double* %a, double* %b) {
136 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
137 %addr.a = getelementptr double, double* %a, i32 %count
138 %load.a = load double, double* %addr.a
139 %abs = call double @llvm.fabs.f64(double %load.a)
140 %addr.b = getelementptr double, double* %b, i32 %count
141 store double %abs, double *%addr.b
142 %count.next = add nuw i32 %count, 1
143 %cmp = icmp ne i32 %count.next, 100
144 br i1 %cmp, label %loop, label %exit
149 ; CHECK-LABEL: test_fabs_vec
150 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
151 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
152 ; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
153 ; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
154 ; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
155 ; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
; Loop that loads a <4 x float> via %a, applies @llvm.fabs.v4f32 and adds the
; result into the vector accumulator %acc; the counter runs from 0 in steps of
; 1 until it equals 100.
; Per the checks above this function: the +mve opt invocation expects no
; set.loop.iterations, while the +mve.fp invocation expects the full sequence
; (set.loop.iterations of 100, a counter phi starting at 100,
; loop.decrement.reg by 1, an icmp ne against 0 and the back-edge branch).
156 define <4 x float> @test_fabs_vec(<4 x float>* %a) {
160 %acc = phi <4 x float> [ zeroinitializer, %entry ], [ %res, %loop ]
161 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
162 %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
163 %load.a = load <4 x float>, <4 x float>* %addr.a
164 %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %load.a)
165 %res = fadd <4 x float> %abs, %acc
166 %count.next = add nuw i32 %count, 1
167 %cmp = icmp ne i32 %count.next, 100
168 br i1 %cmp, label %loop, label %exit
173 ; CHECK-LABEL: test_log
174 ; CHECK-NOT: call void @llvm.set.loop.iterations
175 ; CHECK-NOT: llvm.loop.decrement
; Loop that loads a float via %a, applies @llvm.log.f32 and adds the result
; into the float accumulator %acc; the counter runs from 0 in steps of 1 until
; it equals 100.
; The checks above this function use the prefix shared by all five opt
; invocations and require that neither set.loop.iterations nor loop.decrement
; appears.
176 define float @test_log(float* %a) {
180 %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
181 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
182 %addr.a = getelementptr float, float* %a, i32 %count
183 %load.a = load float, float* %addr.a
; The value is named %abs but holds the result of the log intrinsic.
184 %abs = call float @llvm.log.f32(float %load.a)
185 %res = fadd float %abs, %acc
186 %count.next = add nuw i32 %count, 1
187 %cmp = icmp ne i32 %count.next, 100
188 br i1 %cmp, label %loop, label %exit
193 ; CHECK-LABEL: test_sqrt_16
194 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
195 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
196 ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
197 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
198 ; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
; Loop that loads a half via %a, applies @llvm.sqrt.f16 and stores the result
; via %b; the counter runs from 0 in steps of 1 until it equals 100.
; Per the checks above this function: no set.loop.iterations for the opt
; invocation without -mattr or the one with +mve; a count of 100 is expected
; for the +fullfp16, +mve.fp and +fp-armv8,+fullfp16 invocations.
199 define void @test_sqrt_16(half* %a, half* %b) {
203 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
204 %addr.a = getelementptr half, half* %a, i32 %count
205 %load.a = load half, half* %addr.a
206 %sqrt = call half @llvm.sqrt.f16(half %load.a)
207 %addr.b = getelementptr half, half* %b, i32 %count
208 store half %sqrt, half *%addr.b
209 %count.next = add nuw i32 %count, 1
210 %cmp = icmp ne i32 %count.next, 100
211 br i1 %cmp, label %loop, label %exit
215 ; CHECK-LABEL: test_sqrt
216 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
217 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
218 ; CHECK-FP: call void @llvm.set.loop.iterations
219 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
220 ; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
221 ; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
222 ; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
223 ; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
; Loop that loads a float via %a, applies @llvm.sqrt.f32 and stores the result
; via %b; the counter runs from 0 in steps of 1 until it equals 100.
; Per the checks above this function: no set.loop.iterations for the opt
; invocation without -mattr or the one with +mve; the +fullfp16 invocation
; expects set.loop.iterations; the +mve.fp invocation expects the full
; sequence (count of 100, counter phi, loop.decrement.reg by 1, icmp ne
; against 0 and the back-edge branch).
224 define void @test_sqrt(float* %a, float* %b) {
228 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
229 %addr.a = getelementptr float, float* %a, i32 %count
230 %load.a = load float, float* %addr.a
231 %sqrt = call float @llvm.sqrt.f32(float %load.a)
232 %addr.b = getelementptr float, float* %b, i32 %count
233 store float %sqrt, float* %addr.b
234 %count.next = add nuw i32 %count, 1
235 %cmp = icmp ne i32 %count.next, 100
236 br i1 %cmp, label %loop, label %exit
241 ; CHECK-LABEL: test_sqrt_64
242 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
243 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
244 ; CHECK-FP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
245 ; CHECK-MVEFP-NOT: call void @llvm.set.loop.iterations.i32(i32 100)
246 ; CHECK-FP64: call void @llvm.set.loop.iterations.i32(i32 100)
; Loop that loads a double via %a, applies @llvm.sqrt.f64 and stores the
; result via %b; the counter runs from 0 in steps of 1 until it equals 100.
; Per the checks above this function: only the opt invocation with
; +fp-armv8,+fullfp16 has a positive check for set.loop.iterations (count
; 100); the invocation without -mattr and the +mve, +fullfp16 and +mve.fp
; invocations all have negative checks.
247 define void @test_sqrt_64(double* %a, double* %b) {
251 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
252 %addr.a = getelementptr double, double* %a, i32 %count
253 %load.a = load double, double* %addr.a
254 %sqrt = call double @llvm.sqrt.f64(double %load.a)
255 %addr.b = getelementptr double, double* %b, i32 %count
256 store double %sqrt, double *%addr.b
257 %count.next = add nuw i32 %count, 1
258 %cmp = icmp ne i32 %count.next, 100
259 br i1 %cmp, label %loop, label %exit
264 ; CHECK-LABEL: test_sqrt_vec
265 ; CHECK-MAIN-NOT: call void @llvm.set.loop.iterations
266 ; CHECK-MVE-NOT: call void @llvm.set.loop.iterations
267 ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 100)
268 ; CHECK-MVEFP: call void @llvm.set.loop.iterations.i32(i32 100)
; Loop that loads a <4 x float> via %a, applies @llvm.sqrt.v4f32 and stores
; the result via %b; the counter runs from 0 in steps of 1 until it equals 100.
; Per the checks above this function: no set.loop.iterations for the opt
; invocation without -mattr or the one with +mve; a count of 100 is expected
; for the +fullfp16 and +mve.fp invocations.
269 define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) {
273 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
274 %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
275 %load.a = load <4 x float>, <4 x float>* %addr.a
276 %sqrt = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %load.a)
277 %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
278 store <4 x float> %sqrt, <4 x float>* %addr.b
279 %count.next = add nuw i32 %count, 1
280 %cmp = icmp ne i32 %count.next, 100
281 br i1 %cmp, label %loop, label %exit
286 ; CHECK-LABEL: test_overflow
287 ; CHECK: call void @llvm.set.loop.iterations
; Loop whose body calls @llvm.sadd.with.overflow.i32 on two loaded i32 values;
; the counter runs from 0 in steps of 1 until it equals 100.
; The check above this function uses the prefix shared by all five opt
; invocations and expects set.loop.iterations to be emitted.
288 define i32 @test_overflow(i32* %a, i32* %b) {
; %acc receives %res on the back edge.
292 %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
293 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
294 %addr.a = getelementptr i32, i32* %a, i32 %count
295 %addr.b = getelementptr i32, i32* %b, i32 %count
296 %load.a = load i32, i32* %addr.a
297 %load.b = load i32, i32* %addr.b
298 %sadd = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %load.a, i32 %load.b)
; Only field 0 of the {i32, i1} result is extracted in the lines visible here.
299 %res = extractvalue {i32, i1} %sadd, 0
300 %count.next = add nuw i32 %count, 1
301 %cmp = icmp ne i32 %count.next, 100
302 br i1 %cmp, label %loop, label %exit
307 ; TODO: We should be able to generate a qadd/sub
308 ; CHECK-LABEL: test_sat
309 ; CHECK: call void @llvm.set.loop.iterations.i32(i32 100)
; Loop whose body calls @llvm.sadd.sat.i32 on two loaded i32 values; the
; counter runs from 0 in steps of 1 until it equals 100.
; The check above this function uses the prefix shared by all five opt
; invocations and expects set.loop.iterations with a count of 100.
310 define i32 @test_sat(i32* %a, i32* %b) {
; %acc receives %res on the back edge.
314 %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
315 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
316 %addr.a = getelementptr i32, i32* %a, i32 %count
317 %addr.b = getelementptr i32, i32* %b, i32 %count
318 %load.a = load i32, i32* %addr.a
319 %load.b = load i32, i32* %addr.b
320 %res = call i32 @llvm.sadd.sat.i32(i32 %load.a, i32 %load.b)
321 %count.next = add nuw i32 %count, 1
322 %cmp = icmp ne i32 %count.next, 100
323 br i1 %cmp, label %loop, label %exit
328 ; CHECK-LABEL: test_masked_i32
329 ; CHECK-NOT: call void @llvm.set.loop.iterations
330 ; CHECK-MVEFP: call void @llvm.set.loop.iterations
331 ; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
332 ; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
333 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
334 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
335 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
; Loop that performs two masked loads of <4 x i32> (via %a and %b), adds them
; and writes the sum with a masked store via %c. All three masked operations
; use the same %mask argument and an alignment operand of 4; the loads use
; %passthru. The counter runs from 0 in steps of 1 until it equals 100.
; The checks above this function begin with a shared negative check for
; set.loop.iterations, followed by positive checks for the +mve.fp opt
; invocation (set.loop.iterations) and the +mve invocation (count of 100,
; counter phi, loop.decrement.reg by 1, icmp ne against 0, back-edge branch).
336 define void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) {
340 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
341 %addr.a = getelementptr <4 x i32>, <4 x i32>* %a, i32 %count
342 %addr.b = getelementptr <4 x i32>, <4 x i32>* %b, i32 %count
343 %addr.c = getelementptr <4 x i32>, <4 x i32>* %c, i32 %count
344 %load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
345 %load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
346 %res = add <4 x i32> %load.a, %load.b
347 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %res, <4 x i32>* %addr.c, i32 4, <4 x i1> %mask)
348 %count.next = add nuw i32 %count, 1
349 %cmp = icmp ne i32 %count.next, 100
350 br i1 %cmp, label %loop, label %exit
355 ; CHECK-LABEL: test_masked_f32
356 ; CHECK-NOT: call void @llvm.set.loop.iterations
357 ; CHECK-MVEFP: call void @llvm.set.loop.iterations
358 ; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
359 ; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
360 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
361 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
362 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
; Loop that performs two masked loads of <4 x float> (via %a and %b), adds
; them with fadd and writes the sum with a masked store via %c. All three
; masked operations use the same %mask argument and an alignment operand of 4;
; the loads use %passthru. The counter runs from 0 in steps of 1 until it
; equals 100.
; The checks above this function begin with a shared negative check for
; set.loop.iterations, followed by positive checks for the +mve.fp opt
; invocation (set.loop.iterations) and the +mve invocation (count of 100,
; counter phi, loop.decrement.reg by 1, icmp ne against 0, back-edge branch).
363 define void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* %c, <4 x float> %passthru) {
367 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
368 %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
369 %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
370 %addr.c = getelementptr <4 x float>, <4 x float>* %c, i32 %count
371 %load.a = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
372 %load.b = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
373 %res = fadd <4 x float> %load.a, %load.b
374 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %res, <4 x float>* %addr.c, i32 4, <4 x i1> %mask)
375 %count.next = add nuw i32 %count, 1
376 %cmp = icmp ne i32 %count.next, 100
377 br i1 %cmp, label %loop, label %exit
382 ; CHECK-LABEL: test_gather_scatter
383 ; CHECK-NOT: call void @llvm.set.loop.iterations
384 ; CHECK-MVEFP: call void @llvm.set.loop.iterations
385 ; CHECK-MVE: call void @llvm.set.loop.iterations.i32(i32 100)
386 ; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ 100, %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
387 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1)
388 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
389 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
; Loop that performs two masked gathers of <4 x float> (from the pointer
; vectors %a and %b), adds them with fadd and writes the sum with a masked
; scatter to the pointer vector %c. The pointer vectors are passed to the
; intrinsics unchanged on every iteration (no indexing by %count). The counter
; runs from 0 in steps of 1 until it equals 100.
; The checks above this function begin with a shared negative check for
; set.loop.iterations, followed by positive checks for the +mve.fp opt
; invocation (set.loop.iterations) and the +mve invocation (count of 100,
; counter phi, loop.decrement.reg by 1, icmp ne against 0, back-edge branch).
390 define void @test_gather_scatter(<4 x i1> %mask, <4 x float*> %a, <4 x float*> %b, <4 x float*> %c, <4 x float> %passthru) {
394 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
395 %load.a = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
396 %load.b = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
397 %res = fadd <4 x float> %load.a, %load.b
398 call void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float> %res, <4 x float*> %c, i32 4, <4 x i1> %mask)
399 %count.next = add nuw i32 %count, 1
400 %cmp = icmp ne i32 %count.next, 100
401 br i1 %cmp, label %loop, label %exit
; External declarations referenced by the test functions in this file.
; @bar is the variadic callee used by skip_call.
406 declare i32 @bar(...) local_unnamed_addr #1
; ARM target-specific intrinsic used by test_target_specific.
407 declare i32 @llvm.arm.smlad(i32, i32, i32)
; Floating-point intrinsics (fabs, log, sqrt) at half/float/double/<4 x float>.
408 declare half @llvm.fabs.f16(half)
409 declare float @llvm.fabs.f32(float)
410 declare double @llvm.fabs.f64(double)
411 declare float @llvm.log.f32(float)
412 declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
413 declare half @llvm.sqrt.f16(half)
414 declare float @llvm.sqrt.f32(float)
415 declare double @llvm.sqrt.f64(double)
416 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
; Integer saturating add and add-with-overflow intrinsics.
417 declare i32 @llvm.sadd.sat.i32(i32, i32)
418 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
; Masked load/store and gather/scatter intrinsics on 4-element vectors.
419 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
420 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
421 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
422 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
423 declare <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
424 declare void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)