1 ; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MAIN
2 ; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+fullfp16 -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
3 ; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+fp-armv8,+fullfp16 -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP64
4 ; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
5 ; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -hardware-loops %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
6 ; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -hardware-loops -disable-arm-loloops=true %s -S -o - | FileCheck %s --check-prefix=DISABLED
8 ; DISABLED-NOT: call i32 @llvm.loop.decrement
; skip_call: the loop body makes a tail call to the external function @bar on
; every iteration while %i.08 counts from 0 up to %n. For each invocation that
; uses the common prefix, the output after this label must contain neither a
; start.loop.iterations call nor a loop.decrement call.
10 ; CHECK-LABEL: skip_call
11 ; CHECK-NOT: call i32 @llvm.start.loop.iterations
12 ; CHECK-NOT: call i32 @llvm.loop.decrement
14 define i32 @skip_call(i32 %n) {
16 %cmp6 = icmp eq i32 %n, 0
17 br i1 %cmp6, label %while.end, label %while.body.preheader
23 %i.08 = phi i32 [ %inc1, %while.body ], [ 0, %while.body.preheader ]
24 %res.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
25 %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #2
26 %add = add nsw i32 %call, %res.07
27 %inc1 = add nuw i32 %i.08, 1
28 %exitcond = icmp eq i32 %inc1, %n
29 br i1 %exitcond, label %while.end.loopexit, label %while.body
35 %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.end.loopexit ]
; test_target_specific: the only call in the loop body is the ARM intrinsic
; @llvm.arm.smlad. All invocations using the common prefix expect the loop to
; be rewritten: a start.loop.iterations call with a count of 50, a counter phi
; fed by it, a loop.decrement.reg by 1, and a compare-against-zero feeding the
; back-edge branch.
39 ; CHECK-LABEL: test_target_specific
40 ; CHECK: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 50)
41 ; CHECK: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
42 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
43 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
44 ; CHECK: br i1 [[CMP]], label %loop, label %exit
46 define i32 @test_target_specific(i32* %a, i32* %b) {
50 %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
51 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
52 %addr.a = getelementptr i32, i32* %a, i32 %count
53 %addr.b = getelementptr i32, i32* %b, i32 %count
54 %load.a = load i32, i32* %addr.a
55 %load.b = load i32, i32* %addr.b
56 %res = call i32 @llvm.arm.smlad(i32 %load.a, i32 %load.b, i32 %acc)
; The counter starts at 0 and advances by 2 until it reaches 100, which is
; where the expected iteration count of 50 comes from.
57 %count.next = add nuw i32 %count, 2
58 %cmp = icmp ne i32 %count.next, 100
59 br i1 %cmp, label %loop, label %exit
; test_fabs_f16: each iteration loads a half from %a, applies @llvm.fabs.f16
; and stores the result to %b; the counter runs from 0 to 100 in steps of 1.
; Expectations per invocation:
;   - MAIN (no extra features) and MVE (+mve): no start.loop.iterations call.
;   - FP (+fullfp16) and MVEFP (+mve.fp): start.loop.iterations with count 100.
64 ; CHECK-LABEL: test_fabs_f16
65 ; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
66 ; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
67 ; CHECK-FP: call i32 @llvm.start.loop.iterations.i32(i32 100)
68 ; CHECK-MVEFP: call i32 @llvm.start.loop.iterations.i32(i32 100)
69 define void @test_fabs_f16(half* %a, half* %b) {
73 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
74 %addr.a = getelementptr half, half* %a, i32 %count
75 %load.a = load half, half* %addr.a
76 %abs = call half @llvm.fabs.f16(half %load.a)
77 %addr.b = getelementptr half, half* %b, i32 %count
78 store half %abs, half *%addr.b
79 %count.next = add nuw i32 %count, 1
80 %cmp = icmp ne i32 %count.next, 100
81 br i1 %cmp, label %loop, label %exit
; test_fabs: each iteration loads a float from %a, applies @llvm.fabs.f32 and
; accumulates the result with fadd; the counter runs from 0 to 100 in steps
; of 1.
; Expectations per invocation:
;   - MAIN and MVE: no start.loop.iterations call.
;   - FP and MVEFP: start.loop.iterations with count 100.
86 ; CHECK-LABEL: test_fabs
87 ; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
88 ; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
89 ; CHECK-FP: call i32 @llvm.start.loop.iterations.i32(i32 100)
90 ; CHECK-MVEFP: call i32 @llvm.start.loop.iterations.i32(i32 100)
92 define float @test_fabs(float* %a) {
96 %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
97 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
98 %addr.a = getelementptr float, float* %a, i32 %count
99 %load.a = load float, float* %addr.a
100 %abs = call float @llvm.fabs.f32(float %load.a)
101 %res = fadd float %abs, %acc
102 %count.next = add nuw i32 %count, 1
103 %cmp = icmp ne i32 %count.next, 100
104 br i1 %cmp, label %loop, label %exit
; test_fabs_64: each iteration loads a double from %a, applies @llvm.fabs.f64
; and stores the result to %b; the counter runs from 0 to 100 in steps of 1.
; Expectations per invocation:
;   - MAIN, MVE, FP and MVEFP: no start.loop.iterations call.
;   - FP64 (+fp-armv8,+fullfp16): start.loop.iterations with count 100.
109 ; CHECK-LABEL: test_fabs_64
110 ; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
111 ; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
112 ; CHECK-FP-NOT: call i32 @llvm.start.loop.iterations.i32(i32 100)
113 ; CHECK-FP64: call i32 @llvm.start.loop.iterations.i32(i32 100)
114 ; CHECK-MVEFP-NOT: call i32 @llvm.start.loop.iterations.i32(i32 100)
115 define void @test_fabs_64(double* %a, double* %b) {
119 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
120 %addr.a = getelementptr double, double* %a, i32 %count
121 %load.a = load double, double* %addr.a
122 %abs = call double @llvm.fabs.f64(double %load.a)
123 %addr.b = getelementptr double, double* %b, i32 %count
124 store double %abs, double *%addr.b
125 %count.next = add nuw i32 %count, 1
126 %cmp = icmp ne i32 %count.next, 100
127 br i1 %cmp, label %loop, label %exit
; test_fabs_vec: each iteration loads a <4 x float> from %a, applies
; @llvm.fabs.v4f32 and accumulates the result with a vector fadd; the counter
; runs from 0 to 100 in steps of 1.
; Expectations per invocation:
;   - MVE (+mve): no start.loop.iterations call.
;   - MVEFP (+mve.fp): the full rewritten form, i.e. start.loop.iterations with
;     count 100, a counter phi, loop.decrement.reg by 1, compare against zero
;     and the back-edge branch.
; The other invocations have no directive of their own for this function.
132 ; CHECK-LABEL: test_fabs_vec
133 ; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
134 ; CHECK-MVEFP: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
135 ; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
136 ; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
137 ; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
138 ; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
139 define <4 x float> @test_fabs_vec(<4 x float>* %a) {
143 %acc = phi <4 x float> [ zeroinitializer, %entry ], [ %res, %loop ]
144 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
145 %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
146 %load.a = load <4 x float>, <4 x float>* %addr.a
147 %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %load.a)
148 %res = fadd <4 x float> %abs, %acc
149 %count.next = add nuw i32 %count, 1
150 %cmp = icmp ne i32 %count.next, 100
151 br i1 %cmp, label %loop, label %exit
; test_log: each iteration loads a float from %a, applies @llvm.log.f32 and
; accumulates the result with fadd; the counter runs from 0 to 100 in steps
; of 1. For every invocation using the common prefix, the output after this
; label must contain no start.loop.iterations call and no mention of
; llvm.loop.decrement at all.
156 ; CHECK-LABEL: test_log
157 ; CHECK-NOT: call i32 @llvm.start.loop.iterations
158 ; CHECK-NOT: llvm.loop.decrement
159 define float @test_log(float* %a) {
163 %acc = phi float [ 0.0, %entry ], [ %res, %loop ]
164 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
165 %addr.a = getelementptr float, float* %a, i32 %count
166 %load.a = load float, float* %addr.a
167 %abs = call float @llvm.log.f32(float %load.a)
168 %res = fadd float %abs, %acc
169 %count.next = add nuw i32 %count, 1
170 %cmp = icmp ne i32 %count.next, 100
171 br i1 %cmp, label %loop, label %exit
; test_sqrt_16: each iteration loads a half from %a, applies @llvm.sqrt.f16
; and stores the result to %b; the counter runs from 0 to 100 in steps of 1.
; Expectations per invocation:
;   - MAIN and MVE: no start.loop.iterations call.
;   - FP, MVEFP and FP64: start.loop.iterations with count 100.
176 ; CHECK-LABEL: test_sqrt_16
177 ; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
178 ; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
179 ; CHECK-FP: call i32 @llvm.start.loop.iterations.i32(i32 100)
180 ; CHECK-MVEFP: call i32 @llvm.start.loop.iterations.i32(i32 100)
181 ; CHECK-FP64: call i32 @llvm.start.loop.iterations.i32(i32 100)
182 define void @test_sqrt_16(half* %a, half* %b) {
186 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
187 %addr.a = getelementptr half, half* %a, i32 %count
188 %load.a = load half, half* %addr.a
189 %sqrt = call half @llvm.sqrt.f16(half %load.a)
190 %addr.b = getelementptr half, half* %b, i32 %count
191 store half %sqrt, half *%addr.b
192 %count.next = add nuw i32 %count, 1
193 %cmp = icmp ne i32 %count.next, 100
194 br i1 %cmp, label %loop, label %exit
; test_sqrt: each iteration loads a float from %a, applies @llvm.sqrt.f32 and
; stores the result to %b; the counter runs from 0 to 100 in steps of 1.
; Expectations per invocation:
;   - MAIN and MVE: no start.loop.iterations call.
;   - FP: a start.loop.iterations call (the count operand is not pinned).
;   - MVEFP: the full rewritten form, i.e. start.loop.iterations with count
;     100, a counter phi, loop.decrement.reg by 1, compare against zero and
;     the back-edge branch.
198 ; CHECK-LABEL: test_sqrt
199 ; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
200 ; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
201 ; CHECK-FP: call i32 @llvm.start.loop.iterations
202 ; CHECK-MVEFP: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
203 ; CHECK-MVEFP: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
204 ; CHECK-MVEFP: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
205 ; CHECK-MVEFP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
206 ; CHECK-MVEFP: br i1 [[CMP]], label %loop, label %exit
207 define void @test_sqrt(float* %a, float* %b) {
211 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
212 %addr.a = getelementptr float, float* %a, i32 %count
213 %load.a = load float, float* %addr.a
214 %sqrt = call float @llvm.sqrt.f32(float %load.a)
215 %addr.b = getelementptr float, float* %b, i32 %count
216 store float %sqrt, float* %addr.b
217 %count.next = add nuw i32 %count, 1
218 %cmp = icmp ne i32 %count.next, 100
219 br i1 %cmp, label %loop, label %exit
; test_sqrt_64: each iteration loads a double from %a, applies @llvm.sqrt.f64
; and stores the result to %b; the counter runs from 0 to 100 in steps of 1.
; Expectations per invocation:
;   - MAIN, MVE, FP and MVEFP: no start.loop.iterations call.
;   - FP64 (+fp-armv8,+fullfp16): start.loop.iterations with count 100.
224 ; CHECK-LABEL: test_sqrt_64
225 ; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
226 ; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
227 ; CHECK-FP-NOT: call i32 @llvm.start.loop.iterations.i32(i32 100)
228 ; CHECK-MVEFP-NOT: call i32 @llvm.start.loop.iterations.i32(i32 100)
229 ; CHECK-FP64: call i32 @llvm.start.loop.iterations.i32(i32 100)
230 define void @test_sqrt_64(double* %a, double* %b) {
234 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
235 %addr.a = getelementptr double, double* %a, i32 %count
236 %load.a = load double, double* %addr.a
237 %sqrt = call double @llvm.sqrt.f64(double %load.a)
238 %addr.b = getelementptr double, double* %b, i32 %count
239 store double %sqrt, double *%addr.b
240 %count.next = add nuw i32 %count, 1
241 %cmp = icmp ne i32 %count.next, 100
242 br i1 %cmp, label %loop, label %exit
; test_sqrt_vec: each iteration loads a <4 x float> from %a, applies
; @llvm.sqrt.v4f32 and stores the result to %b; the counter runs from 0 to 100
; in steps of 1.
; Expectations per invocation:
;   - MAIN and MVE: no start.loop.iterations call.
;   - FP and MVEFP: start.loop.iterations with count 100.
247 ; CHECK-LABEL: test_sqrt_vec
248 ; CHECK-MAIN-NOT: call i32 @llvm.start.loop.iterations
249 ; CHECK-MVE-NOT: call i32 @llvm.start.loop.iterations
250 ; CHECK-FP: call i32 @llvm.start.loop.iterations.i32(i32 100)
251 ; CHECK-MVEFP: call i32 @llvm.start.loop.iterations.i32(i32 100)
252 define void @test_sqrt_vec(<4 x float>* %a, <4 x float>* %b) {
256 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
257 %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
258 %load.a = load <4 x float>, <4 x float>* %addr.a
259 %sqrt = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %load.a)
260 %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
261 store <4 x float> %sqrt, <4 x float>* %addr.b
262 %count.next = add nuw i32 %count, 1
263 %cmp = icmp ne i32 %count.next, 100
264 br i1 %cmp, label %loop, label %exit
; test_overflow: each iteration loads an i32 from %a and from %b and combines
; them with @llvm.sadd.with.overflow.i32; the counter runs from 0 to 100 in
; steps of 1. Every invocation using the common prefix expects a
; start.loop.iterations call (the count operand is not pinned).
269 ; CHECK-LABEL: test_overflow
270 ; CHECK: call i32 @llvm.start.loop.iterations
271 define i32 @test_overflow(i32* %a, i32* %b) {
275 %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
276 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
277 %addr.a = getelementptr i32, i32* %a, i32 %count
278 %addr.b = getelementptr i32, i32* %b, i32 %count
279 %load.a = load i32, i32* %addr.a
280 %load.b = load i32, i32* %addr.b
281 %sadd = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %load.a, i32 %load.b)
; Only element 0 of the returned pair (the i32 sum) is extracted here.
282 %res = extractvalue {i32, i1} %sadd, 0
283 %count.next = add nuw i32 %count, 1
284 %cmp = icmp ne i32 %count.next, 100
285 br i1 %cmp, label %loop, label %exit
; test_sat: each iteration loads an i32 from %a and from %b and combines them
; with @llvm.sadd.sat.i32; the counter runs from 0 to 100 in steps of 1. Every
; invocation using the common prefix expects start.loop.iterations with
; count 100.
290 ; TODO: We should be able to generate a qadd/sub
291 ; CHECK-LABEL: test_sat
292 ; CHECK: call i32 @llvm.start.loop.iterations.i32(i32 100)
293 define i32 @test_sat(i32* %a, i32* %b) {
297 %acc = phi i32 [ 0, %entry ], [ %res, %loop ]
298 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
299 %addr.a = getelementptr i32, i32* %a, i32 %count
300 %addr.b = getelementptr i32, i32* %b, i32 %count
301 %load.a = load i32, i32* %addr.a
302 %load.b = load i32, i32* %addr.b
303 %res = call i32 @llvm.sadd.sat.i32(i32 %load.a, i32 %load.b)
304 %count.next = add nuw i32 %count, 1
305 %cmp = icmp ne i32 %count.next, 100
306 br i1 %cmp, label %loop, label %exit
; test_masked_i32: each iteration performs two masked loads of <4 x i32>
; (from %a and %b), adds them and writes the sum to %c with a masked store;
; the counter runs from 0 to 100 in steps of 1.
; Expectations per invocation:
;   - MVEFP: a start.loop.iterations call.
;   - MVE: the full rewritten form, i.e. start.loop.iterations with count 100,
;     a counter phi, loop.decrement.reg by 1, compare against zero and the
;     back-edge branch.
;   - All invocations using the common prefix also carry the negative check
;     for start.loop.iterations listed first.
; NOTE(review): in the MVE and MVEFP invocations that negative check sits
; directly before a positive match for the same call, so it presumably only
; constrains the text ahead of that match; it fully applies only to the
; invocations without MVE. Confirm against the FileCheck documentation.
311 ; CHECK-LABEL: test_masked_i32
312 ; CHECK-NOT: call i32 @llvm.start.loop.iterations
313 ; CHECK-MVEFP: call i32 @llvm.start.loop.iterations
314 ; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
315 ; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
316 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
317 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
318 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
319 define arm_aapcs_vfpcc void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) {
323 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
324 %addr.a = getelementptr <4 x i32>, <4 x i32>* %a, i32 %count
325 %addr.b = getelementptr <4 x i32>, <4 x i32>* %b, i32 %count
326 %addr.c = getelementptr <4 x i32>, <4 x i32>* %c, i32 %count
327 %load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
328 %load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
329 %res = add <4 x i32> %load.a, %load.b
330 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %res, <4 x i32>* %addr.c, i32 4, <4 x i1> %mask)
331 %count.next = add nuw i32 %count, 1
332 %cmp = icmp ne i32 %count.next, 100
333 br i1 %cmp, label %loop, label %exit
; test_masked_f32: each iteration performs two masked loads of <4 x float>
; (from %a and %b), adds them with fadd and writes the sum to %c with a masked
; store; the counter runs from 0 to 100 in steps of 1.
; Expectations per invocation:
;   - MVEFP: a start.loop.iterations call.
;   - MVE: the full rewritten form, i.e. start.loop.iterations with count 100,
;     a counter phi, loop.decrement.reg by 1, compare against zero and the
;     back-edge branch.
;   - All invocations using the common prefix also carry the negative check
;     for start.loop.iterations listed first.
; NOTE(review): in the MVE and MVEFP invocations that negative check sits
; directly before a positive match for the same call, so it presumably only
; constrains the text ahead of that match; it fully applies only to the
; invocations without MVE. Confirm against the FileCheck documentation.
338 ; CHECK-LABEL: test_masked_f32
339 ; CHECK-NOT: call i32 @llvm.start.loop.iterations
340 ; CHECK-MVEFP: call i32 @llvm.start.loop.iterations
341 ; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
342 ; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
343 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
344 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
345 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
346 define arm_aapcs_vfpcc void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* %c, <4 x float> %passthru) {
350 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
351 %addr.a = getelementptr <4 x float>, <4 x float>* %a, i32 %count
352 %addr.b = getelementptr <4 x float>, <4 x float>* %b, i32 %count
353 %addr.c = getelementptr <4 x float>, <4 x float>* %c, i32 %count
354 %load.a = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
355 %load.b = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr.b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
356 %res = fadd <4 x float> %load.a, %load.b
357 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %res, <4 x float>* %addr.c, i32 4, <4 x i1> %mask)
358 %count.next = add nuw i32 %count, 1
359 %cmp = icmp ne i32 %count.next, 100
360 br i1 %cmp, label %loop, label %exit
; test_gather_scatter: each iteration gathers two <4 x float> values through
; the pointer vectors %a and %b, adds them with fadd and scatters the sum
; through the pointer vector %c; the counter runs from 0 to 100 in steps of 1.
; The pointer vectors are function arguments and are not indexed by the
; counter.
; Expectations per invocation:
;   - MVEFP: a start.loop.iterations call.
;   - MVE: the full rewritten form, i.e. start.loop.iterations with count 100,
;     a counter phi, loop.decrement.reg by 1, compare against zero and the
;     back-edge branch.
;   - All invocations using the common prefix also carry the negative check
;     for start.loop.iterations listed first.
; NOTE(review): in the MVE and MVEFP invocations that negative check sits
; directly before a positive match for the same call, so it presumably only
; constrains the text ahead of that match; it fully applies only to the
; invocations without MVE. Confirm against the FileCheck documentation.
365 ; CHECK-LABEL: test_gather_scatter
366 ; CHECK-NOT: call i32 @llvm.start.loop.iterations
367 ; CHECK-MVEFP: call i32 @llvm.start.loop.iterations
368 ; CHECK-MVE: [[X:%[^ ]+]] = call i32 @llvm.start.loop.iterations.i32(i32 100)
369 ; CHECK-MVE: [[COUNT:%[^ ]+]] = phi i32 [ [[X]], %entry ], [ [[LOOP_DEC:%[^ ]+]], %loop ]
370 ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[COUNT]], i32 1)
371 ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
372 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit
373 define arm_aapcs_vfpcc void @test_gather_scatter(<4 x i1> %mask, <4 x float*> %a, <4 x float*> %b, <4 x float*> %c, <4 x float> %passthru) {
377 %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
378 %load.a = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %a, i32 4, <4 x i1> %mask, <4 x float> %passthru)
379 %load.b = call <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*> %b, i32 4, <4 x i1> %mask, <4 x float> %passthru)
380 %res = fadd <4 x float> %load.a, %load.b
381 call void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float> %res, <4 x float*> %c, i32 4, <4 x i1> %mask)
382 %count.next = add nuw i32 %count, 1
383 %cmp = icmp ne i32 %count.next, 100
384 br i1 %cmp, label %loop, label %exit
; External declarations referenced by the test functions in this file.
; @bar is the only non-intrinsic callee; it is declared variadic and is called
; through a bitcast in skip_call.
389 declare i32 @bar(...) local_unnamed_addr #1
; Target-specific ARM intrinsic.
390 declare i32 @llvm.arm.smlad(i32, i32, i32)
; Floating-point math intrinsics (scalar half/float/double and <4 x float>).
391 declare half @llvm.fabs.f16(half)
392 declare float @llvm.fabs.f32(float)
393 declare double @llvm.fabs.f64(double)
394 declare float @llvm.log.f32(float)
395 declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
396 declare half @llvm.sqrt.f16(half)
397 declare float @llvm.sqrt.f32(float)
398 declare double @llvm.sqrt.f64(double)
399 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
; Saturating and overflow-reporting signed add on i32.
400 declare i32 @llvm.sadd.sat.i32(i32, i32)
401 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
; Masked vector memory intrinsics: load/store and gather/scatter.
402 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
403 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
404 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
405 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
406 declare <4 x float> @llvm.masked.gather.v4f32.p0v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
407 declare void @llvm.masked.scatter.v4f32.p0v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)