; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
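
; The rounding intrinsics below (round, rint, trunc, ceil and floor) are each
; expected to select to a single MVE VRINT instruction (vrinta, vrintx,
; vrintz, vrintp and vrintm respectively), so the checks look for the loops
; to be tail predicated with dlstp.32/letp.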
define arm_aapcs_vfpcc void @round(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: round:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrinta.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @rint(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: rint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintx.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @trunc(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: trunc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintz.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @ceil(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: ceil:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintp.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @floor(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: floor:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintm.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; nearbyint shouldn't be tail predicated because it's lowered into multiple instructions
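; (MVE has no vector form of VRINTR, so the checks below expect the operation
; to be scalarised into one vrintr.f32 per lane.)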
define arm_aapcs_vfpcc void @nearbyint(ptr noalias nocapture readonly %pSrcA, ptr noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: nearbyint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintr.f32 s3, s3
; CHECK-NEXT:    vrintr.f32 s2, s2
; CHECK-NEXT:    vrintr.f32 s1, s1
; CHECK-NEXT:    vrintr.f32 s0, s0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, ptr %pSrcA, i32 %index
  %next.gep14 = getelementptr float, ptr %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %next.gep, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %0 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.masked.load)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %next.gep14, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1

declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>) #2

declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3

declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3

declare <4 x float> @llvm.round.v4f32(<4 x float>) #3

declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3

declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3

declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #1

declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>) #4