1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
; vaddq: vctp32-masked load / add / store loop whose second addend is a splat
; of the scalar %z. The expected code is a dlstp/letp tail-predicated loop in
; which the splat is folded into the scalar-operand form of vadd.i32 (r3).
define void @vaddq(ptr %x, ptr %y, i32 %n, i32 %z) {
; CHECK-LABEL: vaddq:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB0_1: @ %for.body.preheader
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB0_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vadd.i32 q0, q0, r3
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  %2 = add <4 x i32> %1, %.splat
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  %sub = add nsw i32 %i.012, -4
  %cmp = icmp sgt i32 %i.012, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; vadd: in-place loop using the predicated MVE add intrinsic with a splat of
; %c0 as the second operand. The expected code is a dlstp/letp loop with the
; splat folded into the scalar-operand form of vadd.i32 (r1).
define void @vadd(ptr %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vadd:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB1_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB1_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vadd.i32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %2 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %1)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}
; vsubq: vctp32-masked load / sub / store loop subtracting a splat of %z.
; The expected code is a dlstp/letp tail-predicated loop with the splat folded
; into the scalar-operand form of vsub.i32 (r3).
define void @vsubq(ptr %x, ptr %y, i32 %n, i32 %z) {
; CHECK-LABEL: vsubq:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB2_1: @ %for.body.preheader
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB2_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vsub.i32 q0, q0, r3
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  %2 = sub <4 x i32> %1, %.splat
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  %sub = add nsw i32 %i.012, -4
  %cmp = icmp sgt i32 %i.012, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; vsub: in-place loop using the predicated MVE sub intrinsic with a splat of
; %c0 as the subtrahend. The expected code is a dlstp/letp loop with the splat
; folded into the scalar-operand form of vsub.i32 (r1).
define void @vsub(ptr %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vsub:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB3_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB3_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vsub.i32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %2 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %1)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}
; vmulq: vctp32-masked load / mul / store loop multiplying by a splat of %z.
; The expected code is a dlstp/letp tail-predicated loop with the splat folded
; into the scalar-operand form of vmul.i32 (r3).
define void @vmulq(ptr %x, ptr %y, i32 %n, i32 %z) {
; CHECK-LABEL: vmulq:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB4_1: @ %for.body.preheader
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB4_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vmul.i32 q0, q0, r3
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  %2 = mul <4 x i32> %1, %.splat
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  %sub = add nsw i32 %i.012, -4
  %cmp = icmp sgt i32 %i.012, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; vmul: in-place loop using the predicated MVE mul intrinsic with a splat of
; %c0 as the multiplier. The expected code is a dlstp/letp loop with the splat
; folded into the scalar-operand form of vmul.i32 (r1).
define void @vmul(ptr %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vmul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB5_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB5_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmul.i32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %1)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}
; vqaddq: vctp32-masked loop applying llvm.sadd.sat with a splat of %z.
; The expected code is a dlstp/letp tail-predicated loop with the splat folded
; into the scalar-operand form of vqadd.s32 (r3).
define void @vqaddq(ptr %x, ptr %y, i32 %n, i32 %z) {
; CHECK-LABEL: vqaddq:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB6_1: @ %for.body.preheader
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB6_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vqadd.s32 q0, q0, r3
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB6_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  %2 = tail call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %1, <4 x i32> %.splat)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  %sub = add nsw i32 %i.012, -4
  %cmp = icmp sgt i32 %i.012, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; vqaddqu: vctp32-masked loop applying llvm.uadd.sat with a splat of %z.
; The expected code is a dlstp/letp tail-predicated loop with the splat folded
; into the scalar-operand form of vqadd.u32 (r3).
define void @vqaddqu(ptr %x, ptr %y, i32 %n, i32 %z) {
; CHECK-LABEL: vqaddqu:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB7_1: @ %for.body.preheader
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB7_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vqadd.u32 q0, q0, r3
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB7_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  %2 = tail call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %1, <4 x i32> %.splat)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  %sub = add nsw i32 %i.012, -4
  %cmp = icmp sgt i32 %i.012, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; vqadd: in-place loop using the predicated MVE qadd intrinsic with a splat of
; %c0 as the second operand. The expected code is a dlstp/letp loop with the
; splat folded into the scalar-operand form of vqadd.s32 (r1).
define void @vqadd(ptr %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqadd:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB8_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB8_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vqadd.s32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB8_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %2 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %1)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}
; vqsubq: vctp32-masked loop applying llvm.ssub.sat with a splat of %z.
; The expected code is a dlstp/letp tail-predicated loop with the splat folded
; into the scalar-operand form of vqsub.s32 (r3).
define void @vqsubq(ptr %x, ptr %y, i32 %n, i32 %z) {
; CHECK-LABEL: vqsubq:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB9_1: @ %for.body.preheader
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB9_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vqsub.s32 q0, q0, r3
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB9_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  %2 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %1, <4 x i32> %.splat)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  %sub = add nsw i32 %i.012, -4
  %cmp = icmp sgt i32 %i.012, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; vqsubqu: vctp32-masked loop applying llvm.usub.sat with a splat of %z.
; The expected code is a dlstp/letp tail-predicated loop with the splat folded
; into the scalar-operand form of vqsub.u32 (r3).
define void @vqsubqu(ptr %x, ptr %y, i32 %n, i32 %z) {
; CHECK-LABEL: vqsubqu:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB10_1: @ %for.body.preheader
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB10_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vqsub.u32 q0, q0, r3
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB10_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  %2 = tail call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %1, <4 x i32> %.splat)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  %sub = add nsw i32 %i.012, -4
  %cmp = icmp sgt i32 %i.012, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; vqsub: in-place loop using the predicated MVE qsub intrinsic with a splat of
; %c0 as the subtrahend. The expected code is a dlstp/letp loop with the splat
; folded into the scalar-operand form of vqsub.s32 (r1).
define void @vqsub(ptr %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqsub:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB11_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB11_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vqsub.s32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB11_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %2 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %1)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}
; vhaddq: vctp32-masked loop calling the unpredicated MVE vhadd intrinsic with
; a splat of %z. The expected code is a dlstp/letp tail-predicated loop with
; the splat folded into the scalar-operand form of vhadd.s32 (r3).
define void @vhaddq(ptr %x, ptr %y, i32 %n, i32 %z) {
; CHECK-LABEL: vhaddq:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB12_1: @ %for.body.preheader
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB12_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vhadd.s32 q0, q0, r3
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB12_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  %2 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %1, <4 x i32> %.splat, i32 0)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  %sub = add nsw i32 %i.012, -4
  %cmp = icmp sgt i32 %i.012, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; vhadd: in-place loop using the predicated MVE hadd intrinsic with a splat of
; %c0 as the second operand. The expected code is a dlstp/letp loop with the
; splat folded into the scalar-operand form of vhadd.s32 (r1).
define void @vhadd(ptr %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vhadd:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB13_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB13_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vhadd.s32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB13_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %2 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %1)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}
; vhsubq: vctp32-masked loop calling the unpredicated MVE vhsub intrinsic with
; a splat of %z. The expected code is a dlstp/letp tail-predicated loop with
; the splat folded into the scalar-operand form of vhsub.s32 (r3).
define void @vhsubq(ptr %x, ptr %y, i32 %n, i32 %z) {
; CHECK-LABEL: vhsubq:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB14_1: @ %for.body.preheader
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB14_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vhsub.s32 q0, q0, r3
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB14_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  %2 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %1, <4 x i32> %.splat, i32 0)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  %sub = add nsw i32 %i.012, -4
  %cmp = icmp sgt i32 %i.012, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; vhsub: in-place loop using the predicated MVE hsub intrinsic with a splat of
; %c0 as the subtrahend. The expected code is a dlstp/letp loop with the splat
; folded into the scalar-operand form of vhsub.s32 (r1).
define void @vhsub(ptr %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vhsub:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB15_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB15_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vhsub.s32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB15_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %2 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %1)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}
; vqdmullbq: vctp32-masked loop calling the unpredicated MVE vqdmull intrinsic
; (v4i32 -> v2i64) with a splat of %z; the result is bitcast to <4 x i32> for
; the masked store. The expected code is a dlstp/letp loop using the
; scalar-operand form of vqdmullb.s32 (r3), with the result in a separate
; register (q1) from the loaded input (q0).
define void @vqdmullbq(ptr %x, ptr %y, i32 %n, i32 %z) {
; CHECK-LABEL: vqdmullbq:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB16_1: @ %for.body.preheader
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB16_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vqdmullb.s32 q1, q0, r3
; CHECK-NEXT:    vstrw.32 q1, [r1], #16
; CHECK-NEXT:    letp lr, .LBB16_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
  %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
  %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
  %2 = tail call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> %1, <4 x i32> %.splat, i32 0)
  %3 = bitcast <2 x i64> %2 to <4 x i32>
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %y.addr.013, i32 4, <4 x i1> %0)
  %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
  %sub = add nsw i32 %i.012, -4
  %cmp = icmp sgt i32 %i.012, 4
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
; vqdmull: in-place loop that loads <4 x i16> under a vctp32 mask, sign-extends
; to <4 x i32>, and feeds the predicated MVE vqdmull intrinsic (v8i16 -> v4i32)
; with an <8 x i16> splat of the truncated %c0. The expected code is a
; dlstp/letp loop with a widening vldrh.s32 load and the scalar-operand form of
; vqdmullb.s16 (r1).
define void @vqdmull(ptr %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqdmull:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB17_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB17_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.s32 q0, [r0]
; CHECK-NEXT:    vqdmullb.s16 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB17_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %conv = trunc i32 %c0 to i16
  %.splatinsert = insertelement <8 x i16> undef, i16 %conv, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = tail call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %s1.addr.013, i32 2, <4 x i1> %0, <4 x i16> zeroinitializer)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast <4 x i32> %2 to <8 x i16>
  %4 = tail call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %3, <8 x i16> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %4, ptr %s1.addr.013, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}
; vqdmulhq: loop that reads <4 x i32> vectors from %x and writes results to %y,
; with the lane mask produced by vctp32 on the remaining count. The splat of %z
; is built once in the entry block and passed to the (unpredicated) vqdmulh
; intrinsic together with the masked-loaded data. The expected assembly is a
; dlstp.32/letp loop using the scalar-operand form vqdmulh.s32 q0, q0, r3.
773 define void @vqdmulhq(ptr %x, ptr %y, i32 %n, i32 %z) {
774 ; CHECK-LABEL: vqdmulhq:
775 ; CHECK: @ %bb.0: @ %entry
776 ; CHECK-NEXT: .save {r7, lr}
777 ; CHECK-NEXT: push {r7, lr}
778 ; CHECK-NEXT: cmp r2, #1
780 ; CHECK-NEXT: poplt {r7, pc}
781 ; CHECK-NEXT: .LBB18_1: @ %for.body.preheader
782 ; CHECK-NEXT: dlstp.32 lr, r2
783 ; CHECK-NEXT: .LBB18_2: @ %for.body
784 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
785 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
786 ; CHECK-NEXT: vqdmulh.s32 q0, q0, r3
787 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
788 ; CHECK-NEXT: letp lr, .LBB18_2
789 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
790 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes before the loop.
792 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
793 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
794 %cmp11 = icmp sgt i32 %n, 0
795 br i1 %cmp11, label %for.body, label %for.cond.cleanup
797 for.cond.cleanup: ; preds = %for.body, %entry
800 for.body: ; preds = %entry, %for.body
801 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
802 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
803 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane mask for this iteration, computed from the elements still to process.
804 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
805 %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
806 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
807 %2 = tail call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> %1, <4 x i32> %.splat)
808 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
809 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
; Count down by four elements per iteration; loop while more than four remained.
810 %sub = add nsw i32 %i.012, -4
811 %cmp = icmp sgt i32 %i.012, 4
812 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vqdmulh: in-place loop over %s1 whose lane mask comes from vctp32 on the
; remaining element count. Each iteration masked-loads <4 x i32>, calls the
; predicated qdmulh intrinsic with a splat of %c0 (passing the loaded vector
; again as the final operand), and masked-stores the result to the same
; address. The expected assembly is a dlstp.32/letp loop using the
; scalar-operand form vqdmulh.s32 q0, q0, r1.
815 define void @vqdmulh(ptr %s1, i32 %c0, i32 %N) {
816 ; CHECK-LABEL: vqdmulh:
817 ; CHECK: @ %bb.0: @ %entry
818 ; CHECK-NEXT: .save {r7, lr}
819 ; CHECK-NEXT: push {r7, lr}
820 ; CHECK-NEXT: cmp r2, #1
822 ; CHECK-NEXT: poplt {r7, pc}
823 ; CHECK-NEXT: .LBB19_1: @ %while.body.lr.ph
824 ; CHECK-NEXT: dlstp.32 lr, r2
825 ; CHECK-NEXT: .LBB19_2: @ %while.body
826 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
827 ; CHECK-NEXT: vldrw.u32 q0, [r0]
828 ; CHECK-NEXT: vqdmulh.s32 q0, q0, r1
829 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
830 ; CHECK-NEXT: letp lr, .LBB19_2
831 ; CHECK-NEXT: @ %bb.3: @ %while.end
832 ; CHECK-NEXT: pop {r7, pc}
834 %cmp11 = icmp sgt i32 %N, 0
835 br i1 %cmp11, label %while.body.lr.ph, label %while.end
837 while.body.lr.ph: ; preds = %entry
; Splat of the scalar operand. The placeholder vectors are poison rather than
; the deprecated undef, matching the poison-based splats used elsewhere in
; this file; only lane 0 is ever read by the shuffle mask.
838 %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0
839 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
842 while.body: ; preds = %while.body.lr.ph, %while.body
843 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
844 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane mask for this iteration, computed from the elements still to process.
845 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
846 %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
847 %2 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %1)
848 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
; Advance by four i32 elements and count down by four per iteration.
849 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
850 %sub = add nsw i32 %N.addr.012, -4
851 %cmp = icmp sgt i32 %N.addr.012, 4
852 br i1 %cmp, label %while.body, label %while.end
854 while.end: ; preds = %while.body, %entry
; vqrdmulhq: loop that reads <4 x i32> vectors from %x and writes results to
; %y, with the lane mask produced by vctp32 on the remaining count. The splat
; of %z is built once in the entry block and passed to the (unpredicated)
; vqrdmulh intrinsic together with the masked-loaded data. The expected
; assembly is a dlstp.32/letp loop using the scalar-operand form
; vqrdmulh.s32 q0, q0, r3.
858 define void @vqrdmulhq(ptr %x, ptr %y, i32 %n, i32 %z) {
859 ; CHECK-LABEL: vqrdmulhq:
860 ; CHECK: @ %bb.0: @ %entry
861 ; CHECK-NEXT: .save {r7, lr}
862 ; CHECK-NEXT: push {r7, lr}
863 ; CHECK-NEXT: cmp r2, #1
865 ; CHECK-NEXT: poplt {r7, pc}
866 ; CHECK-NEXT: .LBB20_1: @ %for.body.preheader
867 ; CHECK-NEXT: dlstp.32 lr, r2
868 ; CHECK-NEXT: .LBB20_2: @ %for.body
869 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
870 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
871 ; CHECK-NEXT: vqrdmulh.s32 q0, q0, r3
872 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
873 ; CHECK-NEXT: letp lr, .LBB20_2
874 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
875 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes before the loop.
877 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
878 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
879 %cmp11 = icmp sgt i32 %n, 0
880 br i1 %cmp11, label %for.body, label %for.cond.cleanup
882 for.cond.cleanup: ; preds = %for.body, %entry
885 for.body: ; preds = %entry, %for.body
886 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
887 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
888 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane mask for this iteration, computed from the elements still to process.
889 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
890 %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
891 %add.ptr = getelementptr inbounds i32, ptr %x.addr.014, i32 4
892 %2 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %1, <4 x i32> %.splat)
893 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
894 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.013, i32 4
; Count down by four elements per iteration; loop while more than four remained.
895 %sub = add nsw i32 %i.012, -4
896 %cmp = icmp sgt i32 %i.012, 4
897 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vqrdmulh: in-place loop over %s1 whose lane mask comes from vctp32 on the
; remaining element count. Each iteration masked-loads <4 x i32>, calls the
; predicated qrdmulh intrinsic with a splat of %c0 (passing the loaded vector
; again as the final operand), and masked-stores the result to the same
; address. The expected assembly is a dlstp.32/letp loop using the
; scalar-operand form vqrdmulh.s32 q0, q0, r1.
900 define void @vqrdmulh(ptr %s1, i32 %c0, i32 %N) {
901 ; CHECK-LABEL: vqrdmulh:
902 ; CHECK: @ %bb.0: @ %entry
903 ; CHECK-NEXT: .save {r7, lr}
904 ; CHECK-NEXT: push {r7, lr}
905 ; CHECK-NEXT: cmp r2, #1
907 ; CHECK-NEXT: poplt {r7, pc}
908 ; CHECK-NEXT: .LBB21_1: @ %while.body.lr.ph
909 ; CHECK-NEXT: dlstp.32 lr, r2
910 ; CHECK-NEXT: .LBB21_2: @ %while.body
911 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
912 ; CHECK-NEXT: vldrw.u32 q0, [r0]
913 ; CHECK-NEXT: vqrdmulh.s32 q0, q0, r1
914 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
915 ; CHECK-NEXT: letp lr, .LBB21_2
916 ; CHECK-NEXT: @ %bb.3: @ %while.end
917 ; CHECK-NEXT: pop {r7, pc}
919 %cmp11 = icmp sgt i32 %N, 0
920 br i1 %cmp11, label %while.body.lr.ph, label %while.end
922 while.body.lr.ph: ; preds = %entry
; Splat of the scalar operand. The placeholder vectors are poison rather than
; the deprecated undef, matching the poison-based splats used elsewhere in
; this file; only lane 0 is ever read by the shuffle mask.
923 %.splatinsert = insertelement <4 x i32> poison, i32 %c0, i32 0
924 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
927 while.body: ; preds = %while.body.lr.ph, %while.body
928 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
929 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane mask for this iteration, computed from the elements still to process.
930 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
931 %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
932 %2 = tail call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %1)
933 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
; Advance by four i32 elements and count down by four per iteration.
934 %add.ptr = getelementptr inbounds i32, ptr %s1.addr.013, i32 4
935 %sub = add nsw i32 %N.addr.012, -4
936 %cmp = icmp sgt i32 %N.addr.012, 4
937 br i1 %cmp, label %while.body, label %while.end
939 while.end: ; preds = %while.body, %entry
; vmlaq: loop that masked-loads <4 x i32> from both %x and %y, computes
; (y * splat(%z)) + x with plain mul/add instructions, and masked-stores the
; result back to %y. The lane mask comes from vctp32 on the remaining count.
; The expected assembly is a dlstp.32/letp loop in which the mul/add pair is
; matched to a single vmla.i32 q1, q0, r3 (q0 loaded from [r1], q1 from [r0]).
943 define void @vmlaq(ptr %x, ptr %y, i32 %n, i32 %z) {
944 ; CHECK-LABEL: vmlaq:
945 ; CHECK: @ %bb.0: @ %entry
946 ; CHECK-NEXT: .save {r7, lr}
947 ; CHECK-NEXT: push {r7, lr}
948 ; CHECK-NEXT: cmp r2, #1
950 ; CHECK-NEXT: poplt {r7, pc}
951 ; CHECK-NEXT: .LBB22_1: @ %for.body.preheader
952 ; CHECK-NEXT: dlstp.32 lr, r2
953 ; CHECK-NEXT: .LBB22_2: @ %for.body
954 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
955 ; CHECK-NEXT: vldrw.u32 q0, [r1]
956 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
957 ; CHECK-NEXT: vmla.i32 q1, q0, r3
958 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
959 ; CHECK-NEXT: letp lr, .LBB22_2
960 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
961 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes before the loop.
963 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
964 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
965 %cmp14 = icmp sgt i32 %n, 0
966 br i1 %cmp14, label %for.body, label %for.cond.cleanup
968 for.cond.cleanup: ; preds = %for.body, %entry
971 for.body: ; preds = %entry, %for.body
972 %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
973 %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
974 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane mask for this iteration, computed from the elements still to process.
975 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
976 %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.017, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
977 %add.ptr = getelementptr inbounds i32, ptr %x.addr.017, i32 4
978 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %y.addr.016, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Multiply the %y data by the splat, then accumulate the %x data.
979 %3 = mul <4 x i32> %2, %.splat
980 %4 = add <4 x i32> %3, %1
981 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %4, ptr %y.addr.016, i32 4, <4 x i1> %0)
982 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.016, i32 4
983 %sub = add nsw i32 %i.015, -4
984 %cmp = icmp sgt i32 %i.015, 4
985 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vmlaqp: same loop shape as a two-input multiply-accumulate, but expressed
; through the predicated vmla.n intrinsic, which receives the %x data, the %y
; data, the scalar %z and the vctp32 lane mask. The result is masked-stored to
; %y. The expected assembly is a dlstp.32/letp loop containing
; vmla.i32 q1, q0, r3 (q0 loaded from [r1], q1 from [r0]).
988 define void @vmlaqp(ptr %x, ptr %y, i32 %n, i32 %z) {
989 ; CHECK-LABEL: vmlaqp:
990 ; CHECK: @ %bb.0: @ %entry
991 ; CHECK-NEXT: .save {r7, lr}
992 ; CHECK-NEXT: push {r7, lr}
993 ; CHECK-NEXT: cmp r2, #1
995 ; CHECK-NEXT: poplt {r7, pc}
996 ; CHECK-NEXT: .LBB23_1: @ %for.body.preheader
997 ; CHECK-NEXT: dlstp.32 lr, r2
998 ; CHECK-NEXT: .LBB23_2: @ %for.body
999 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1000 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1001 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
1002 ; CHECK-NEXT: vmla.i32 q1, q0, r3
1003 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
1004 ; CHECK-NEXT: letp lr, .LBB23_2
1005 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1006 ; CHECK-NEXT: pop {r7, pc}
1008 %cmp15 = icmp sgt i32 %n, 0
1009 br i1 %cmp15, label %for.body, label %for.cond.cleanup
1011 for.cond.cleanup: ; preds = %for.body, %entry
1014 for.body: ; preds = %entry, %for.body
1015 %x.addr.018 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1016 %y.addr.017 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1017 %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane mask for this iteration, computed from the elements still to process.
1018 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016)
1019 %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.018, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
1020 %add.ptr = getelementptr inbounds i32, ptr %x.addr.018, i32 4
1021 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %y.addr.017, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; The scalar %z is passed to the intrinsic directly; no splat is built in IR.
1022 %3 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %2, i32 %z, <4 x i1> %0)
1023 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %y.addr.017, i32 4, <4 x i1> %0)
1024 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.017, i32 4
1025 %sub = add nsw i32 %i.016, -4
1026 %cmp = icmp sgt i32 %i.016, 4
1027 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vmlasq: loop that masked-loads <4 x i32> from both %x and %y, computes
; (y * x) + splat(%z) with plain mul/add instructions, and masked-stores the
; result back to %y. The lane mask comes from vctp32 on the remaining count.
; The expected assembly is a dlstp.32/letp loop in which the mul/add pair is
; matched to a single vmlas.i32 q1, q0, r3 (q0 loaded from [r0], q1 from [r1]).
1030 define void @vmlasq(ptr %x, ptr %y, i32 %n, i32 %z) {
1031 ; CHECK-LABEL: vmlasq:
1032 ; CHECK: @ %bb.0: @ %entry
1033 ; CHECK-NEXT: .save {r7, lr}
1034 ; CHECK-NEXT: push {r7, lr}
1035 ; CHECK-NEXT: cmp r2, #1
1037 ; CHECK-NEXT: poplt {r7, pc}
1038 ; CHECK-NEXT: .LBB24_1: @ %for.body.preheader
1039 ; CHECK-NEXT: dlstp.32 lr, r2
1040 ; CHECK-NEXT: .LBB24_2: @ %for.body
1041 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1042 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1043 ; CHECK-NEXT: vldrw.u32 q1, [r1]
1044 ; CHECK-NEXT: vmlas.i32 q1, q0, r3
1045 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
1046 ; CHECK-NEXT: letp lr, .LBB24_2
1047 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1048 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes before the loop.
1050 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
1051 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
1052 %cmp14 = icmp sgt i32 %n, 0
1053 br i1 %cmp14, label %for.body, label %for.cond.cleanup
1055 for.cond.cleanup: ; preds = %for.body, %entry
1058 for.body: ; preds = %entry, %for.body
1059 %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1060 %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1061 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane mask for this iteration, computed from the elements still to process.
1062 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
1063 %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.017, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
1064 %add.ptr = getelementptr inbounds i32, ptr %x.addr.017, i32 4
1065 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %y.addr.016, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Multiply the two loaded vectors, then add the splat of %z.
1066 %3 = mul <4 x i32> %2, %1
1067 %4 = add <4 x i32> %3, %.splat
1068 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %4, ptr %y.addr.016, i32 4, <4 x i1> %0)
1069 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.016, i32 4
1070 %sub = add nsw i32 %i.015, -4
1071 %cmp = icmp sgt i32 %i.015, 4
1072 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vmlasqp: two-input loop expressed through the predicated vmlas.n intrinsic,
; which receives the %x data, the %y data, the scalar %z and the vctp32 lane
; mask. The result is masked-stored to %y. The expected assembly is a
; dlstp.32/letp loop containing vmlas.i32 q1, q0, r3 (q0 loaded from [r1],
; q1 from [r0]).
1075 define void @vmlasqp(ptr %x, ptr %y, i32 %n, i32 %z) {
1076 ; CHECK-LABEL: vmlasqp:
1077 ; CHECK: @ %bb.0: @ %entry
1078 ; CHECK-NEXT: .save {r7, lr}
1079 ; CHECK-NEXT: push {r7, lr}
1080 ; CHECK-NEXT: cmp r2, #1
1082 ; CHECK-NEXT: poplt {r7, pc}
1083 ; CHECK-NEXT: .LBB25_1: @ %for.body.preheader
1084 ; CHECK-NEXT: dlstp.32 lr, r2
1085 ; CHECK-NEXT: .LBB25_2: @ %for.body
1086 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1087 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1088 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
1089 ; CHECK-NEXT: vmlas.i32 q1, q0, r3
1090 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
1091 ; CHECK-NEXT: letp lr, .LBB25_2
1092 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1093 ; CHECK-NEXT: pop {r7, pc}
1095 %cmp15 = icmp sgt i32 %n, 0
1096 br i1 %cmp15, label %for.body, label %for.cond.cleanup
1098 for.cond.cleanup: ; preds = %for.body, %entry
1101 for.body: ; preds = %entry, %for.body
1102 %x.addr.018 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1103 %y.addr.017 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1104 %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane mask for this iteration, computed from the elements still to process.
1105 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016)
1106 %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.addr.018, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
1107 %add.ptr = getelementptr inbounds i32, ptr %x.addr.018, i32 4
1108 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %y.addr.017, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; The scalar %z is passed to the intrinsic directly; no splat is built in IR.
1109 %3 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %1, <4 x i32> %2, i32 %z, <4 x i1> %0)
1110 tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %y.addr.017, i32 4, <4 x i1> %0)
1111 %add.ptr1 = getelementptr inbounds i32, ptr %y.addr.017, i32 4
1112 %sub = add nsw i32 %i.016, -4
1113 %cmp = icmp sgt i32 %i.016, 4
1114 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vaddqf: loop that masked-loads <4 x float> from %x, adds a splat of %z with
; a fast-math fadd, and masked-stores the sum to %y. The lane mask comes from
; vctp32 on the remaining count. The expected assembly is a dlstp.32/letp loop
; using the scalar-operand form vadd.f32 q0, q0, r3.
1117 define void @vaddqf(ptr %x, ptr %y, i32 %n, float %z) {
1118 ; CHECK-LABEL: vaddqf:
1119 ; CHECK: @ %bb.0: @ %entry
1120 ; CHECK-NEXT: .save {r7, lr}
1121 ; CHECK-NEXT: push {r7, lr}
1122 ; CHECK-NEXT: cmp r2, #1
1124 ; CHECK-NEXT: poplt {r7, pc}
1125 ; CHECK-NEXT: .LBB26_1: @ %for.body.preheader
1126 ; CHECK-NEXT: dlstp.32 lr, r2
1127 ; CHECK-NEXT: .LBB26_2: @ %for.body
1128 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1129 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1130 ; CHECK-NEXT: vadd.f32 q0, q0, r3
1131 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
1132 ; CHECK-NEXT: letp lr, .LBB26_2
1133 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1134 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes before the loop.
1136 %.splatinsert = insertelement <4 x float> poison, float %z, i32 0
1137 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1138 %cmp11 = icmp sgt i32 %n, 0
1139 br i1 %cmp11, label %for.body, label %for.cond.cleanup
1141 for.cond.cleanup: ; preds = %for.body, %entry
1144 for.body: ; preds = %entry, %for.body
1145 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1146 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1147 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane mask for this iteration, computed from the elements still to process.
1148 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
1149 %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1150 %add.ptr = getelementptr inbounds float, ptr %x.addr.014, i32 4
1151 %2 = fadd fast <4 x float> %1, %.splat
1152 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
1153 %add.ptr1 = getelementptr inbounds float, ptr %y.addr.013, i32 4
; Count down by four elements per iteration; loop while more than four remained.
1154 %sub = add nsw i32 %i.012, -4
1155 %cmp = icmp sgt i32 %i.012, 4
1156 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vaddf: in-place loop over %s1 whose lane mask comes from vctp32 on the
; remaining element count. Each iteration masked-loads <4 x float>, calls the
; predicated add intrinsic with a splat of %c0 (passing the loaded vector
; again as the final operand), and masked-stores the result to the same
; address. The expected assembly is a dlstp.32/letp loop using the
; scalar-operand form vadd.f32 q0, q0, r1.
1159 define void @vaddf(ptr %s1, float %c0, i32 %N) {
1160 ; CHECK-LABEL: vaddf:
1161 ; CHECK: @ %bb.0: @ %entry
1162 ; CHECK-NEXT: .save {r7, lr}
1163 ; CHECK-NEXT: push {r7, lr}
1164 ; CHECK-NEXT: cmp r2, #1
1166 ; CHECK-NEXT: poplt {r7, pc}
1167 ; CHECK-NEXT: .LBB27_1: @ %while.body.lr.ph
1168 ; CHECK-NEXT: dlstp.32 lr, r2
1169 ; CHECK-NEXT: .LBB27_2: @ %while.body
1170 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1171 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1172 ; CHECK-NEXT: vadd.f32 q0, q0, r1
1173 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
1174 ; CHECK-NEXT: letp lr, .LBB27_2
1175 ; CHECK-NEXT: @ %bb.3: @ %while.end
1176 ; CHECK-NEXT: pop {r7, pc}
1178 %cmp11 = icmp sgt i32 %N, 0
1179 br i1 %cmp11, label %while.body.lr.ph, label %while.end
1181 while.body.lr.ph: ; preds = %entry
; Splat of the scalar operand. The placeholder vectors are poison rather than
; the deprecated undef, matching the poison-based splats used elsewhere in
; this file; only lane 0 is ever read by the shuffle mask.
1182 %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0
1183 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1184 br label %while.body
1186 while.body: ; preds = %while.body.lr.ph, %while.body
1187 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1188 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane mask for this iteration, computed from the elements still to process.
1189 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
1190 %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1191 %2 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %1, <4 x float> %.splat, <4 x i1> %0, <4 x float> %1)
1192 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
; Advance by four float elements and count down by four per iteration.
1193 %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4
1194 %sub = add nsw i32 %N.addr.012, -4
1195 %cmp = icmp sgt i32 %N.addr.012, 4
1196 br i1 %cmp, label %while.body, label %while.end
1198 while.end: ; preds = %while.body, %entry
; vsubqf: loop that masked-loads <4 x float> from %x, subtracts a splat of %z
; with a fast-math fsub, and masked-stores the difference to %y. The lane mask
; comes from vctp32 on the remaining count. The expected assembly is a
; dlstp.32/letp loop using the scalar-operand form vsub.f32 q0, q0, r3.
1202 define void @vsubqf(ptr %x, ptr %y, i32 %n, float %z) {
1203 ; CHECK-LABEL: vsubqf:
1204 ; CHECK: @ %bb.0: @ %entry
1205 ; CHECK-NEXT: .save {r7, lr}
1206 ; CHECK-NEXT: push {r7, lr}
1207 ; CHECK-NEXT: cmp r2, #1
1209 ; CHECK-NEXT: poplt {r7, pc}
1210 ; CHECK-NEXT: .LBB28_1: @ %for.body.preheader
1211 ; CHECK-NEXT: dlstp.32 lr, r2
1212 ; CHECK-NEXT: .LBB28_2: @ %for.body
1213 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1214 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1215 ; CHECK-NEXT: vsub.f32 q0, q0, r3
1216 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
1217 ; CHECK-NEXT: letp lr, .LBB28_2
1218 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1219 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes before the loop.
1221 %.splatinsert = insertelement <4 x float> poison, float %z, i32 0
1222 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1223 %cmp11 = icmp sgt i32 %n, 0
1224 br i1 %cmp11, label %for.body, label %for.cond.cleanup
1226 for.cond.cleanup: ; preds = %for.body, %entry
1229 for.body: ; preds = %entry, %for.body
1230 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1231 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1232 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane mask for this iteration, computed from the elements still to process.
1233 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
1234 %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1235 %add.ptr = getelementptr inbounds float, ptr %x.addr.014, i32 4
1236 %2 = fsub fast <4 x float> %1, %.splat
1237 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
1238 %add.ptr1 = getelementptr inbounds float, ptr %y.addr.013, i32 4
; Count down by four elements per iteration; loop while more than four remained.
1239 %sub = add nsw i32 %i.012, -4
1240 %cmp = icmp sgt i32 %i.012, 4
1241 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vsubf: in-place loop over %s1 whose lane mask comes from vctp32 on the
; remaining element count. Each iteration masked-loads <4 x float>, calls the
; predicated sub intrinsic with a splat of %c0 (passing the loaded vector
; again as the final operand), and masked-stores the result to the same
; address. The expected assembly is a dlstp.32/letp loop using the
; scalar-operand form vsub.f32 q0, q0, r1.
1244 define void @vsubf(ptr %s1, float %c0, i32 %N) {
1245 ; CHECK-LABEL: vsubf:
1246 ; CHECK: @ %bb.0: @ %entry
1247 ; CHECK-NEXT: .save {r7, lr}
1248 ; CHECK-NEXT: push {r7, lr}
1249 ; CHECK-NEXT: cmp r2, #1
1251 ; CHECK-NEXT: poplt {r7, pc}
1252 ; CHECK-NEXT: .LBB29_1: @ %while.body.lr.ph
1253 ; CHECK-NEXT: dlstp.32 lr, r2
1254 ; CHECK-NEXT: .LBB29_2: @ %while.body
1255 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1256 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1257 ; CHECK-NEXT: vsub.f32 q0, q0, r1
1258 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
1259 ; CHECK-NEXT: letp lr, .LBB29_2
1260 ; CHECK-NEXT: @ %bb.3: @ %while.end
1261 ; CHECK-NEXT: pop {r7, pc}
1263 %cmp11 = icmp sgt i32 %N, 0
1264 br i1 %cmp11, label %while.body.lr.ph, label %while.end
1266 while.body.lr.ph: ; preds = %entry
; Splat of the scalar operand. The placeholder vectors are poison rather than
; the deprecated undef, matching the poison-based splats used elsewhere in
; this file; only lane 0 is ever read by the shuffle mask.
1267 %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0
1268 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1269 br label %while.body
1271 while.body: ; preds = %while.body.lr.ph, %while.body
1272 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1273 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane mask for this iteration, computed from the elements still to process.
1274 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
1275 %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1276 %2 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %1, <4 x float> %.splat, <4 x i1> %0, <4 x float> %1)
1277 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
; Advance by four float elements and count down by four per iteration.
1278 %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4
1279 %sub = add nsw i32 %N.addr.012, -4
1280 %cmp = icmp sgt i32 %N.addr.012, 4
1281 br i1 %cmp, label %while.body, label %while.end
1283 while.end: ; preds = %while.body, %entry
; vmulqf: loop that masked-loads <4 x float> from %x, multiplies by a splat of
; %z with a fast-math fmul, and masked-stores the product to %y. The lane mask
; comes from vctp32 on the remaining count. The expected assembly is a
; dlstp.32/letp loop using the scalar-operand form vmul.f32 q0, q0, r3.
1287 define void @vmulqf(ptr %x, ptr %y, i32 %n, float %z) {
1288 ; CHECK-LABEL: vmulqf:
1289 ; CHECK: @ %bb.0: @ %entry
1290 ; CHECK-NEXT: .save {r7, lr}
1291 ; CHECK-NEXT: push {r7, lr}
1292 ; CHECK-NEXT: cmp r2, #1
1294 ; CHECK-NEXT: poplt {r7, pc}
1295 ; CHECK-NEXT: .LBB30_1: @ %for.body.preheader
1296 ; CHECK-NEXT: dlstp.32 lr, r2
1297 ; CHECK-NEXT: .LBB30_2: @ %for.body
1298 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1299 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1300 ; CHECK-NEXT: vmul.f32 q0, q0, r3
1301 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
1302 ; CHECK-NEXT: letp lr, .LBB30_2
1303 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1304 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes before the loop.
1306 %.splatinsert = insertelement <4 x float> poison, float %z, i32 0
1307 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1308 %cmp11 = icmp sgt i32 %n, 0
1309 br i1 %cmp11, label %for.body, label %for.cond.cleanup
1311 for.cond.cleanup: ; preds = %for.body, %entry
1314 for.body: ; preds = %entry, %for.body
1315 %x.addr.014 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1316 %y.addr.013 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1317 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane mask for this iteration, computed from the elements still to process.
1318 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
1319 %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %x.addr.014, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1320 %add.ptr = getelementptr inbounds float, ptr %x.addr.014, i32 4
1321 %2 = fmul fast <4 x float> %1, %.splat
1322 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %2, ptr %y.addr.013, i32 4, <4 x i1> %0)
1323 %add.ptr1 = getelementptr inbounds float, ptr %y.addr.013, i32 4
; Count down by four elements per iteration; loop while more than four remained.
1324 %sub = add nsw i32 %i.012, -4
1325 %cmp = icmp sgt i32 %i.012, 4
1326 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vmulf: in-place loop over %s1 whose lane mask comes from vctp32 on the
; remaining element count. Each iteration masked-loads <4 x float>, calls the
; predicated mul intrinsic with a splat of %c0 (passing the loaded vector
; again as the final operand), and masked-stores the result to the same
; address. The expected assembly is a dlstp.32/letp loop using the
; scalar-operand form vmul.f32 q0, q0, r1.
1329 define void @vmulf(ptr %s1, float %c0, i32 %N) {
1330 ; CHECK-LABEL: vmulf:
1331 ; CHECK: @ %bb.0: @ %entry
1332 ; CHECK-NEXT: .save {r7, lr}
1333 ; CHECK-NEXT: push {r7, lr}
1334 ; CHECK-NEXT: cmp r2, #1
1336 ; CHECK-NEXT: poplt {r7, pc}
1337 ; CHECK-NEXT: .LBB31_1: @ %while.body.lr.ph
1338 ; CHECK-NEXT: dlstp.32 lr, r2
1339 ; CHECK-NEXT: .LBB31_2: @ %while.body
1340 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1341 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1342 ; CHECK-NEXT: vmul.f32 q0, q0, r1
1343 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
1344 ; CHECK-NEXT: letp lr, .LBB31_2
1345 ; CHECK-NEXT: @ %bb.3: @ %while.end
1346 ; CHECK-NEXT: pop {r7, pc}
1348 %cmp11 = icmp sgt i32 %N, 0
1349 br i1 %cmp11, label %while.body.lr.ph, label %while.end
1351 while.body.lr.ph: ; preds = %entry
; Splat of the scalar operand. The placeholder vectors are poison rather than
; the deprecated undef, matching the poison-based splats used elsewhere in
; this file; only lane 0 is ever read by the shuffle mask.
1352 %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0
1353 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1354 br label %while.body
1356 while.body: ; preds = %while.body.lr.ph, %while.body
1357 %s1.addr.013 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1358 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane mask for this iteration, computed from the elements still to process.
1359 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
1360 %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s1.addr.013, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1361 %2 = tail call fast <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %1, <4 x float> %.splat, <4 x i1> %0, <4 x float> %1)
1362 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %2, ptr %s1.addr.013, i32 4, <4 x i1> %0)
; Advance by four float elements and count down by four per iteration.
1363 %add.ptr = getelementptr inbounds float, ptr %s1.addr.013, i32 4
1364 %sub = add nsw i32 %N.addr.012, -4
1365 %cmp = icmp sgt i32 %N.addr.012, 4
1366 br i1 %cmp, label %while.body, label %while.end
1368 while.end: ; preds = %while.body, %entry
; vfmaq: loop that masked-loads <4 x float> from both %x and %y and calls the
; generic llvm.fma intrinsic as fma(y, splat(%z), x); the result is
; masked-stored to %y. The lane mask comes from vctp32 on the remaining count.
; The expected assembly is a dlstp.32/letp loop containing
; vfma.f32 q1, q0, r3 (q0 loaded from [r1], q1 from [r0]).
1372 define void @vfmaq(ptr %x, ptr %y, i32 %n, float %z) {
1373 ; CHECK-LABEL: vfmaq:
1374 ; CHECK: @ %bb.0: @ %entry
1375 ; CHECK-NEXT: .save {r7, lr}
1376 ; CHECK-NEXT: push {r7, lr}
1377 ; CHECK-NEXT: cmp r2, #1
1379 ; CHECK-NEXT: poplt {r7, pc}
1380 ; CHECK-NEXT: .LBB32_1: @ %for.body.preheader
1381 ; CHECK-NEXT: dlstp.32 lr, r2
1382 ; CHECK-NEXT: .LBB32_2: @ %for.body
1383 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1384 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1385 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
1386 ; CHECK-NEXT: vfma.f32 q1, q0, r3
1387 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
1388 ; CHECK-NEXT: letp lr, .LBB32_2
1389 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1390 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes before the loop.
1392 %.splatinsert = insertelement <4 x float> poison, float %z, i32 0
1393 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1394 %cmp14 = icmp sgt i32 %n, 0
1395 br i1 %cmp14, label %for.body, label %for.cond.cleanup
1397 for.cond.cleanup: ; preds = %for.body, %entry
1400 for.body: ; preds = %entry, %for.body
1401 %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1402 %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1403 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane mask for this iteration, computed from the elements still to process.
1404 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
1405 %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %x.addr.017, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1406 %add.ptr = getelementptr inbounds float, ptr %x.addr.017, i32 4
1407 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %y.addr.016, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
; Operands: %y data and the splat are the multiplicands, %x data is the addend.
1408 %3 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %2, <4 x float> %.splat, <4 x float> %1)
1409 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %y.addr.016, i32 4, <4 x i1> %0)
1410 %add.ptr1 = getelementptr inbounds float, ptr %y.addr.016, i32 4
1411 %sub = add nsw i32 %i.015, -4
1412 %cmp = icmp sgt i32 %i.015, 4
1413 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vfma: loop that updates %s1 in place through the predicated fma intrinsic,
; which receives the data loaded from %s2, a splat of %c0, the data loaded
; from %s1 and the vctp32 lane mask. Only the %s1 pointer is advanced; %s2 is
; loop-invariant, so the same address is re-read on every iteration. The
; expected assembly is a dlstp.32/letp loop containing vfma.f32 q1, q0, r2
; (q0 loaded from [r1], q1 from [r0]).
1416 define void @vfma(ptr %s1, ptr %s2, float %c0, i32 %N) {
1417 ; CHECK-LABEL: vfma:
1418 ; CHECK: @ %bb.0: @ %entry
1419 ; CHECK-NEXT: .save {r7, lr}
1420 ; CHECK-NEXT: push {r7, lr}
1421 ; CHECK-NEXT: cmp r3, #1
1423 ; CHECK-NEXT: poplt {r7, pc}
1424 ; CHECK-NEXT: .LBB33_1: @ %while.body.lr.ph
1425 ; CHECK-NEXT: dlstp.32 lr, r3
1426 ; CHECK-NEXT: .LBB33_2: @ %while.body
1427 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1428 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1429 ; CHECK-NEXT: vldrw.u32 q1, [r0]
1430 ; CHECK-NEXT: vfma.f32 q1, q0, r2
1431 ; CHECK-NEXT: vstrw.32 q1, [r0], #16
1432 ; CHECK-NEXT: letp lr, .LBB33_2
1433 ; CHECK-NEXT: @ %bb.3: @ %while.end
1434 ; CHECK-NEXT: pop {r7, pc}
1436 %cmp12 = icmp sgt i32 %N, 0
1437 br i1 %cmp12, label %while.body.lr.ph, label %while.end
1439 while.body.lr.ph: ; preds = %entry
; Splat of the scalar operand. The placeholder vectors are poison rather than
; the deprecated undef, matching the poison-based splats used elsewhere in
; this file; only lane 0 is ever read by the shuffle mask.
1440 %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0
1441 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1442 br label %while.body
1444 while.body: ; preds = %while.body.lr.ph, %while.body
1445 %s1.addr.014 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1446 %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane mask for this iteration, computed from the elements still to process.
1447 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
1448 %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s1.addr.014, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
; %s2 has no loop-carried pointer: this load uses the function argument as-is.
1449 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s2, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1450 %3 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x float> %1, <4 x i1> %0)
1451 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %s1.addr.014, i32 4, <4 x i1> %0)
; Advance %s1 by four float elements and count down by four per iteration.
1452 %add.ptr = getelementptr inbounds float, ptr %s1.addr.014, i32 4
1453 %sub = add nsw i32 %N.addr.013, -4
1454 %cmp = icmp sgt i32 %N.addr.013, 4
1455 br i1 %cmp, label %while.body, label %while.end
1457 while.end: ; preds = %while.body, %entry
; vfmasq: loop that masked-loads <4 x float> from both %x and %y and calls the
; generic llvm.fma intrinsic as fma(x, y, splat(%z)); the result is
; masked-stored to %y. The lane mask comes from vctp32 on the remaining count.
; The expected assembly is a dlstp.32/letp loop containing
; vfmas.f32 q1, q0, r3 (q0 loaded from [r1], q1 from [r0]).
1461 define void @vfmasq(ptr %x, ptr %y, i32 %n, float %z) {
1462 ; CHECK-LABEL: vfmasq:
1463 ; CHECK: @ %bb.0: @ %entry
1464 ; CHECK-NEXT: .save {r7, lr}
1465 ; CHECK-NEXT: push {r7, lr}
1466 ; CHECK-NEXT: cmp r2, #1
1468 ; CHECK-NEXT: poplt {r7, pc}
1469 ; CHECK-NEXT: .LBB34_1: @ %for.body.preheader
1470 ; CHECK-NEXT: dlstp.32 lr, r2
1471 ; CHECK-NEXT: .LBB34_2: @ %for.body
1472 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1473 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1474 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
1475 ; CHECK-NEXT: vfmas.f32 q1, q0, r3
1476 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
1477 ; CHECK-NEXT: letp lr, .LBB34_2
1478 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1479 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes before the loop.
1481 %.splatinsert = insertelement <4 x float> poison, float %z, i32 0
1482 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1483 %cmp14 = icmp sgt i32 %n, 0
1484 br i1 %cmp14, label %for.body, label %for.cond.cleanup
1486 for.cond.cleanup: ; preds = %for.body, %entry
1489 for.body: ; preds = %entry, %for.body
1490 %x.addr.017 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
1491 %y.addr.016 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
1492 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane mask for this iteration, computed from the elements still to process.
1493 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
1494 %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %x.addr.017, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1495 %add.ptr = getelementptr inbounds float, ptr %x.addr.017, i32 4
1496 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %y.addr.016, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
; Operands: the two loaded vectors are the multiplicands, the splat is the addend.
1497 %3 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %2, <4 x float> %.splat)
1498 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %y.addr.016, i32 4, <4 x i1> %0)
1499 %add.ptr1 = getelementptr inbounds float, ptr %y.addr.016, i32 4
1500 %sub = add nsw i32 %i.015, -4
1501 %cmp = icmp sgt i32 %i.015, 4
1502 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vfmas: tail-predicated loop over %N floats that updates s1 in place using the
; predicated MVE fma intrinsic with operands (s1[i], s2[0..3], splat(%c0)).
;   %s1 - source and destination pointer, advanced by 4 floats per iteration
;   %s2 - second vector source; the same address is reloaded every iteration
;   %c0 - scalar, splatted and passed as the third fma operand
;   %N  - remaining element count; drives the vctp32 lane mask
; The expected code is a dlstp/letp loop in which the splat is folded into the
; scalar operand of vfmas.f32. The early-exit pop is conditional, so an IT
; instruction must precede it in the expected output.
define void @vfmas(ptr %s1, ptr %s2, float %c0, i32 %N) {
; CHECK-LABEL: vfmas:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB35_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB35_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vfmas.f32 q1, q0, r2
; CHECK-NEXT:    vstrw.32 q1, [r0], #16
; CHECK-NEXT:    letp lr, .LBB35_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp12 = icmp sgt i32 %N, 0
  br i1 %cmp12, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  ; Splat of %c0; poison placeholders match the sibling vfmasq test.
  %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.014 = phi ptr [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  ; Lane mask for the elements still to be processed.
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
  %1 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s1.addr.014, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %s2, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
  ; The splat is the third operand here, which is what selects vfmas.f32 in the expected output.
  %3 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %1, <4 x float> %2, <4 x float> %.splat, <4 x i1> %0)
  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %s1.addr.014, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds float, ptr %s1.addr.014, i32 4
  %sub = add nsw i32 %N.addr.013, -4
  ; Continue while more than one vector's worth of elements was pending.
  %cmp = icmp sgt i32 %N.addr.013, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}
; Intrinsic declarations for this test file, grouped by category.
; Declaration order has no semantic effect in LLVM IR.

; Lane-mask generation from a remaining-element count.
declare <4 x i1> @llvm.arm.mve.vctp32(i32)

; Generic masked memory access.
declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32 immarg, <4 x i1>, <4 x i16>)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)

; Target-independent arithmetic.
declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)

; MVE integer intrinsics, unpredicated forms.
declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>, i32)
declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>, i32)
; NOTE(review): attribute group #1 is not defined in the visible part of this file; confirm it exists or drop the reference.
declare <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1
declare <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)

; MVE integer intrinsics, predicated forms.
declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)

; MVE floating-point intrinsics, predicated forms.
declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)