1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
; @vaddq: for each group of four i32 elements, load from %x, add a splat of %z
; with a plain IR `add`, and store the sum to %y. The load and the store are
; masked by the vctp32 predicate computed from the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vadd.i32 takes its
; second source operand from core register r3.
; NOTE(review): this view shows no function-label assertion, no `entry:` label,
; no `it lt` ahead of the expected `poplt`, no terminator for for.cond.cleanup
; and no closing `}` - confirm against the complete file.
4 define void @vaddq(i32* %x, i32* %y, i32 %n, i32 %z) {
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: .save {r7, lr}
8 ; CHECK-NEXT: push {r7, lr}
9 ; CHECK-NEXT: cmp r2, #1
11 ; CHECK-NEXT: poplt {r7, pc}
12 ; CHECK-NEXT: .LBB0_1: @ %for.body.preheader
13 ; CHECK-NEXT: dlstp.32 lr, r2
14 ; CHECK-NEXT: .LBB0_2: @ %for.body
15 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
16 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
17 ; CHECK-NEXT: vadd.i32 q0, q0, r3
18 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
19 ; CHECK-NEXT: letp lr, .LBB0_2
20 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
21 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes (insert into lane 0, then an all-zero shuffle mask).
23 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
24 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
; Enter the loop only when %n is strictly positive.
25 %cmp11 = icmp sgt i32 %n, 0
26 br i1 %cmp11, label %for.body, label %for.cond.cleanup
28 for.cond.cleanup: ; preds = %for.body, %entry
31 for.body: ; preds = %entry, %for.body
; Loop-carried values: read pointer, write pointer, and elements still to process.
32 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
33 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
34 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane predicate; it masks both the load and the store below.
35 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
36 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
37 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
38 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
39 %3 = add <4 x i32> %2, %.splat
40 %4 = bitcast i32* %y.addr.013 to <4 x i32>*
41 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0)
42 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Step: four elements are consumed per iteration, and the loop repeats only
; while more than four were outstanding at the start of this one.
43 %sub = add nsw i32 %i.012, -4
44 %cmp = icmp sgt i32 %i.012, 4
45 br i1 %cmp, label %for.body, label %for.cond.cleanup
; @vadd: in-place variant. Each iteration loads four i32 lanes from %s1,
; combines them with a splat of %c0 through the predicated MVE add intrinsic,
; and stores the result back through the same pointer. Load, intrinsic and
; store all share the vctp32 predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vadd.i32 takes its
; second source operand from core register r1.
; NOTE(review): this view shows no function-label assertion, no `entry:` label,
; no `it lt` ahead of the expected `poplt`, no terminator for while.body.lr.ph
; or while.end and no closing `}` - confirm against the complete file.
48 define void @vadd(i32* %s1, i32 %c0, i32 %N) {
50 ; CHECK: @ %bb.0: @ %entry
51 ; CHECK-NEXT: .save {r7, lr}
52 ; CHECK-NEXT: push {r7, lr}
53 ; CHECK-NEXT: cmp r2, #1
55 ; CHECK-NEXT: poplt {r7, pc}
56 ; CHECK-NEXT: .LBB1_1: @ %while.body.lr.ph
57 ; CHECK-NEXT: dlstp.32 lr, r2
58 ; CHECK-NEXT: .LBB1_2: @ %while.body
59 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
60 ; CHECK-NEXT: vldrw.u32 q0, [r0]
61 ; CHECK-NEXT: vadd.i32 q0, q0, r1
62 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
63 ; CHECK-NEXT: letp lr, .LBB1_2
64 ; CHECK-NEXT: @ %bb.3: @ %while.end
65 ; CHECK-NEXT: pop {r7, pc}
; Enter the loop only when %N is strictly positive.
67 %cmp11 = icmp sgt i32 %N, 0
68 br i1 %cmp11, label %while.body.lr.ph, label %while.end
70 while.body.lr.ph: ; preds = %entry
; The splat of %c0 is built once, ahead of the loop.
71 %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
72 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
75 while.body: ; preds = %while.body.lr.ph, %while.body
; Loop-carried values: current pointer and elements still to process.
76 %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
77 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane predicate shared by the load, the intrinsic and the store.
78 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
79 %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
80 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; %2 is passed again as the last operand (presumably the value kept for
; inactive lanes - confirm against the intrinsic definition).
81 %3 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
; The result overwrites the same location that was loaded (%1).
82 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step: advance by four elements; repeat only while more than four were outstanding.
83 %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
84 %sub = add nsw i32 %N.addr.012, -4
85 %cmp = icmp sgt i32 %N.addr.012, 4
86 br i1 %cmp, label %while.body, label %while.end
88 while.end: ; preds = %while.body, %entry
; @vsubq: for each group of four i32 elements, load from %x, subtract a splat
; of %z with a plain IR `sub`, and store the difference to %y. The load and the
; store are masked by the vctp32 predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vsub.i32 takes its
; second source operand from core register r3.
; NOTE(review): this view shows no function-label assertion, no `entry:` label,
; no `it lt` ahead of the expected `poplt`, no terminator for for.cond.cleanup
; and no closing `}` - confirm against the complete file.
92 define void @vsubq(i32* %x, i32* %y, i32 %n, i32 %z) {
94 ; CHECK: @ %bb.0: @ %entry
95 ; CHECK-NEXT: .save {r7, lr}
96 ; CHECK-NEXT: push {r7, lr}
97 ; CHECK-NEXT: cmp r2, #1
99 ; CHECK-NEXT: poplt {r7, pc}
100 ; CHECK-NEXT: .LBB2_1: @ %for.body.preheader
101 ; CHECK-NEXT: dlstp.32 lr, r2
102 ; CHECK-NEXT: .LBB2_2: @ %for.body
103 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
104 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
105 ; CHECK-NEXT: vsub.i32 q0, q0, r3
106 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
107 ; CHECK-NEXT: letp lr, .LBB2_2
108 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
109 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes (insert into lane 0, then an all-zero shuffle mask).
111 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
112 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
; Enter the loop only when %n is strictly positive.
113 %cmp11 = icmp sgt i32 %n, 0
114 br i1 %cmp11, label %for.body, label %for.cond.cleanup
116 for.cond.cleanup: ; preds = %for.body, %entry
119 for.body: ; preds = %entry, %for.body
; Loop-carried values: read pointer, write pointer, and elements still to process.
120 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
121 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
122 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane predicate; it masks both the load and the store below.
123 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
124 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
125 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
126 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
; The splat is the right-hand operand: loaded value minus %z.
127 %3 = sub <4 x i32> %2, %.splat
128 %4 = bitcast i32* %y.addr.013 to <4 x i32>*
129 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0)
130 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Step: four elements are consumed per iteration, and the loop repeats only
; while more than four were outstanding at the start of this one.
131 %sub = add nsw i32 %i.012, -4
132 %cmp = icmp sgt i32 %i.012, 4
133 br i1 %cmp, label %for.body, label %for.cond.cleanup
; @vsub: in-place variant. Each iteration loads four i32 lanes from %s1,
; combines them with a splat of %c0 through the predicated MVE sub intrinsic,
; and stores the result back through the same pointer. Load, intrinsic and
; store all share the vctp32 predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vsub.i32 takes its
; second source operand from core register r1.
; NOTE(review): this view shows no function-label assertion, no `entry:` label,
; no `it lt` ahead of the expected `poplt`, no terminator for while.body.lr.ph
; or while.end and no closing `}` - confirm against the complete file.
136 define void @vsub(i32* %s1, i32 %c0, i32 %N) {
138 ; CHECK: @ %bb.0: @ %entry
139 ; CHECK-NEXT: .save {r7, lr}
140 ; CHECK-NEXT: push {r7, lr}
141 ; CHECK-NEXT: cmp r2, #1
143 ; CHECK-NEXT: poplt {r7, pc}
144 ; CHECK-NEXT: .LBB3_1: @ %while.body.lr.ph
145 ; CHECK-NEXT: dlstp.32 lr, r2
146 ; CHECK-NEXT: .LBB3_2: @ %while.body
147 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
148 ; CHECK-NEXT: vldrw.u32 q0, [r0]
149 ; CHECK-NEXT: vsub.i32 q0, q0, r1
150 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
151 ; CHECK-NEXT: letp lr, .LBB3_2
152 ; CHECK-NEXT: @ %bb.3: @ %while.end
153 ; CHECK-NEXT: pop {r7, pc}
; Enter the loop only when %N is strictly positive.
155 %cmp11 = icmp sgt i32 %N, 0
156 br i1 %cmp11, label %while.body.lr.ph, label %while.end
158 while.body.lr.ph: ; preds = %entry
; The splat of %c0 is built once, ahead of the loop.
159 %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
160 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
163 while.body: ; preds = %while.body.lr.ph, %while.body
; Loop-carried values: current pointer and elements still to process.
164 %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
165 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane predicate shared by the load, the intrinsic and the store.
166 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
167 %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
168 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; %2 is passed again as the last operand (presumably the value kept for
; inactive lanes - confirm against the intrinsic definition).
169 %3 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
; The result overwrites the same location that was loaded (%1).
170 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step: advance by four elements; repeat only while more than four were outstanding.
171 %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
172 %sub = add nsw i32 %N.addr.012, -4
173 %cmp = icmp sgt i32 %N.addr.012, 4
174 br i1 %cmp, label %while.body, label %while.end
176 while.end: ; preds = %while.body, %entry
; @vmulq: for each group of four i32 elements, load from %x, multiply by a
; splat of %z with a plain IR `mul`, and store the product to %y. The load and
; the store are masked by the vctp32 predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vmul.i32 takes its
; second source operand from core register r3.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for for.cond.cleanup and no closing `}` -
; confirm against the complete file.
180 define void @vmulq(i32* %x, i32* %y, i32 %n, i32 %z) {
181 ; CHECK-LABEL: vmulq:
182 ; CHECK: @ %bb.0: @ %entry
183 ; CHECK-NEXT: .save {r7, lr}
184 ; CHECK-NEXT: push {r7, lr}
185 ; CHECK-NEXT: cmp r2, #1
187 ; CHECK-NEXT: poplt {r7, pc}
188 ; CHECK-NEXT: .LBB4_1: @ %for.body.preheader
189 ; CHECK-NEXT: dlstp.32 lr, r2
190 ; CHECK-NEXT: .LBB4_2: @ %for.body
191 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
192 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
193 ; CHECK-NEXT: vmul.i32 q0, q0, r3
194 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
195 ; CHECK-NEXT: letp lr, .LBB4_2
196 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
197 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes (insert into lane 0, then an all-zero shuffle mask).
199 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
200 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
; Enter the loop only when %n is strictly positive.
201 %cmp11 = icmp sgt i32 %n, 0
202 br i1 %cmp11, label %for.body, label %for.cond.cleanup
204 for.cond.cleanup: ; preds = %for.body, %entry
207 for.body: ; preds = %entry, %for.body
; Loop-carried values: read pointer, write pointer, and elements still to process.
208 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
209 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
210 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane predicate; it masks both the load and the store below.
211 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
212 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
213 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
214 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
215 %3 = mul <4 x i32> %2, %.splat
216 %4 = bitcast i32* %y.addr.013 to <4 x i32>*
217 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0)
218 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Step: four elements are consumed per iteration, and the loop repeats only
; while more than four were outstanding at the start of this one.
219 %sub = add nsw i32 %i.012, -4
220 %cmp = icmp sgt i32 %i.012, 4
221 br i1 %cmp, label %for.body, label %for.cond.cleanup
; @vmul: in-place variant. Each iteration loads four i32 lanes from %s1,
; combines them with a splat of %c0 through the predicated MVE mul intrinsic,
; and stores the result back through the same pointer. Load, intrinsic and
; store all share the vctp32 predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vmul.i32 takes its
; second source operand from core register r1.
; NOTE(review): this view shows no function-label assertion, no `entry:` label,
; no `it lt` ahead of the expected `poplt`, no terminator for while.body.lr.ph
; or while.end and no closing `}` - confirm against the complete file.
224 define void @vmul(i32* %s1, i32 %c0, i32 %N) {
226 ; CHECK: @ %bb.0: @ %entry
227 ; CHECK-NEXT: .save {r7, lr}
228 ; CHECK-NEXT: push {r7, lr}
229 ; CHECK-NEXT: cmp r2, #1
231 ; CHECK-NEXT: poplt {r7, pc}
232 ; CHECK-NEXT: .LBB5_1: @ %while.body.lr.ph
233 ; CHECK-NEXT: dlstp.32 lr, r2
234 ; CHECK-NEXT: .LBB5_2: @ %while.body
235 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
236 ; CHECK-NEXT: vldrw.u32 q0, [r0]
237 ; CHECK-NEXT: vmul.i32 q0, q0, r1
238 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
239 ; CHECK-NEXT: letp lr, .LBB5_2
240 ; CHECK-NEXT: @ %bb.3: @ %while.end
241 ; CHECK-NEXT: pop {r7, pc}
; Enter the loop only when %N is strictly positive.
243 %cmp11 = icmp sgt i32 %N, 0
244 br i1 %cmp11, label %while.body.lr.ph, label %while.end
246 while.body.lr.ph: ; preds = %entry
; The splat of %c0 is built once, ahead of the loop.
247 %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
248 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
251 while.body: ; preds = %while.body.lr.ph, %while.body
; Loop-carried values: current pointer and elements still to process.
252 %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
253 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane predicate shared by the load, the intrinsic and the store.
254 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
255 %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
256 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; %2 is passed again as the last operand (presumably the value kept for
; inactive lanes - confirm against the intrinsic definition).
257 %3 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
; The result overwrites the same location that was loaded (%1).
258 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step: advance by four elements; repeat only while more than four were outstanding.
259 %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
260 %sub = add nsw i32 %N.addr.012, -4
261 %cmp = icmp sgt i32 %N.addr.012, 4
262 br i1 %cmp, label %while.body, label %while.end
264 while.end: ; preds = %while.body, %entry
; @vqaddq: for each group of four i32 elements, load from %x, combine with a
; splat of %z through the generic @llvm.sadd.sat.v4i32 intrinsic, and store the
; result to %y. The load and the store are masked by the vctp32 predicate of
; the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vqadd.s32 takes its
; second source operand from core register r3.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for for.cond.cleanup and no closing `}` -
; confirm against the complete file.
268 define void @vqaddq(i32* %x, i32* %y, i32 %n, i32 %z) {
269 ; CHECK-LABEL: vqaddq:
270 ; CHECK: @ %bb.0: @ %entry
271 ; CHECK-NEXT: .save {r7, lr}
272 ; CHECK-NEXT: push {r7, lr}
273 ; CHECK-NEXT: cmp r2, #1
275 ; CHECK-NEXT: poplt {r7, pc}
276 ; CHECK-NEXT: .LBB6_1: @ %for.body.preheader
277 ; CHECK-NEXT: dlstp.32 lr, r2
278 ; CHECK-NEXT: .LBB6_2: @ %for.body
279 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
280 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
281 ; CHECK-NEXT: vqadd.s32 q0, q0, r3
282 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
283 ; CHECK-NEXT: letp lr, .LBB6_2
284 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
285 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes (insert into lane 0, then an all-zero shuffle mask).
287 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
288 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
; Enter the loop only when %n is strictly positive.
289 %cmp11 = icmp sgt i32 %n, 0
290 br i1 %cmp11, label %for.body, label %for.cond.cleanup
292 for.cond.cleanup: ; preds = %for.body, %entry
295 for.body: ; preds = %entry, %for.body
; Loop-carried values: read pointer, write pointer, and elements still to process.
296 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
297 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
298 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane predicate; it masks both the load and the store below.
299 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
300 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
301 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
302 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
; The sadd.sat call itself is unpredicated; only the memory accesses use %0.
303 %3 = tail call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %2, <4 x i32> %.splat)
304 %4 = bitcast i32* %y.addr.013 to <4 x i32>*
305 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0)
306 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Step: four elements are consumed per iteration, and the loop repeats only
; while more than four were outstanding at the start of this one.
307 %sub = add nsw i32 %i.012, -4
308 %cmp = icmp sgt i32 %i.012, 4
309 br i1 %cmp, label %for.body, label %for.cond.cleanup
; @vqaddqu: same loop shape as a masked load / combine / masked store over
; groups of four i32 elements, but the combine step is the generic
; @llvm.uadd.sat.v4i32 intrinsic applied to the loaded vector and a splat of %z.
; The expectations below require a dlstp.32/letp loop whose vqadd.u32 takes its
; second source operand from core register r3.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for for.cond.cleanup and no closing `}` -
; confirm against the complete file.
312 define void @vqaddqu(i32* %x, i32* %y, i32 %n, i32 %z) {
313 ; CHECK-LABEL: vqaddqu:
314 ; CHECK: @ %bb.0: @ %entry
315 ; CHECK-NEXT: .save {r7, lr}
316 ; CHECK-NEXT: push {r7, lr}
317 ; CHECK-NEXT: cmp r2, #1
319 ; CHECK-NEXT: poplt {r7, pc}
320 ; CHECK-NEXT: .LBB7_1: @ %for.body.preheader
321 ; CHECK-NEXT: dlstp.32 lr, r2
322 ; CHECK-NEXT: .LBB7_2: @ %for.body
323 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
324 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
325 ; CHECK-NEXT: vqadd.u32 q0, q0, r3
326 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
327 ; CHECK-NEXT: letp lr, .LBB7_2
328 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
329 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes (insert into lane 0, then an all-zero shuffle mask).
331 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
332 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
; Enter the loop only when %n is strictly positive.
333 %cmp11 = icmp sgt i32 %n, 0
334 br i1 %cmp11, label %for.body, label %for.cond.cleanup
336 for.cond.cleanup: ; preds = %for.body, %entry
339 for.body: ; preds = %entry, %for.body
; Loop-carried values: read pointer, write pointer, and elements still to process.
340 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
341 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
342 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane predicate; it masks both the load and the store below.
343 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
344 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
345 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
346 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
; The uadd.sat call itself is unpredicated; only the memory accesses use %0.
347 %3 = tail call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %2, <4 x i32> %.splat)
348 %4 = bitcast i32* %y.addr.013 to <4 x i32>*
349 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0)
350 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Step: four elements are consumed per iteration, and the loop repeats only
; while more than four were outstanding at the start of this one.
351 %sub = add nsw i32 %i.012, -4
352 %cmp = icmp sgt i32 %i.012, 4
353 br i1 %cmp, label %for.body, label %for.cond.cleanup
; @vqadd: in-place variant. Each iteration loads four i32 lanes from %s1,
; combines them with a splat of %c0 through the predicated MVE qadd intrinsic,
; and stores the result back through the same pointer. Load, intrinsic and
; store all share the vctp32 predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vqadd.s32 takes its
; second source operand from core register r1.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for while.body.lr.ph or while.end and no
; closing `}` - confirm against the complete file.
356 define void @vqadd(i32* %s1, i32 %c0, i32 %N) {
357 ; CHECK-LABEL: vqadd:
358 ; CHECK: @ %bb.0: @ %entry
359 ; CHECK-NEXT: .save {r7, lr}
360 ; CHECK-NEXT: push {r7, lr}
361 ; CHECK-NEXT: cmp r2, #1
363 ; CHECK-NEXT: poplt {r7, pc}
364 ; CHECK-NEXT: .LBB8_1: @ %while.body.lr.ph
365 ; CHECK-NEXT: dlstp.32 lr, r2
366 ; CHECK-NEXT: .LBB8_2: @ %while.body
367 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
368 ; CHECK-NEXT: vldrw.u32 q0, [r0]
369 ; CHECK-NEXT: vqadd.s32 q0, q0, r1
370 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
371 ; CHECK-NEXT: letp lr, .LBB8_2
372 ; CHECK-NEXT: @ %bb.3: @ %while.end
373 ; CHECK-NEXT: pop {r7, pc}
; Enter the loop only when %N is strictly positive.
375 %cmp11 = icmp sgt i32 %N, 0
376 br i1 %cmp11, label %while.body.lr.ph, label %while.end
378 while.body.lr.ph: ; preds = %entry
; The splat of %c0 is built once, ahead of the loop.
379 %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
380 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
383 while.body: ; preds = %while.body.lr.ph, %while.body
; Loop-carried values: current pointer and elements still to process.
384 %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
385 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane predicate shared by the load, the intrinsic and the store.
386 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
387 %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
388 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; The `i32 0` operand presumably selects the signed form (the expected mnemonic
; is vqadd.s32), and the trailing %2 is presumably the value kept for inactive
; lanes - confirm both against the intrinsic definition.
389 %3 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
; The result overwrites the same location that was loaded (%1).
390 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step: advance by four elements; repeat only while more than four were outstanding.
391 %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
392 %sub = add nsw i32 %N.addr.012, -4
393 %cmp = icmp sgt i32 %N.addr.012, 4
394 br i1 %cmp, label %while.body, label %while.end
396 while.end: ; preds = %while.body, %entry
; @vqsubq: for each group of four i32 elements, load from %x, combine with a
; splat of %z through the generic @llvm.ssub.sat.v4i32 intrinsic, and store the
; result to %y. The load and the store are masked by the vctp32 predicate of
; the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vqsub.s32 takes its
; second source operand from core register r3.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for for.cond.cleanup and no closing `}` -
; confirm against the complete file.
400 define void @vqsubq(i32* %x, i32* %y, i32 %n, i32 %z) {
401 ; CHECK-LABEL: vqsubq:
402 ; CHECK: @ %bb.0: @ %entry
403 ; CHECK-NEXT: .save {r7, lr}
404 ; CHECK-NEXT: push {r7, lr}
405 ; CHECK-NEXT: cmp r2, #1
407 ; CHECK-NEXT: poplt {r7, pc}
408 ; CHECK-NEXT: .LBB9_1: @ %for.body.preheader
409 ; CHECK-NEXT: dlstp.32 lr, r2
410 ; CHECK-NEXT: .LBB9_2: @ %for.body
411 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
412 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
413 ; CHECK-NEXT: vqsub.s32 q0, q0, r3
414 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
415 ; CHECK-NEXT: letp lr, .LBB9_2
416 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
417 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes (insert into lane 0, then an all-zero shuffle mask).
419 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
420 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
; Enter the loop only when %n is strictly positive.
421 %cmp11 = icmp sgt i32 %n, 0
422 br i1 %cmp11, label %for.body, label %for.cond.cleanup
424 for.cond.cleanup: ; preds = %for.body, %entry
427 for.body: ; preds = %entry, %for.body
; Loop-carried values: read pointer, write pointer, and elements still to process.
428 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
429 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
430 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane predicate; it masks both the load and the store below.
431 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
432 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
433 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
434 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
; The ssub.sat call itself is unpredicated; the splat is its second operand.
435 %3 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %2, <4 x i32> %.splat)
436 %4 = bitcast i32* %y.addr.013 to <4 x i32>*
437 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0)
438 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Step: four elements are consumed per iteration, and the loop repeats only
; while more than four were outstanding at the start of this one.
439 %sub = add nsw i32 %i.012, -4
440 %cmp = icmp sgt i32 %i.012, 4
441 br i1 %cmp, label %for.body, label %for.cond.cleanup
; @vqsubqu: masked load / combine / masked store over groups of four i32
; elements, where the combine step is the generic @llvm.usub.sat.v4i32
; intrinsic applied to the loaded vector and a splat of %z.
; The expectations below require a dlstp.32/letp loop whose vqsub.u32 takes its
; second source operand from core register r3.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for for.cond.cleanup and no closing `}` -
; confirm against the complete file.
444 define void @vqsubqu(i32* %x, i32* %y, i32 %n, i32 %z) {
445 ; CHECK-LABEL: vqsubqu:
446 ; CHECK: @ %bb.0: @ %entry
447 ; CHECK-NEXT: .save {r7, lr}
448 ; CHECK-NEXT: push {r7, lr}
449 ; CHECK-NEXT: cmp r2, #1
451 ; CHECK-NEXT: poplt {r7, pc}
452 ; CHECK-NEXT: .LBB10_1: @ %for.body.preheader
453 ; CHECK-NEXT: dlstp.32 lr, r2
454 ; CHECK-NEXT: .LBB10_2: @ %for.body
455 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
456 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
457 ; CHECK-NEXT: vqsub.u32 q0, q0, r3
458 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
459 ; CHECK-NEXT: letp lr, .LBB10_2
460 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
461 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes (insert into lane 0, then an all-zero shuffle mask).
463 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
464 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
; Enter the loop only when %n is strictly positive.
465 %cmp11 = icmp sgt i32 %n, 0
466 br i1 %cmp11, label %for.body, label %for.cond.cleanup
468 for.cond.cleanup: ; preds = %for.body, %entry
471 for.body: ; preds = %entry, %for.body
; Loop-carried values: read pointer, write pointer, and elements still to process.
472 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
473 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
474 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane predicate; it masks both the load and the store below.
475 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
476 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
477 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
478 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
; The usub.sat call itself is unpredicated; the splat is its second operand.
479 %3 = tail call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %2, <4 x i32> %.splat)
480 %4 = bitcast i32* %y.addr.013 to <4 x i32>*
481 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0)
482 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Step: four elements are consumed per iteration, and the loop repeats only
; while more than four were outstanding at the start of this one.
483 %sub = add nsw i32 %i.012, -4
484 %cmp = icmp sgt i32 %i.012, 4
485 br i1 %cmp, label %for.body, label %for.cond.cleanup
; @vqsub: in-place variant. Each iteration loads four i32 lanes from %s1,
; combines them with a splat of %c0 through the predicated MVE qsub intrinsic,
; and stores the result back through the same pointer. Load, intrinsic and
; store all share the vctp32 predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vqsub.s32 takes its
; second source operand from core register r1.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for while.body.lr.ph or while.end and no
; closing `}` - confirm against the complete file.
488 define void @vqsub(i32* %s1, i32 %c0, i32 %N) {
489 ; CHECK-LABEL: vqsub:
490 ; CHECK: @ %bb.0: @ %entry
491 ; CHECK-NEXT: .save {r7, lr}
492 ; CHECK-NEXT: push {r7, lr}
493 ; CHECK-NEXT: cmp r2, #1
495 ; CHECK-NEXT: poplt {r7, pc}
496 ; CHECK-NEXT: .LBB11_1: @ %while.body.lr.ph
497 ; CHECK-NEXT: dlstp.32 lr, r2
498 ; CHECK-NEXT: .LBB11_2: @ %while.body
499 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
500 ; CHECK-NEXT: vldrw.u32 q0, [r0]
501 ; CHECK-NEXT: vqsub.s32 q0, q0, r1
502 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
503 ; CHECK-NEXT: letp lr, .LBB11_2
504 ; CHECK-NEXT: @ %bb.3: @ %while.end
505 ; CHECK-NEXT: pop {r7, pc}
; Enter the loop only when %N is strictly positive.
507 %cmp11 = icmp sgt i32 %N, 0
508 br i1 %cmp11, label %while.body.lr.ph, label %while.end
510 while.body.lr.ph: ; preds = %entry
; The splat of %c0 is built once, ahead of the loop.
511 %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
512 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
515 while.body: ; preds = %while.body.lr.ph, %while.body
; Loop-carried values: current pointer and elements still to process.
516 %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
517 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane predicate shared by the load, the intrinsic and the store.
518 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
519 %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
520 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; The `i32 0` operand presumably selects the signed form (the expected mnemonic
; is vqsub.s32), and the trailing %2 is presumably the value kept for inactive
; lanes - confirm both against the intrinsic definition.
521 %3 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
; The result overwrites the same location that was loaded (%1).
522 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step: advance by four elements; repeat only while more than four were outstanding.
523 %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
524 %sub = add nsw i32 %N.addr.012, -4
525 %cmp = icmp sgt i32 %N.addr.012, 4
526 br i1 %cmp, label %while.body, label %while.end
528 while.end: ; preds = %while.body, %entry
; @vhaddq: for each group of four i32 elements, load from %x, combine with a
; splat of %z through the unpredicated @llvm.arm.mve.vhadd.v4i32 intrinsic, and
; store the result to %y. The load and the store are masked by the vctp32
; predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vhadd.s32 takes its
; second source operand from core register r3.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for for.cond.cleanup and no closing `}` -
; confirm against the complete file.
532 define void @vhaddq(i32* %x, i32* %y, i32 %n, i32 %z) {
533 ; CHECK-LABEL: vhaddq:
534 ; CHECK: @ %bb.0: @ %entry
535 ; CHECK-NEXT: .save {r7, lr}
536 ; CHECK-NEXT: push {r7, lr}
537 ; CHECK-NEXT: cmp r2, #1
539 ; CHECK-NEXT: poplt {r7, pc}
540 ; CHECK-NEXT: .LBB12_1: @ %for.body.preheader
541 ; CHECK-NEXT: dlstp.32 lr, r2
542 ; CHECK-NEXT: .LBB12_2: @ %for.body
543 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
544 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
545 ; CHECK-NEXT: vhadd.s32 q0, q0, r3
546 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
547 ; CHECK-NEXT: letp lr, .LBB12_2
548 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
549 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes (insert into lane 0, then an all-zero shuffle mask).
551 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
552 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
; Enter the loop only when %n is strictly positive.
553 %cmp11 = icmp sgt i32 %n, 0
554 br i1 %cmp11, label %for.body, label %for.cond.cleanup
556 for.cond.cleanup: ; preds = %for.body, %entry
559 for.body: ; preds = %entry, %for.body
; Loop-carried values: read pointer, write pointer, and elements still to process.
560 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
561 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
562 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane predicate; it masks both the load and the store below.
563 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
564 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
565 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
566 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
; The trailing `i32 0` presumably selects the signed form (the expected
; mnemonic is vhadd.s32) - confirm against the intrinsic definition.
567 %3 = tail call <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32> %2, <4 x i32> %.splat, i32 0)
568 %4 = bitcast i32* %y.addr.013 to <4 x i32>*
569 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0)
570 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Step: four elements are consumed per iteration, and the loop repeats only
; while more than four were outstanding at the start of this one.
571 %sub = add nsw i32 %i.012, -4
572 %cmp = icmp sgt i32 %i.012, 4
573 br i1 %cmp, label %for.body, label %for.cond.cleanup
; @vhadd: in-place variant. Each iteration loads four i32 lanes from %s1,
; combines them with a splat of %c0 through the predicated MVE hadd intrinsic,
; and stores the result back through the same pointer. Load, intrinsic and
; store all share the vctp32 predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vhadd.s32 takes its
; second source operand from core register r1.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for while.body.lr.ph or while.end and no
; closing `}` - confirm against the complete file.
576 define void @vhadd(i32* %s1, i32 %c0, i32 %N) {
577 ; CHECK-LABEL: vhadd:
578 ; CHECK: @ %bb.0: @ %entry
579 ; CHECK-NEXT: .save {r7, lr}
580 ; CHECK-NEXT: push {r7, lr}
581 ; CHECK-NEXT: cmp r2, #1
583 ; CHECK-NEXT: poplt {r7, pc}
584 ; CHECK-NEXT: .LBB13_1: @ %while.body.lr.ph
585 ; CHECK-NEXT: dlstp.32 lr, r2
586 ; CHECK-NEXT: .LBB13_2: @ %while.body
587 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
588 ; CHECK-NEXT: vldrw.u32 q0, [r0]
589 ; CHECK-NEXT: vhadd.s32 q0, q0, r1
590 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
591 ; CHECK-NEXT: letp lr, .LBB13_2
592 ; CHECK-NEXT: @ %bb.3: @ %while.end
593 ; CHECK-NEXT: pop {r7, pc}
; Enter the loop only when %N is strictly positive.
595 %cmp11 = icmp sgt i32 %N, 0
596 br i1 %cmp11, label %while.body.lr.ph, label %while.end
598 while.body.lr.ph: ; preds = %entry
; The splat of %c0 is built once, ahead of the loop.
599 %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
600 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
603 while.body: ; preds = %while.body.lr.ph, %while.body
; Loop-carried values: current pointer and elements still to process.
604 %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
605 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane predicate shared by the load, the intrinsic and the store.
606 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
607 %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
608 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; The `i32 0` operand presumably selects the signed form (the expected mnemonic
; is vhadd.s32), and the trailing %2 is presumably the value kept for inactive
; lanes - confirm both against the intrinsic definition.
609 %3 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
; The result overwrites the same location that was loaded (%1).
610 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step: advance by four elements; repeat only while more than four were outstanding.
611 %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
612 %sub = add nsw i32 %N.addr.012, -4
613 %cmp = icmp sgt i32 %N.addr.012, 4
614 br i1 %cmp, label %while.body, label %while.end
616 while.end: ; preds = %while.body, %entry
; @vhsubq: for each group of four i32 elements, load from %x, combine with a
; splat of %z through the unpredicated @llvm.arm.mve.vhsub.v4i32 intrinsic, and
; store the result to %y. The load and the store are masked by the vctp32
; predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vhsub.s32 takes its
; second source operand from core register r3.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for for.cond.cleanup and no closing `}` -
; confirm against the complete file.
620 define void @vhsubq(i32* %x, i32* %y, i32 %n, i32 %z) {
621 ; CHECK-LABEL: vhsubq:
622 ; CHECK: @ %bb.0: @ %entry
623 ; CHECK-NEXT: .save {r7, lr}
624 ; CHECK-NEXT: push {r7, lr}
625 ; CHECK-NEXT: cmp r2, #1
627 ; CHECK-NEXT: poplt {r7, pc}
628 ; CHECK-NEXT: .LBB14_1: @ %for.body.preheader
629 ; CHECK-NEXT: dlstp.32 lr, r2
630 ; CHECK-NEXT: .LBB14_2: @ %for.body
631 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
632 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
633 ; CHECK-NEXT: vhsub.s32 q0, q0, r3
634 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
635 ; CHECK-NEXT: letp lr, .LBB14_2
636 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
637 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes (insert into lane 0, then an all-zero shuffle mask).
639 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
640 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
; Enter the loop only when %n is strictly positive.
641 %cmp11 = icmp sgt i32 %n, 0
642 br i1 %cmp11, label %for.body, label %for.cond.cleanup
644 for.cond.cleanup: ; preds = %for.body, %entry
647 for.body: ; preds = %entry, %for.body
; Loop-carried values: read pointer, write pointer, and elements still to process.
648 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
649 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
650 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane predicate; it masks both the load and the store below.
651 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
652 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
653 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
654 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
; The trailing `i32 0` presumably selects the signed form (the expected
; mnemonic is vhsub.s32) - confirm against the intrinsic definition.
655 %3 = tail call <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32> %2, <4 x i32> %.splat, i32 0)
656 %4 = bitcast i32* %y.addr.013 to <4 x i32>*
657 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0)
658 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Step: four elements are consumed per iteration, and the loop repeats only
; while more than four were outstanding at the start of this one.
659 %sub = add nsw i32 %i.012, -4
660 %cmp = icmp sgt i32 %i.012, 4
661 br i1 %cmp, label %for.body, label %for.cond.cleanup
; @vhsub: in-place variant. Each iteration loads four i32 lanes from %s1,
; combines them with a splat of %c0 through the predicated MVE hsub intrinsic,
; and stores the result back through the same pointer. Load, intrinsic and
; store all share the vctp32 predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vhsub.s32 takes its
; second source operand from core register r1.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for while.body.lr.ph or while.end and no
; closing `}` - confirm against the complete file.
664 define void @vhsub(i32* %s1, i32 %c0, i32 %N) {
665 ; CHECK-LABEL: vhsub:
666 ; CHECK: @ %bb.0: @ %entry
667 ; CHECK-NEXT: .save {r7, lr}
668 ; CHECK-NEXT: push {r7, lr}
669 ; CHECK-NEXT: cmp r2, #1
671 ; CHECK-NEXT: poplt {r7, pc}
672 ; CHECK-NEXT: .LBB15_1: @ %while.body.lr.ph
673 ; CHECK-NEXT: dlstp.32 lr, r2
674 ; CHECK-NEXT: .LBB15_2: @ %while.body
675 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
676 ; CHECK-NEXT: vldrw.u32 q0, [r0]
677 ; CHECK-NEXT: vhsub.s32 q0, q0, r1
678 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
679 ; CHECK-NEXT: letp lr, .LBB15_2
680 ; CHECK-NEXT: @ %bb.3: @ %while.end
681 ; CHECK-NEXT: pop {r7, pc}
; Enter the loop only when %N is strictly positive.
683 %cmp11 = icmp sgt i32 %N, 0
684 br i1 %cmp11, label %while.body.lr.ph, label %while.end
686 while.body.lr.ph: ; preds = %entry
; The splat of %c0 is built once, ahead of the loop.
687 %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
688 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
691 while.body: ; preds = %while.body.lr.ph, %while.body
; Loop-carried values: current pointer and elements still to process.
692 %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
693 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane predicate shared by the load, the intrinsic and the store.
694 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
695 %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
696 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; The `i32 0` operand presumably selects the signed form (the expected mnemonic
; is vhsub.s32), and the trailing %2 is presumably the value kept for inactive
; lanes - confirm both against the intrinsic definition.
697 %3 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
; The result overwrites the same location that was loaded (%1).
698 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
; Step: advance by four elements; repeat only while more than four were outstanding.
699 %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
700 %sub = add nsw i32 %N.addr.012, -4
701 %cmp = icmp sgt i32 %N.addr.012, 4
702 br i1 %cmp, label %while.body, label %while.end
704 while.end: ; preds = %while.body, %entry
; @vqdmullbq: for each group of four i32 elements, load from %x, feed the
; loaded vector and a splat of %z to @llvm.arm.mve.vqdmull.v2i64.v4i32, bitcast
; the <2 x i64> result back to <4 x i32>, and store it to %y. The load and the
; store are masked by the vctp32 predicate of the remaining element count.
; The expectations below require a dlstp.32/letp loop whose vqdmullb.s32 takes
; its second source operand from core register r3 and writes q1 rather than
; the q0 register it reads; the following store then uses q1.
; NOTE(review): this view shows no `entry:` label, no `it lt` ahead of the
; expected `poplt`, no terminator for for.cond.cleanup and no closing `}` -
; confirm against the complete file.
708 define void @vqdmullbq(i32* %x, i32* %y, i32 %n, i32 %z) {
709 ; CHECK-LABEL: vqdmullbq:
710 ; CHECK: @ %bb.0: @ %entry
711 ; CHECK-NEXT: .save {r7, lr}
712 ; CHECK-NEXT: push {r7, lr}
713 ; CHECK-NEXT: cmp r2, #1
715 ; CHECK-NEXT: poplt {r7, pc}
716 ; CHECK-NEXT: .LBB16_1: @ %for.body.preheader
717 ; CHECK-NEXT: dlstp.32 lr, r2
718 ; CHECK-NEXT: .LBB16_2: @ %for.body
719 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
720 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
721 ; CHECK-NEXT: vqdmullb.s32 q1, q0, r3
722 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
723 ; CHECK-NEXT: letp lr, .LBB16_2
724 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
725 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z to all four lanes (insert into lane 0, then an all-zero shuffle mask).
727 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
728 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
; Enter the loop only when %n is strictly positive.
729 %cmp11 = icmp sgt i32 %n, 0
730 br i1 %cmp11, label %for.body, label %for.cond.cleanup
732 for.cond.cleanup: ; preds = %for.body, %entry
735 for.body: ; preds = %entry, %for.body
; Loop-carried values: read pointer, write pointer, and elements still to process.
736 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
737 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
738 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane predicate; it masks both the load and the store below.
739 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
740 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
741 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
742 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
; The trailing `i32 0` presumably selects the bottom-lane form (the expected
; mnemonic is vqdmullb) - confirm against the intrinsic definition.
743 %3 = tail call <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32> %2, <4 x i32> %.splat, i32 0)
; Reinterpret the two 64-bit results as four i32 lanes so the v4i32 masked store can be used.
744 %4 = bitcast <2 x i64> %3 to <4 x i32>
745 %5 = bitcast i32* %y.addr.013 to <4 x i32>*
746 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %4, <4 x i32>* %5, i32 4, <4 x i1> %0)
747 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Step: four elements are consumed per iteration, and the loop repeats only
; while more than four were outstanding at the start of this one.
748 %sub = add nsw i32 %i.012, -4
749 %cmp = icmp sgt i32 %i.012, 4
750 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vqdmull: in-place loop over %N elements at %s1, four per iteration. Each
; iteration does a masked load of <4 x i16>, sign-extends it to <4 x i32>,
; and calls llvm.arm.mve.vqdmull.predicated against a splat of %c0 truncated
; to i16. The result is written back through the same address with a masked
; store. The expected output is vldrh.s32 / vqdmullb.s16 q0, q0, r1 /
; vstrw.32 inside a dlstp.32/letp loop.
754 define void @vqdmull(i32* %s1, i32 %c0, i32 %N) {
755 ; CHECK-LABEL: vqdmull:
756 ; CHECK: @ %bb.0: @ %entry
757 ; CHECK-NEXT: .save {r7, lr}
758 ; CHECK-NEXT: push {r7, lr}
759 ; CHECK-NEXT: cmp r2, #1
761 ; CHECK-NEXT: poplt {r7, pc}
762 ; CHECK-NEXT: .LBB17_1: @ %while.body.lr.ph
763 ; CHECK-NEXT: dlstp.32 lr, r2
764 ; CHECK-NEXT: .LBB17_2: @ %while.body
765 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
766 ; CHECK-NEXT: vldrh.s32 q0, [r0]
767 ; CHECK-NEXT: vqdmullb.s16 q0, q0, r1
768 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
769 ; CHECK-NEXT: letp lr, .LBB17_2
770 ; CHECK-NEXT: @ %bb.3: @ %while.end
771 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %N <= 0.
773 %cmp11 = icmp sgt i32 %N, 0
774 br i1 %cmp11, label %while.body.lr.ph, label %while.end
776 while.body.lr.ph: ; preds = %entry
; Narrow %c0 to i16 and broadcast it into all eight i16 lanes.
777 %conv = trunc i32 %c0 to i16
778 %.splatinsert = insertelement <8 x i16> undef, i16 %conv, i32 0
779 %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
; NOTE(review): no terminator for while.body.lr.ph is visible in this
; listing (@vaddf ends the same block with a br to %while.body) - confirm
; against the full file.
782 while.body: ; preds = %while.body.lr.ph, %while.body
783 %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
784 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane mask derived from the remaining element count; it guards
; the load, the intrinsic and the store.
785 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
786 %1 = bitcast i32* %s1.addr.013 to <4 x i16>*
787 %2 = tail call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %1, i32 2, <4 x i1> %0, <4 x i16> zeroinitializer)
788 %3 = sext <4 x i16> %2 to <4 x i32>
; The sign-extended data is reinterpreted as <8 x i16> to match the
; intrinsic's operand type.
789 %4 = bitcast <4 x i32> %3 to <8 x i16>
; NOTE(review): i32 0 presumably selects the bottom-half form and the final
; %3 operand presumably supplies the inactive lanes - confirm against the
; intrinsic definition.
790 %5 = tail call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %4, <8 x i16> %.splat, i32 0, <4 x i1> %0, <4 x i32> %3)
791 %6 = bitcast i32* %s1.addr.013 to <4 x i32>*
792 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %6, i32 4, <4 x i1> %0)
793 %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
794 %sub = add nsw i32 %N.addr.012, -4
795 %cmp = icmp sgt i32 %N.addr.012, 4
796 br i1 %cmp, label %while.body, label %while.end
798 while.end: ; preds = %while.body, %entry
; vqdmulhq: loop over %n i32 elements, four per iteration. Each iteration
; does a masked load from %x, calls the unpredicated llvm.arm.mve.vqdmulh
; intrinsic against a splat of %z, and does a masked store to %y. The
; expected output uses the scalar-operand instruction
; (vqdmulh.s32 q0, q0, r3) inside a dlstp.32/letp loop.
802 define void @vqdmulhq(i32* %x, i32* %y, i32 %n, i32 %z) {
803 ; CHECK-LABEL: vqdmulhq:
804 ; CHECK: @ %bb.0: @ %entry
805 ; CHECK-NEXT: .save {r7, lr}
806 ; CHECK-NEXT: push {r7, lr}
807 ; CHECK-NEXT: cmp r2, #1
809 ; CHECK-NEXT: poplt {r7, pc}
810 ; CHECK-NEXT: .LBB18_1: @ %for.body.preheader
811 ; CHECK-NEXT: dlstp.32 lr, r2
812 ; CHECK-NEXT: .LBB18_2: @ %for.body
813 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
814 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
815 ; CHECK-NEXT: vqdmulh.s32 q0, q0, r3
816 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
817 ; CHECK-NEXT: letp lr, .LBB18_2
818 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
819 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z into all four lanes; the loop is skipped when %n <= 0.
821 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
822 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
823 %cmp11 = icmp sgt i32 %n, 0
824 br i1 %cmp11, label %for.body, label %for.cond.cleanup
826 for.cond.cleanup: ; preds = %for.body, %entry
829 for.body: ; preds = %entry, %for.body
830 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
831 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
832 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane mask derived from the remaining element count; it guards
; both the load and the store.
833 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
834 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
835 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
836 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
; The intrinsic itself takes no mask; only the memory accesses are masked.
837 %3 = tail call <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32> %2, <4 x i32> %.splat)
838 %4 = bitcast i32* %y.addr.013 to <4 x i32>*
839 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0)
840 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
841 %sub = add nsw i32 %i.012, -4
842 %cmp = icmp sgt i32 %i.012, 4
843 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vqdmulh: in-place loop over %N i32 elements at %s1, four per iteration.
; Each iteration does a masked load, calls llvm.arm.mve.qdmulh.predicated
; against a splat of %c0, and does a masked store back through the same
; pointer. The expected output is vldrw.u32 / vqdmulh.s32 q0, q0, r1 /
; vstrw.32 inside a dlstp.32/letp loop.
846 define void @vqdmulh(i32* %s1, i32 %c0, i32 %N) {
847 ; CHECK-LABEL: vqdmulh:
848 ; CHECK: @ %bb.0: @ %entry
849 ; CHECK-NEXT: .save {r7, lr}
850 ; CHECK-NEXT: push {r7, lr}
851 ; CHECK-NEXT: cmp r2, #1
853 ; CHECK-NEXT: poplt {r7, pc}
854 ; CHECK-NEXT: .LBB19_1: @ %while.body.lr.ph
855 ; CHECK-NEXT: dlstp.32 lr, r2
856 ; CHECK-NEXT: .LBB19_2: @ %while.body
857 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
858 ; CHECK-NEXT: vldrw.u32 q0, [r0]
859 ; CHECK-NEXT: vqdmulh.s32 q0, q0, r1
860 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
861 ; CHECK-NEXT: letp lr, .LBB19_2
862 ; CHECK-NEXT: @ %bb.3: @ %while.end
863 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %N <= 0.
865 %cmp11 = icmp sgt i32 %N, 0
866 br i1 %cmp11, label %while.body.lr.ph, label %while.end
868 while.body.lr.ph: ; preds = %entry
; Broadcast %c0 into all four lanes once, before the loop.
869 %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
870 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
; NOTE(review): no terminator for while.body.lr.ph is visible in this
; listing (@vaddf ends the same block with a br to %while.body) - confirm
; against the full file.
873 while.body: ; preds = %while.body.lr.ph, %while.body
874 %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
875 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane mask derived from the remaining element count; it guards
; the load, the intrinsic and the store.
876 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
877 %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
878 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; NOTE(review): the final %2 operand presumably supplies the value of the
; inactive lanes - confirm against the intrinsic definition.
879 %3 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
880 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
881 %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
882 %sub = add nsw i32 %N.addr.012, -4
883 %cmp = icmp sgt i32 %N.addr.012, 4
884 br i1 %cmp, label %while.body, label %while.end
886 while.end: ; preds = %while.body, %entry
; vqrdmulhq: loop over %n i32 elements, four per iteration. Each iteration
; does a masked load from %x, calls the unpredicated llvm.arm.mve.vqrdmulh
; intrinsic against a splat of %z, and does a masked store to %y. The
; expected output uses the scalar-operand instruction
; (vqrdmulh.s32 q0, q0, r3) inside a dlstp.32/letp loop.
890 define void @vqrdmulhq(i32* %x, i32* %y, i32 %n, i32 %z) {
891 ; CHECK-LABEL: vqrdmulhq:
892 ; CHECK: @ %bb.0: @ %entry
893 ; CHECK-NEXT: .save {r7, lr}
894 ; CHECK-NEXT: push {r7, lr}
895 ; CHECK-NEXT: cmp r2, #1
897 ; CHECK-NEXT: poplt {r7, pc}
898 ; CHECK-NEXT: .LBB20_1: @ %for.body.preheader
899 ; CHECK-NEXT: dlstp.32 lr, r2
900 ; CHECK-NEXT: .LBB20_2: @ %for.body
901 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
902 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
903 ; CHECK-NEXT: vqrdmulh.s32 q0, q0, r3
904 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
905 ; CHECK-NEXT: letp lr, .LBB20_2
906 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
907 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z into all four lanes; the loop is skipped when %n <= 0.
909 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
910 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
911 %cmp11 = icmp sgt i32 %n, 0
912 br i1 %cmp11, label %for.body, label %for.cond.cleanup
914 for.cond.cleanup: ; preds = %for.body, %entry
917 for.body: ; preds = %entry, %for.body
918 %x.addr.014 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
919 %y.addr.013 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
920 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane mask derived from the remaining element count; it guards
; both the load and the store.
921 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
922 %1 = bitcast i32* %x.addr.014 to <4 x i32>*
923 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
924 %add.ptr = getelementptr inbounds i32, i32* %x.addr.014, i32 4
; The intrinsic itself takes no mask; only the memory accesses are masked.
925 %3 = tail call <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32> %2, <4 x i32> %.splat)
926 %4 = bitcast i32* %y.addr.013 to <4 x i32>*
927 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %4, i32 4, <4 x i1> %0)
928 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.013, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
929 %sub = add nsw i32 %i.012, -4
930 %cmp = icmp sgt i32 %i.012, 4
931 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vqrdmulh: in-place loop over %N i32 elements at %s1, four per iteration.
; Each iteration does a masked load, calls llvm.arm.mve.qrdmulh.predicated
; against a splat of %c0, and does a masked store back through the same
; pointer. The expected output is vldrw.u32 / vqrdmulh.s32 q0, q0, r1 /
; vstrw.32 inside a dlstp.32/letp loop.
934 define void @vqrdmulh(i32* %s1, i32 %c0, i32 %N) {
935 ; CHECK-LABEL: vqrdmulh:
936 ; CHECK: @ %bb.0: @ %entry
937 ; CHECK-NEXT: .save {r7, lr}
938 ; CHECK-NEXT: push {r7, lr}
939 ; CHECK-NEXT: cmp r2, #1
941 ; CHECK-NEXT: poplt {r7, pc}
942 ; CHECK-NEXT: .LBB21_1: @ %while.body.lr.ph
943 ; CHECK-NEXT: dlstp.32 lr, r2
944 ; CHECK-NEXT: .LBB21_2: @ %while.body
945 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
946 ; CHECK-NEXT: vldrw.u32 q0, [r0]
947 ; CHECK-NEXT: vqrdmulh.s32 q0, q0, r1
948 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
949 ; CHECK-NEXT: letp lr, .LBB21_2
950 ; CHECK-NEXT: @ %bb.3: @ %while.end
951 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %N <= 0.
953 %cmp11 = icmp sgt i32 %N, 0
954 br i1 %cmp11, label %while.body.lr.ph, label %while.end
956 while.body.lr.ph: ; preds = %entry
; Broadcast %c0 into all four lanes once, before the loop.
957 %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
958 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
; NOTE(review): no terminator for while.body.lr.ph is visible in this
; listing (@vaddf ends the same block with a br to %while.body) - confirm
; against the full file.
961 while.body: ; preds = %while.body.lr.ph, %while.body
962 %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
963 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane mask derived from the remaining element count; it guards
; the load, the intrinsic and the store.
964 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
965 %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
966 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; NOTE(review): the final %2 operand presumably supplies the value of the
; inactive lanes - confirm against the intrinsic definition.
967 %3 = tail call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
968 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
969 %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
970 %sub = add nsw i32 %N.addr.012, -4
971 %cmp = icmp sgt i32 %N.addr.012, 4
972 br i1 %cmp, label %while.body, label %while.end
974 while.end: ; preds = %while.body, %entry
; vmlaq: loop over %n i32 elements, four per iteration. Each iteration
; computes y[i] = y[i] * splat(%z) + x[i] using a plain IR mul followed by
; add, with masked loads and a masked store. The expected output folds the
; mul/add/splat into a single vmla.u32 q1, q0, r3 inside a dlstp.32/letp
; loop, with q0 loaded from [r1] and q1 loaded from [r0].
978 define void @vmlaq(i32* %x, i32* %y, i32 %n, i32 %z) {
979 ; CHECK-LABEL: vmlaq:
980 ; CHECK: @ %bb.0: @ %entry
981 ; CHECK-NEXT: .save {r7, lr}
982 ; CHECK-NEXT: push {r7, lr}
983 ; CHECK-NEXT: cmp r2, #1
985 ; CHECK-NEXT: poplt {r7, pc}
986 ; CHECK-NEXT: .LBB22_1: @ %for.body.preheader
987 ; CHECK-NEXT: dlstp.32 lr, r2
988 ; CHECK-NEXT: .LBB22_2: @ %for.body
989 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
990 ; CHECK-NEXT: vldrw.u32 q0, [r1]
991 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
992 ; CHECK-NEXT: vmla.u32 q1, q0, r3
993 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
994 ; CHECK-NEXT: letp lr, .LBB22_2
995 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
996 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z into all four lanes; the loop is skipped when %n <= 0.
998 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
999 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
1000 %cmp14 = icmp sgt i32 %n, 0
1001 br i1 %cmp14, label %for.body, label %for.cond.cleanup
1003 for.cond.cleanup: ; preds = %for.body, %entry
1006 for.body: ; preds = %entry, %for.body
1007 %x.addr.017 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
1008 %y.addr.016 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
1009 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane mask derived from the remaining element count; it guards
; both loads and the store.
1010 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
1011 %1 = bitcast i32* %x.addr.017 to <4 x i32>*
1012 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
1013 %add.ptr = getelementptr inbounds i32, i32* %x.addr.017, i32 4
1014 %3 = bitcast i32* %y.addr.016 to <4 x i32>*
1015 %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; y * splat(z) + x: the splat is the multiplier and the %x data is the
; addend.
1016 %5 = mul <4 x i32> %4, %.splat
1017 %6 = add <4 x i32> %5, %2
; The result overwrites the %y data that was just loaded.
1018 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %6, <4 x i32>* %3, i32 4, <4 x i1> %0)
1019 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.016, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
1020 %sub = add nsw i32 %i.015, -4
1021 %cmp = icmp sgt i32 %i.015, 4
1022 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vmlaqp: same loop shape as a mul+add accumulate, but it calls
; llvm.arm.mve.vmla.n.predicated with the scalar %z passed directly, so no
; splat is built in IR. Loads and the store are masked by the same vctp32
; predicate. The expected output is vmla.u32 q1, q0, r3 inside a
; dlstp.32/letp loop, with q0 loaded from [r1] and q1 loaded from [r0].
1025 define void @vmlaqp(i32* %x, i32* %y, i32 %n, i32 %z) {
1026 ; CHECK-LABEL: vmlaqp:
1027 ; CHECK: @ %bb.0: @ %entry
1028 ; CHECK-NEXT: .save {r7, lr}
1029 ; CHECK-NEXT: push {r7, lr}
1030 ; CHECK-NEXT: cmp r2, #1
1032 ; CHECK-NEXT: poplt {r7, pc}
1033 ; CHECK-NEXT: .LBB23_1: @ %for.body.preheader
1034 ; CHECK-NEXT: dlstp.32 lr, r2
1035 ; CHECK-NEXT: .LBB23_2: @ %for.body
1036 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1037 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1038 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
1039 ; CHECK-NEXT: vmla.u32 q1, q0, r3
1040 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
1041 ; CHECK-NEXT: letp lr, .LBB23_2
1042 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1043 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n <= 0.
1045 %cmp15 = icmp sgt i32 %n, 0
1046 br i1 %cmp15, label %for.body, label %for.cond.cleanup
1048 for.cond.cleanup: ; preds = %for.body, %entry
1051 for.body: ; preds = %entry, %for.body
1052 %x.addr.018 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
1053 %y.addr.017 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
1054 %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane mask derived from the remaining element count; it guards
; both loads, the intrinsic and the store.
1055 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016)
1056 %1 = bitcast i32* %x.addr.018 to <4 x i32>*
1057 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
1058 %add.ptr = getelementptr inbounds i32, i32* %x.addr.018, i32 4
1059 %3 = bitcast i32* %y.addr.017 to <4 x i32>*
1060 %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Operands are the %x data, the %y data, the scalar %z and the mask, in
; that order. NOTE(review): presumably the first operand is the
; accumulator, consistent with q1 (loaded from [r0]) being the destination
; in the expected output - confirm against the intrinsic definition.
1061 %5 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %4, i32 %z, <4 x i1> %0)
1062 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %3, i32 4, <4 x i1> %0)
1063 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.017, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
1064 %sub = add nsw i32 %i.016, -4
1065 %cmp = icmp sgt i32 %i.016, 4
1066 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vmlasq: loop over %n i32 elements, four per iteration. Each iteration
; computes y[i] = y[i] * x[i] + splat(%z) using a plain IR mul followed by
; add, with masked loads and a masked store. The expected output folds the
; mul/add/splat into a single vmlas.u32 q1, q0, r3 inside a dlstp.32/letp
; loop, with q0 loaded from [r0] and q1 loaded from [r1].
1069 define void @vmlasq(i32* %x, i32* %y, i32 %n, i32 %z) {
1070 ; CHECK-LABEL: vmlasq:
1071 ; CHECK: @ %bb.0: @ %entry
1072 ; CHECK-NEXT: .save {r7, lr}
1073 ; CHECK-NEXT: push {r7, lr}
1074 ; CHECK-NEXT: cmp r2, #1
1076 ; CHECK-NEXT: poplt {r7, pc}
1077 ; CHECK-NEXT: .LBB24_1: @ %for.body.preheader
1078 ; CHECK-NEXT: dlstp.32 lr, r2
1079 ; CHECK-NEXT: .LBB24_2: @ %for.body
1080 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1081 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1082 ; CHECK-NEXT: vldrw.u32 q1, [r1]
1083 ; CHECK-NEXT: vmlas.u32 q1, q0, r3
1084 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
1085 ; CHECK-NEXT: letp lr, .LBB24_2
1086 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1087 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z into all four lanes; the loop is skipped when %n <= 0.
1089 %.splatinsert = insertelement <4 x i32> poison, i32 %z, i32 0
1090 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
1091 %cmp14 = icmp sgt i32 %n, 0
1092 br i1 %cmp14, label %for.body, label %for.cond.cleanup
1094 for.cond.cleanup: ; preds = %for.body, %entry
1097 for.body: ; preds = %entry, %for.body
1098 %x.addr.017 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
1099 %y.addr.016 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
1100 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane mask derived from the remaining element count; it guards
; both loads and the store.
1101 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
1102 %1 = bitcast i32* %x.addr.017 to <4 x i32>*
1103 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
1104 %add.ptr = getelementptr inbounds i32, i32* %x.addr.017, i32 4
1105 %3 = bitcast i32* %y.addr.016 to <4 x i32>*
1106 %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; y * x + splat(z): here the splat is the addend, not the multiplier.
1107 %5 = mul <4 x i32> %4, %2
1108 %6 = add <4 x i32> %5, %.splat
; The result overwrites the %y data that was just loaded.
1109 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %6, <4 x i32>* %3, i32 4, <4 x i1> %0)
1110 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.016, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
1111 %sub = add nsw i32 %i.015, -4
1112 %cmp = icmp sgt i32 %i.015, 4
1113 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vmlasqp: calls llvm.arm.mve.vmlas.n.predicated with the scalar %z passed
; directly, so no splat is built in IR. Loads and the store are masked by
; the same vctp32 predicate. The expected output is vmlas.u32 q1, q0, r3
; inside a dlstp.32/letp loop, with q0 loaded from [r1] and q1 loaded from
; [r0].
1116 define void @vmlasqp(i32* %x, i32* %y, i32 %n, i32 %z) {
1117 ; CHECK-LABEL: vmlasqp:
1118 ; CHECK: @ %bb.0: @ %entry
1119 ; CHECK-NEXT: .save {r7, lr}
1120 ; CHECK-NEXT: push {r7, lr}
1121 ; CHECK-NEXT: cmp r2, #1
1123 ; CHECK-NEXT: poplt {r7, pc}
1124 ; CHECK-NEXT: .LBB25_1: @ %for.body.preheader
1125 ; CHECK-NEXT: dlstp.32 lr, r2
1126 ; CHECK-NEXT: .LBB25_2: @ %for.body
1127 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1128 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1129 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
1130 ; CHECK-NEXT: vmlas.u32 q1, q0, r3
1131 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
1132 ; CHECK-NEXT: letp lr, .LBB25_2
1133 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1134 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n <= 0.
1136 %cmp15 = icmp sgt i32 %n, 0
1137 br i1 %cmp15, label %for.body, label %for.cond.cleanup
1139 for.cond.cleanup: ; preds = %for.body, %entry
1142 for.body: ; preds = %entry, %for.body
1143 %x.addr.018 = phi i32* [ %add.ptr, %for.body ], [ %x, %entry ]
1144 %y.addr.017 = phi i32* [ %add.ptr1, %for.body ], [ %y, %entry ]
1145 %i.016 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane mask derived from the remaining element count; it guards
; both loads, the intrinsic and the store.
1146 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.016)
1147 %1 = bitcast i32* %x.addr.018 to <4 x i32>*
1148 %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
1149 %add.ptr = getelementptr inbounds i32, i32* %x.addr.018, i32 4
1150 %3 = bitcast i32* %y.addr.017 to <4 x i32>*
1151 %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
; Operands are the %x data, the %y data, the scalar %z and the mask, in
; that order. NOTE(review): the exact roles of the two vector operands are
; not visible here - confirm against the intrinsic definition.
1152 %5 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %4, i32 %z, <4 x i1> %0)
1153 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %3, i32 4, <4 x i1> %0)
1154 %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.017, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
1155 %sub = add nsw i32 %i.016, -4
1156 %cmp = icmp sgt i32 %i.016, 4
1157 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vaddqf: float counterpart of the integer add test. Loop over %n float
; elements, four per iteration: masked load from %x, `fadd fast` against a
; splat of %z, masked store to %y. The expected output uses the
; scalar-operand instruction (vadd.f32 q0, q0, r3) inside a dlstp.32/letp
; loop.
1160 define void @vaddqf(float* %x, float* %y, i32 %n, float %z) {
1161 ; CHECK-LABEL: vaddqf:
1162 ; CHECK: @ %bb.0: @ %entry
1163 ; CHECK-NEXT: .save {r7, lr}
1164 ; CHECK-NEXT: push {r7, lr}
1165 ; CHECK-NEXT: cmp r2, #1
1167 ; CHECK-NEXT: poplt {r7, pc}
1168 ; CHECK-NEXT: .LBB26_1: @ %for.body.preheader
1169 ; CHECK-NEXT: dlstp.32 lr, r2
1170 ; CHECK-NEXT: .LBB26_2: @ %for.body
1171 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1172 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1173 ; CHECK-NEXT: vadd.f32 q0, q0, r3
1174 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
1175 ; CHECK-NEXT: letp lr, .LBB26_2
1176 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1177 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z into all four lanes; the loop is skipped when %n <= 0.
1179 %.splatinsert = insertelement <4 x float> poison, float %z, i32 0
1180 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1181 %cmp11 = icmp sgt i32 %n, 0
1182 br i1 %cmp11, label %for.body, label %for.cond.cleanup
1184 for.cond.cleanup: ; preds = %for.body, %entry
1187 for.body: ; preds = %entry, %for.body
1188 %x.addr.014 = phi float* [ %add.ptr, %for.body ], [ %x, %entry ]
1189 %y.addr.013 = phi float* [ %add.ptr1, %for.body ], [ %y, %entry ]
1190 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane mask derived from the remaining element count; it guards
; both the load and the store.
1191 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
1192 %1 = bitcast float* %x.addr.014 to <4 x float>*
1193 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1194 %add.ptr = getelementptr inbounds float, float* %x.addr.014, i32 4
; Plain IR fadd on all lanes; only the memory accesses are masked.
1195 %3 = fadd fast <4 x float> %2, %.splat
1196 %4 = bitcast float* %y.addr.013 to <4 x float>*
1197 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %4, i32 4, <4 x i1> %0)
1198 %add.ptr1 = getelementptr inbounds float, float* %y.addr.013, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
1199 %sub = add nsw i32 %i.012, -4
1200 %cmp = icmp sgt i32 %i.012, 4
1201 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vaddf: in-place loop over %N float elements at %s1, four per iteration.
; Each iteration does a masked load, calls llvm.arm.mve.add.predicated
; against a splat of %c0, and does a masked store back through the same
; pointer. The expected output is vldrw.u32 / vadd.f32 q0, q0, r1 /
; vstrw.32 inside a dlstp.32/letp loop.
1204 define void @vaddf(float* %s1, float %c0, i32 %N) {
1205 ; CHECK-LABEL: vaddf:
1206 ; CHECK: @ %bb.0: @ %entry
1207 ; CHECK-NEXT: .save {r7, lr}
1208 ; CHECK-NEXT: push {r7, lr}
1209 ; CHECK-NEXT: cmp r2, #1
1211 ; CHECK-NEXT: poplt {r7, pc}
1212 ; CHECK-NEXT: .LBB27_1: @ %while.body.lr.ph
1213 ; CHECK-NEXT: dlstp.32 lr, r2
1214 ; CHECK-NEXT: .LBB27_2: @ %while.body
1215 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1216 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1217 ; CHECK-NEXT: vadd.f32 q0, q0, r1
1218 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
1219 ; CHECK-NEXT: letp lr, .LBB27_2
1220 ; CHECK-NEXT: @ %bb.3: @ %while.end
1221 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %N <= 0.
1223 %cmp11 = icmp sgt i32 %N, 0
1224 br i1 %cmp11, label %while.body.lr.ph, label %while.end
1226 while.body.lr.ph: ; preds = %entry
; Broadcast %c0 into all four lanes once, before the loop.
1227 %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
1228 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
1229 br label %while.body
1231 while.body: ; preds = %while.body.lr.ph, %while.body
1232 %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1233 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane mask derived from the remaining element count; it guards
; the load, the intrinsic and the store.
1234 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
1235 %1 = bitcast float* %s1.addr.013 to <4 x float>*
1236 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
; NOTE(review): the final %2 operand presumably supplies the value of the
; inactive lanes - confirm against the intrinsic definition.
1237 %3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
1238 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
1239 %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
1240 %sub = add nsw i32 %N.addr.012, -4
1241 %cmp = icmp sgt i32 %N.addr.012, 4
1242 br i1 %cmp, label %while.body, label %while.end
1244 while.end: ; preds = %while.body, %entry
; vsubqf: loop over %n float elements, four per iteration: masked load from
; %x, `fsub fast` of a splat of %z from the loaded data, masked store to %y.
; The expected output uses the scalar-operand instruction
; (vsub.f32 q0, q0, r3) inside a dlstp.32/letp loop.
1248 define void @vsubqf(float* %x, float* %y, i32 %n, float %z) {
1249 ; CHECK-LABEL: vsubqf:
1250 ; CHECK: @ %bb.0: @ %entry
1251 ; CHECK-NEXT: .save {r7, lr}
1252 ; CHECK-NEXT: push {r7, lr}
1253 ; CHECK-NEXT: cmp r2, #1
1255 ; CHECK-NEXT: poplt {r7, pc}
1256 ; CHECK-NEXT: .LBB28_1: @ %for.body.preheader
1257 ; CHECK-NEXT: dlstp.32 lr, r2
1258 ; CHECK-NEXT: .LBB28_2: @ %for.body
1259 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1260 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1261 ; CHECK-NEXT: vsub.f32 q0, q0, r3
1262 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
1263 ; CHECK-NEXT: letp lr, .LBB28_2
1264 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1265 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z into all four lanes; the loop is skipped when %n <= 0.
1267 %.splatinsert = insertelement <4 x float> poison, float %z, i32 0
1268 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1269 %cmp11 = icmp sgt i32 %n, 0
1270 br i1 %cmp11, label %for.body, label %for.cond.cleanup
1272 for.cond.cleanup: ; preds = %for.body, %entry
1275 for.body: ; preds = %entry, %for.body
1276 %x.addr.014 = phi float* [ %add.ptr, %for.body ], [ %x, %entry ]
1277 %y.addr.013 = phi float* [ %add.ptr1, %for.body ], [ %y, %entry ]
1278 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane mask derived from the remaining element count; it guards
; both the load and the store.
1279 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
1280 %1 = bitcast float* %x.addr.014 to <4 x float>*
1281 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1282 %add.ptr = getelementptr inbounds float, float* %x.addr.014, i32 4
; Operand order matters for subtraction: loaded data minus the splat.
1283 %3 = fsub fast <4 x float> %2, %.splat
1284 %4 = bitcast float* %y.addr.013 to <4 x float>*
1285 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %4, i32 4, <4 x i1> %0)
1286 %add.ptr1 = getelementptr inbounds float, float* %y.addr.013, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
1287 %sub = add nsw i32 %i.012, -4
1288 %cmp = icmp sgt i32 %i.012, 4
1289 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vsubf: in-place loop over %N float elements at %s1, four per iteration.
; Each iteration does a masked load, calls llvm.arm.mve.sub.predicated with
; the loaded data and a splat of %c0, and does a masked store back through
; the same pointer. The expected output is vldrw.u32 / vsub.f32 q0, q0, r1 /
; vstrw.32 inside a dlstp.32/letp loop.
1292 define void @vsubf(float* %s1, float %c0, i32 %N) {
1293 ; CHECK-LABEL: vsubf:
1294 ; CHECK: @ %bb.0: @ %entry
1295 ; CHECK-NEXT: .save {r7, lr}
1296 ; CHECK-NEXT: push {r7, lr}
1297 ; CHECK-NEXT: cmp r2, #1
1299 ; CHECK-NEXT: poplt {r7, pc}
1300 ; CHECK-NEXT: .LBB29_1: @ %while.body.lr.ph
1301 ; CHECK-NEXT: dlstp.32 lr, r2
1302 ; CHECK-NEXT: .LBB29_2: @ %while.body
1303 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1304 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1305 ; CHECK-NEXT: vsub.f32 q0, q0, r1
1306 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
1307 ; CHECK-NEXT: letp lr, .LBB29_2
1308 ; CHECK-NEXT: @ %bb.3: @ %while.end
1309 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %N <= 0.
1311 %cmp11 = icmp sgt i32 %N, 0
1312 br i1 %cmp11, label %while.body.lr.ph, label %while.end
1314 while.body.lr.ph: ; preds = %entry
; Broadcast %c0 into all four lanes once, before the loop.
1315 %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
1316 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
1317 br label %while.body
1319 while.body: ; preds = %while.body.lr.ph, %while.body
1320 %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1321 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane mask derived from the remaining element count; it guards
; the load, the intrinsic and the store.
1322 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
1323 %1 = bitcast float* %s1.addr.013 to <4 x float>*
1324 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
; NOTE(review): the final %2 operand presumably supplies the value of the
; inactive lanes - confirm against the intrinsic definition.
1325 %3 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
1326 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
1327 %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
1328 %sub = add nsw i32 %N.addr.012, -4
1329 %cmp = icmp sgt i32 %N.addr.012, 4
1330 br i1 %cmp, label %while.body, label %while.end
1332 while.end: ; preds = %while.body, %entry
; vmulqf: loop over %n float elements, four per iteration: masked load from
; %x, `fmul fast` against a splat of %z, masked store to %y. The expected
; output uses the scalar-operand instruction (vmul.f32 q0, q0, r3) inside a
; dlstp.32/letp loop.
1336 define void @vmulqf(float* %x, float* %y, i32 %n, float %z) {
1337 ; CHECK-LABEL: vmulqf:
1338 ; CHECK: @ %bb.0: @ %entry
1339 ; CHECK-NEXT: .save {r7, lr}
1340 ; CHECK-NEXT: push {r7, lr}
1341 ; CHECK-NEXT: cmp r2, #1
1343 ; CHECK-NEXT: poplt {r7, pc}
1344 ; CHECK-NEXT: .LBB30_1: @ %for.body.preheader
1345 ; CHECK-NEXT: dlstp.32 lr, r2
1346 ; CHECK-NEXT: .LBB30_2: @ %for.body
1347 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1348 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1349 ; CHECK-NEXT: vmul.f32 q0, q0, r3
1350 ; CHECK-NEXT: vstrw.32 q0, [r1], #16
1351 ; CHECK-NEXT: letp lr, .LBB30_2
1352 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1353 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z into all four lanes; the loop is skipped when %n <= 0.
1355 %.splatinsert = insertelement <4 x float> poison, float %z, i32 0
1356 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1357 %cmp11 = icmp sgt i32 %n, 0
1358 br i1 %cmp11, label %for.body, label %for.cond.cleanup
1360 for.cond.cleanup: ; preds = %for.body, %entry
1363 for.body: ; preds = %entry, %for.body
1364 %x.addr.014 = phi float* [ %add.ptr, %for.body ], [ %x, %entry ]
1365 %y.addr.013 = phi float* [ %add.ptr1, %for.body ], [ %y, %entry ]
1366 %i.012 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane mask derived from the remaining element count; it guards
; both the load and the store.
1367 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.012)
1368 %1 = bitcast float* %x.addr.014 to <4 x float>*
1369 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1370 %add.ptr = getelementptr inbounds float, float* %x.addr.014, i32 4
; Plain IR fmul on all lanes; only the memory accesses are masked.
1371 %3 = fmul fast <4 x float> %2, %.splat
1372 %4 = bitcast float* %y.addr.013 to <4 x float>*
1373 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %4, i32 4, <4 x i1> %0)
1374 %add.ptr1 = getelementptr inbounds float, float* %y.addr.013, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
1375 %sub = add nsw i32 %i.012, -4
1376 %cmp = icmp sgt i32 %i.012, 4
1377 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vmulf: in-place loop over %N float elements at %s1, four per iteration.
; Each iteration does a masked load, calls llvm.arm.mve.mul.predicated
; against a splat of %c0, and does a masked store back through the same
; pointer. The expected output is vldrw.u32 / vmul.f32 q0, q0, r1 /
; vstrw.32 inside a dlstp.32/letp loop.
1380 define void @vmulf(float* %s1, float %c0, i32 %N) {
1381 ; CHECK-LABEL: vmulf:
1382 ; CHECK: @ %bb.0: @ %entry
1383 ; CHECK-NEXT: .save {r7, lr}
1384 ; CHECK-NEXT: push {r7, lr}
1385 ; CHECK-NEXT: cmp r2, #1
1387 ; CHECK-NEXT: poplt {r7, pc}
1388 ; CHECK-NEXT: .LBB31_1: @ %while.body.lr.ph
1389 ; CHECK-NEXT: dlstp.32 lr, r2
1390 ; CHECK-NEXT: .LBB31_2: @ %while.body
1391 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1392 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1393 ; CHECK-NEXT: vmul.f32 q0, q0, r1
1394 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
1395 ; CHECK-NEXT: letp lr, .LBB31_2
1396 ; CHECK-NEXT: @ %bb.3: @ %while.end
1397 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %N <= 0.
1399 %cmp11 = icmp sgt i32 %N, 0
1400 br i1 %cmp11, label %while.body.lr.ph, label %while.end
1402 while.body.lr.ph: ; preds = %entry
; Broadcast %c0 into all four lanes once, before the loop.
1403 %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
1404 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
1405 br label %while.body
1407 while.body: ; preds = %while.body.lr.ph, %while.body
1408 %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1409 %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; %0 is the lane mask derived from the remaining element count; it guards
; the load, the intrinsic and the store.
1410 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
1411 %1 = bitcast float* %s1.addr.013 to <4 x float>*
1412 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
; NOTE(review): the final %2 operand presumably supplies the value of the
; inactive lanes - confirm against the intrinsic definition.
1413 %3 = tail call fast <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
1414 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
1415 %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
1416 %sub = add nsw i32 %N.addr.012, -4
1417 %cmp = icmp sgt i32 %N.addr.012, 4
1418 br i1 %cmp, label %while.body, label %while.end
1420 while.end: ; preds = %while.body, %entry
; vfmaq: loop over %n float elements, four per iteration. Each iteration
; computes y[i] = fma(y[i], splat(%z), x[i]) through the generic
; llvm.fma.v4f32 intrinsic, with masked loads and a masked store. The
; expected output uses the scalar-operand instruction
; (vfma.f32 q1, q0, r3) inside a dlstp.32/letp loop, with q0 loaded from
; [r1] and q1 loaded from [r0].
1424 define void @vfmaq(float* %x, float* %y, i32 %n, float %z) {
1425 ; CHECK-LABEL: vfmaq:
1426 ; CHECK: @ %bb.0: @ %entry
1427 ; CHECK-NEXT: .save {r7, lr}
1428 ; CHECK-NEXT: push {r7, lr}
1429 ; CHECK-NEXT: cmp r2, #1
1431 ; CHECK-NEXT: poplt {r7, pc}
1432 ; CHECK-NEXT: .LBB32_1: @ %for.body.preheader
1433 ; CHECK-NEXT: dlstp.32 lr, r2
1434 ; CHECK-NEXT: .LBB32_2: @ %for.body
1435 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1436 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1437 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
1438 ; CHECK-NEXT: vfma.f32 q1, q0, r3
1439 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
1440 ; CHECK-NEXT: letp lr, .LBB32_2
1441 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1442 ; CHECK-NEXT: pop {r7, pc}
; Broadcast %z into all four lanes; the loop is skipped when %n <= 0.
1444 %.splatinsert = insertelement <4 x float> poison, float %z, i32 0
1445 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1446 %cmp14 = icmp sgt i32 %n, 0
1447 br i1 %cmp14, label %for.body, label %for.cond.cleanup
1449 for.cond.cleanup: ; preds = %for.body, %entry
1452 for.body: ; preds = %entry, %for.body
1453 %x.addr.017 = phi float* [ %add.ptr, %for.body ], [ %x, %entry ]
1454 %y.addr.016 = phi float* [ %add.ptr1, %for.body ], [ %y, %entry ]
1455 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; %0 is the lane mask derived from the remaining element count; it guards
; both loads and the store.
1456 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
1457 %1 = bitcast float* %x.addr.017 to <4 x float>*
1458 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1459 %add.ptr = getelementptr inbounds float, float* %x.addr.017, i32 4
1460 %3 = bitcast float* %y.addr.016 to <4 x float>*
1461 %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %3, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
; fma(y, splat(z), x): the splat is a multiplicand and the %x data is the
; addend.
1462 %5 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %4, <4 x float> %.splat, <4 x float> %2)
; The result overwrites the %y data that was just loaded.
1463 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %3, i32 4, <4 x i1> %0)
1464 %add.ptr1 = getelementptr inbounds float, float* %y.addr.016, i32 4
; Count down by four; keep looping only while more than four elements
; were left at the start of this iteration.
1465 %sub = add nsw i32 %i.015, -4
1466 %cmp = icmp sgt i32 %i.015, 4
1467 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vfma: loop over %N floats, four lanes per iteration, using the predicated MVE
; fma intrinsic with a splatted scalar operand.
;   %s1 - pointer that is loaded from, stored back to, and advanced by 4 floats
;         per iteration.
;   %s2 - pointer that is loaded from on every iteration but never advanced
;         (its bitcast lives in the preheader).
;   %c0 - scalar that is splatted across all four lanes.
;   %N  - element count; the loop is skipped entirely when %N <= 0.
; The autogenerated assertions below expect a dlstp.32/letp loop whose body is
; two vector loads, `vfma.f32 q1, q0, r2` (vector-by-scalar form) and one
; post-incrementing store.
1470 define void @vfma(float* %s1, float* %s2, float %c0, i32 %N) {
1471 ; CHECK-LABEL: vfma:
1472 ; CHECK: @ %bb.0: @ %entry
1473 ; CHECK-NEXT: .save {r7, lr}
1474 ; CHECK-NEXT: push {r7, lr}
1475 ; CHECK-NEXT: cmp r3, #1
1477 ; CHECK-NEXT: poplt {r7, pc}
1478 ; CHECK-NEXT: .LBB33_1: @ %while.body.lr.ph
1479 ; CHECK-NEXT: dlstp.32 lr, r3
1480 ; CHECK-NEXT: .LBB33_2: @ %while.body
1481 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1482 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1483 ; CHECK-NEXT: vldrw.u32 q1, [r0]
1484 ; CHECK-NEXT: vfma.f32 q1, q0, r2
1485 ; CHECK-NEXT: vstrw.32 q1, [r0], #16
1486 ; CHECK-NEXT: letp lr, .LBB33_2
1487 ; CHECK-NEXT: @ %bb.3: @ %while.end
1488 ; CHECK-NEXT: pop {r7, pc}
; Guard: only enter the loop when there is at least one element to process.
1490 %cmp12 = icmp sgt i32 %N, 0
1491 br i1 %cmp12, label %while.body.lr.ph, label %while.end
1493 while.body.lr.ph: ; preds = %entry
1494 %0 = bitcast float* %s2 to <4 x float>*
; Splat %c0 into all four lanes: insert into lane 0, then shuffle with an
; all-zero mask. The placeholder operands are poison, the same splat idiom the
; vaddq and vfmasq tests in this file use; no placeholder lane reaches the
; result because every output lane selects lane 0.
1495 %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0
1496 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1497 br label %while.body
1499 while.body: ; preds = %while.body.lr.ph, %while.body
1500 %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1501 %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate derived from the number of elements still to process.
1502 %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
1503 %2 = bitcast float* %s1.addr.014 to <4 x float>*
; Masked loads: %3 from the current %s1 position, %4 from the fixed %s2 address.
1504 %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
1505 %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
; Operand order is (s2 vector, splat, s1 vector, predicate); presumably this is
; s2 * c0 + s1 in the active lanes, which matches the expected
; `vfma.f32 q1, q0, r2` - confirm against the intrinsic definition.
1506 %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %4, <4 x float> %.splat, <4 x float> %3, <4 x i1> %1)
1507 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1)
1508 %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4
1509 %sub = add nsw i32 %N.addr.013, -4
; The exit test uses the pre-decrement count: iterate again only if more than
; four elements were outstanding at the start of this iteration.
1510 %cmp = icmp sgt i32 %N.addr.013, 4
1511 br i1 %cmp, label %while.body, label %while.end
1513 while.end: ; preds = %while.body, %entry
; vfmasq: loop over %n floats, four lanes per iteration, using the generic
; llvm.fma intrinsic with the splatted scalar as the addend.
;   %x - first multiplicand pointer, advanced by 4 floats per iteration.
;   %y - second multiplicand pointer; the result is stored back through it and
;        it is advanced by 4 floats per iteration.
;   %n - element count; the loop is skipped entirely when %n <= 0.
;   %z - scalar that is splatted across all four lanes and used as the addend.
; The autogenerated assertions below expect a dlstp.32/letp loop whose body is
; two vector loads, `vfmas.f32 q1, q0, r3` (vector-by-scalar form) and one
; post-incrementing store.
1517 define void @vfmasq(float* %x, float* %y, i32 %n, float %z) {
1518 ; CHECK-LABEL: vfmasq:
1519 ; CHECK: @ %bb.0: @ %entry
1520 ; CHECK-NEXT: .save {r7, lr}
1521 ; CHECK-NEXT: push {r7, lr}
1522 ; CHECK-NEXT: cmp r2, #1
1524 ; CHECK-NEXT: poplt {r7, pc}
1525 ; CHECK-NEXT: .LBB34_1: @ %for.body.preheader
1526 ; CHECK-NEXT: dlstp.32 lr, r2
1527 ; CHECK-NEXT: .LBB34_2: @ %for.body
1528 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1529 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1530 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
1531 ; CHECK-NEXT: vfmas.f32 q1, q0, r3
1532 ; CHECK-NEXT: vstrw.32 q1, [r1], #16
1533 ; CHECK-NEXT: letp lr, .LBB34_2
1534 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1535 ; CHECK-NEXT: pop {r7, pc}
; Splat %z into all four lanes: insert into lane 0, then shuffle with an
; all-zero mask so every result lane selects lane 0.
1537 %.splatinsert = insertelement <4 x float> poison, float %z, i32 0
1538 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
; Guard: only enter the loop when there is at least one element to process.
1539 %cmp14 = icmp sgt i32 %n, 0
1540 br i1 %cmp14, label %for.body, label %for.cond.cleanup
1542 for.cond.cleanup: ; preds = %for.body, %entry
1545 for.body: ; preds = %entry, %for.body
1546 %x.addr.017 = phi float* [ %add.ptr, %for.body ], [ %x, %entry ]
1547 %y.addr.016 = phi float* [ %add.ptr1, %for.body ], [ %y, %entry ]
1548 %i.015 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
; Lane predicate derived from the number of elements still to process.
1549 %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %i.015)
1550 %1 = bitcast float* %x.addr.017 to <4 x float>*
; Masked loads of the current x (%2) and y (%4) vectors under the predicate.
1551 %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
1552 %add.ptr = getelementptr inbounds float, float* %x.addr.017, i32 4
1553 %3 = bitcast float* %y.addr.016 to <4 x float>*
1554 %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %3, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
; Unpredicated fma: x * y + splat(z). Only the store below is masked.
1555 %5 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %2, <4 x float> %4, <4 x float> %.splat)
1556 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %3, i32 4, <4 x i1> %0)
1557 %add.ptr1 = getelementptr inbounds float, float* %y.addr.016, i32 4
1558 %sub = add nsw i32 %i.015, -4
; The exit test uses the pre-decrement count: iterate again only if more than
; four elements were outstanding at the start of this iteration.
1559 %cmp = icmp sgt i32 %i.015, 4
1560 br i1 %cmp, label %for.body, label %for.cond.cleanup
; vfmas: loop over %N floats, four lanes per iteration, using the predicated MVE
; fma intrinsic with the splatted scalar as its third operand.
;   %s1 - pointer that is loaded from, stored back to, and advanced by 4 floats
;         per iteration.
;   %s2 - pointer that is loaded from on every iteration but never advanced
;         (its bitcast lives in the preheader).
;   %c0 - scalar that is splatted across all four lanes.
;   %N  - element count; the loop is skipped entirely when %N <= 0.
; The autogenerated assertions below expect a dlstp.32/letp loop whose body is
; two vector loads, `vfmas.f32 q1, q0, r2` (vector-by-scalar form) and one
; post-incrementing store.
1563 define void @vfmas(float* %s1, float* %s2, float %c0, i32 %N) {
1564 ; CHECK-LABEL: vfmas:
1565 ; CHECK: @ %bb.0: @ %entry
1566 ; CHECK-NEXT: .save {r7, lr}
1567 ; CHECK-NEXT: push {r7, lr}
1568 ; CHECK-NEXT: cmp r3, #1
1570 ; CHECK-NEXT: poplt {r7, pc}
1571 ; CHECK-NEXT: .LBB35_1: @ %while.body.lr.ph
1572 ; CHECK-NEXT: dlstp.32 lr, r3
1573 ; CHECK-NEXT: .LBB35_2: @ %while.body
1574 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1575 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1576 ; CHECK-NEXT: vldrw.u32 q1, [r0]
1577 ; CHECK-NEXT: vfmas.f32 q1, q0, r2
1578 ; CHECK-NEXT: vstrw.32 q1, [r0], #16
1579 ; CHECK-NEXT: letp lr, .LBB35_2
1580 ; CHECK-NEXT: @ %bb.3: @ %while.end
1581 ; CHECK-NEXT: pop {r7, pc}
; Guard: only enter the loop when there is at least one element to process.
1583 %cmp12 = icmp sgt i32 %N, 0
1584 br i1 %cmp12, label %while.body.lr.ph, label %while.end
1586 while.body.lr.ph: ; preds = %entry
1587 %0 = bitcast float* %s2 to <4 x float>*
; Splat %c0 into all four lanes: insert into lane 0, then shuffle with an
; all-zero mask. The placeholder operands are poison, the same splat idiom the
; vaddq and vfmasq tests in this file use; no placeholder lane reaches the
; result because every output lane selects lane 0.
1588 %.splatinsert = insertelement <4 x float> poison, float %c0, i32 0
1589 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1590 br label %while.body
1592 while.body: ; preds = %while.body.lr.ph, %while.body
1593 %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1594 %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
; Lane predicate derived from the number of elements still to process.
1595 %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
1596 %2 = bitcast float* %s1.addr.014 to <4 x float>*
; Masked loads: %3 from the current %s1 position, %4 from the fixed %s2 address.
1597 %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
1598 %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
; Operand order is (s1 vector, s2 vector, splat, predicate); presumably this is
; s1 * s2 + c0 in the active lanes, which matches the expected
; `vfmas.f32 q1, q0, r2` - confirm against the intrinsic definition.
1599 %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %3, <4 x float> %4, <4 x float> %.splat, <4 x i1> %1)
1600 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1)
1601 %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4
1602 %sub = add nsw i32 %N.addr.013, -4
; The exit test uses the pre-decrement count: iterate again only if more than
; four elements were outstanding at the start of this iteration.
1603 %cmp = icmp sgt i32 %N.addr.013, 4
1604 br i1 %cmp, label %while.body, label %while.end
1606 while.end: ; preds = %while.body, %entry
; Intrinsic declarations. Only some of these are called in the portion of the
; file visible alongside them (vctp32, the v4i32/v4f32 masked load/store,
; fma.predicated.v4f32 and llvm.fma.v4f32); the rest are presumably used by
; test functions earlier in the file.

; Lane-predicate generation and masked memory access.
1610 declare <4 x i1> @llvm.arm.mve.vctp32(i32)
1611 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
1612 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
1613 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
1614 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
1615 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
; Integer (v4i32) arithmetic intrinsics: predicated MVE forms alongside the
; unpredicated MVE and generic saturating forms.
1617 declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
1618 declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
1619 declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
1620 declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
1621 declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
1622 declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>)
1623 declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
1624 declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
1625 declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
1626 declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
1627 declare <4 x i32> @llvm.arm.mve.vhadd.v4i32(<4 x i32>, <4 x i32>, i32)
1628 declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
1629 declare <4 x i32> @llvm.arm.mve.vhsub.v4i32(<4 x i32>, <4 x i32>, i32)
; NOTE(review): the declaration below references attribute group #1, whose
; definition is not visible in this section of the file - confirm it exists or
; drop the reference.
1630 declare <2 x i64> @llvm.arm.mve.vqdmull.v2i64.v4i32(<4 x i32>, <4 x i32>, i32) #1
1631 declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>)
1632 declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
1633 declare <4 x i32> @llvm.arm.mve.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
1634 declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
1635 declare <4 x i32> @llvm.arm.mve.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
1636 declare <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
1637 declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
; Floating-point (v4f32) arithmetic intrinsics: predicated MVE forms and the
; generic fma.
1638 declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
1639 declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
1640 declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
1641 declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)
1642 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)