; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
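
; Reduction loop: each vector iteration accumulates a[] * select(%cmp, b[], c[]),
; where %cmp tests whether the (uniform) vector index is a multiple of 16. The
; checks expect a low-overhead loop (le) that keeps an explicit vctp.32 to
; predicate the loads, decrements the element count by 4 per iteration, and
; copies the count out so the final vctp.32 + vaddv.u32 can predicate the
; reduction tail after the loop.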
; CHECK-LABEL: vpsel_mul_reduce_add
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
; CHECK: sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
; CHECK: le lr, [[LOOP]]
; CHECK: vctp.32 [[ELEMS_OUT]]
; CHECK-NEXT: vaddv.u32
define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %rem = urem i32 %index, 16
  %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
  %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
  %wide.masked.load = select <4 x i1> %cmp, <4 x i32> %wide.masked.load.b, <4 x i32> %wide.masked.load.c
  %mul = mul nsw <4 x i32> %wide.masked.load, %wide.masked.load.a
  %add = add nsw <4 x i32> %mul, %vec.phi
  %index.next = add i32 %index, 4
  %tmp7 = icmp eq i32 %index.next, %n.vec
  br i1 %tmp7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
  %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp9, %middle.block ]
  ret i32 %res.0.lcssa
}
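
; Same as vpsel_mul_reduce_add, but the select chooses between c[] - d[] and
; b[] before the multiply-accumulate, so an extra predicated load is expected
; inside the loop while the overall low-overhead loop and explicit vctp.32
; structure stays the same.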
; CHECK-LABEL: vpsel_mul_reduce_add_2
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
; CHECK: sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
; CHECK: le lr, [[LOOP]]
; CHECK: vctp.32 [[ELEMS_OUT]]
; CHECK-NEXT: vaddv.u32
define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
                                             i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %sub = sub <4 x i32> %wide.masked.load.c, %wide.masked.load.d
  %rem = urem i32 %index, 16
  %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
  %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
  %sel = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %wide.masked.load.b
  %mul = mul <4 x i32> %sel, %wide.masked.load.a
  %add = add <4 x i32> %mul, %vec.phi
  %index.next = add i32 %index, 4
  %cmp.exit = icmp eq i32 %index.next, %n.vec
  br i1 %cmp.exit, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
  ret i32 %res.0.lcssa
}
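
; The mask for the c[] and d[] loads is the AND of the loop's tail predicate
; (%tmp1) with a compare of the first two loads (a[] - b[] == 0). The checks
; expect the compare to be emitted as a predicated vcmpt.i32 under the
; vctp-generated predicate, so the two later loads are guarded by the combined
; mask.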
; CHECK-LABEL: and_mul_reduce_add
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
; CHECK-NEXT: vcmpt.i32 eq, {{.*}}, zr
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
; CHECK: sub{{.*}} [[ELEMS]],{{.*}}#4
; CHECK: le lr, [[LOOP]]
; CHECK: vctp.32 [[ELEMS_OUT]]
define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
                                         i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
  %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
  %mask = and <4 x i1> %cmp, %tmp1
  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d
  %add = add <4 x i32> %mul, %vec.phi
  %index.next = add i32 %index, 4
  %cmp.exit = icmp eq i32 %index.next, %n.vec
  br i1 %cmp.exit, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
  ret i32 %res.0.lcssa
}
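
; Same as and_mul_reduce_add, but the compare result is OR'd with the tail
; predicate. The checks expect the vctp result to be spilled and reloaded around
; the compare, both predicates moved to GPRs with vmrs, combined with orr, and
; moved back with vmsr before the second pair of predicated loads.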
; TODO: Why does p0 get reloaded from the stack into p0, just to be vmrs'd?
; CHECK-LABEL: or_mul_reduce_add
; CHECK: [[LOOP:.LBB[0-9_]+]]:
; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
; CHECK: vstr p0, [sp
; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK: vcmp.i32 eq, {{.*}}, zr
; CHECK: vmrs [[VCMP:r[0-9]+]], p0
; CHECK: vldr p0, [sp
; CHECK: vmrs [[VCTP:r[0-9]+]], p0
; CHECK: orr{{.*}} [[VCMP]], [[VCTP]]
; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], [[ELEMS_OUT]], #4
; CHECK-NEXT: vmsr p0
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
; CHECK: le lr, [[LOOP]]
; CHECK: vctp.32 [[ELEMS_OUT]]
define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
                                        i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
  %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
  %mask = or <4 x i1> %cmp, %tmp1
  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d
  %add = add <4 x i32> %mul, %vec.phi
  %index.next = add i32 %index, 4
  %cmp.exit = icmp eq i32 %index.next, %n.vec
  br i1 %cmp.exit, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
  ret i32 %res.0.lcssa
}

; Function Attrs: argmemonly nounwind readonly willreturn
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)

; Function Attrs: nounwind readnone willreturn
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)