1 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
3 ; TODO: The unrolled pattern is preventing the transform
4 ; CHECK-LABEL: mul_v16i8_unroll
5 ; CHECK-NOT: @llvm.arm.mve.vctp
; Test input: multiplies <16 x i8> vectors loaded from %a and %b and stores the
; products to %c, using masked loads/stores. The vector loop body appears twice
; per iteration (the ".1" copies) and once more as a straight-line ".epil" copy;
; this is the unrolled shape the TODO at the top of the file refers to.
6 define void @mul_v16i8_unroll(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) {
; %cmp8 is true when N == 0 and sends control straight to for.cond.cleanup.
8 %cmp8 = icmp eq i32 %N, 0
; %tmp13 = ((((%tmp8 >> 4) << 4) - 16) >> 4) + 1; it is the value passed to
; llvm.start.loop.iterations below.
; NOTE(review): %tmp8 is used here but its definition is not visible in this listing.
10 %tmp9 = lshr i32 %tmp8, 4
11 %tmp10 = shl nuw i32 %tmp9, 4
12 %tmp11 = add i32 %tmp10, -16
13 %tmp12 = lshr i32 %tmp11, 4
14 %tmp13 = add nuw nsw i32 %tmp12, 1
15 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
; Splat (N - 1) across 16 lanes; every lane predicate below compares against it.
17 vector.ph: ; preds = %entry
18 %trip.count.minus.1 = add i32 %N, -1
19 %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
20 %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
; %xtraiter is the low bit of %tmp13; it later selects whether the ".epil" copy runs.
21 %xtraiter = and i32 %tmp13, 1
; When %tmp12 is 0 the two-copy loop is skipped entirely.
22 %0 = icmp ult i32 %tmp12, 1
23 br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label %vector.ph.new
; Preheader of the two-copy loop. %unroll_iter = %tmp13 with its low bit removed.
; NOTE(review): %start is not referenced by any instruction visible in this
; listing, and no terminator for this block is visible either.
25 vector.ph.new: ; preds = %vector.ph
26 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
27 %unroll_iter = sub i32 %tmp13, %xtraiter
; Two-copy loop body. %index advances by 32 per trip (16 per copy); %niter
; counts down from %unroll_iter by 2 per trip.
30 vector.body: ; preds = %vector.body, %vector.ph.new
31 %index = phi i32 [ 0, %vector.ph.new ], [ %index.next.1, %vector.body ]
32 %niter = phi i32 [ %unroll_iter, %vector.ph.new ], [ %niter.nsub.1, %vector.body ]
; First copy: lane i is active when (%index + i) <= (N - 1), unsigned.
33 %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
34 %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
35 %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
36 %tmp = getelementptr inbounds i8, i8* %a, i32 %index
37 %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
38 %tmp2 = bitcast i8* %tmp to <16 x i8>*
39 %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
40 %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
41 %tmp4 = bitcast i8* %tmp3 to <16 x i8>*
42 %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
43 %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
44 %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index
45 %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
46 tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %tmp1)
47 %index.next = add nuw nsw i32 %index, 16
; The first decrement of the counter is a plain sub, not the loop.decrement intrinsic.
48 %niter.nsub = sub i32 %niter, 1
; Second copy: same computation at %index.next, with its own predicate %tmp1.1.
49 %broadcast.splatinsert.1 = insertelement <16 x i32> undef, i32 %index.next, i32 0
50 %broadcast.splat.1 = shufflevector <16 x i32> %broadcast.splatinsert.1, <16 x i32> undef, <16 x i32> zeroinitializer
51 %induction.1 = add <16 x i32> %broadcast.splat.1, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
52 %tmp.1 = getelementptr inbounds i8, i8* %a, i32 %index.next
53 %tmp1.1 = icmp ule <16 x i32> %induction.1, %broadcast.splat11
54 %tmp2.1 = bitcast i8* %tmp.1 to <16 x i8>*
55 %wide.masked.load.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
56 %tmp3.1 = getelementptr inbounds i8, i8* %b, i32 %index.next
57 %tmp4.1 = bitcast i8* %tmp3.1 to <16 x i8>*
58 %wide.masked.load2.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
59 %mul.1 = mul nsw <16 x i8> %wide.masked.load2.1, %wide.masked.load.1
60 %tmp6.1 = getelementptr inbounds i8, i8* %c, i32 %index.next
61 %tmp7.1 = bitcast i8* %tmp6.1 to <16 x i8>*
62 tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul.1, <16 x i8>* %tmp7.1, i32 4, <16 x i1> %tmp1.1)
63 %index.next.1 = add i32 %index.next, 16
; The second decrement goes through llvm.loop.decrement.reg; its result feeds
; both the %niter phi and the back-edge compare.
64 %niter.nsub.1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %niter.nsub, i32 1)
65 %niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0
66 br i1 %niter.ncmp.1, label %vector.body, label %for.cond.cleanup.loopexit.unr-lcssa.loopexit
; Loop exit: forwards the final index and the constant -2 to the merge block.
68 for.cond.cleanup.loopexit.unr-lcssa.loopexit: ; preds = %vector.body
69 %index.unr.ph = phi i32 [ %index.next.1, %vector.body ]
70 %tmp14.unr.ph = phi i32 [ -2, %vector.body ]
71 br label %for.cond.cleanup.loopexit.unr-lcssa
; Merge of the "loop skipped" and "loop finished" paths; runs the ".epil" copy
; only when %xtraiter is non-zero.
73 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.cond.cleanup.loopexit.unr-lcssa.loopexit, %vector.ph
74 %index.unr = phi i32 [ 0, %vector.ph ], [ %index.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
75 %tmp14.unr = phi i32 [ %tmp13, %vector.ph ], [ %tmp14.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
76 %lcmp.mod = icmp ne i32 %xtraiter, 0
77 br i1 %lcmp.mod, label %vector.body.epil.preheader, label %for.cond.cleanup.loopexit
79 vector.body.epil.preheader: ; preds = %for.cond.cleanup.loopexit.unr-lcssa
80 br label %vector.body.epil
; Remainder copy: one straight-line pass of the same masked multiply at
; %index.epil. It is not a loop; the block ends in an unconditional branch.
82 vector.body.epil: ; preds = %vector.body.epil.preheader
83 %index.epil = phi i32 [ %index.unr, %vector.body.epil.preheader ]
84 %tmp14.epil = phi i32 [ %tmp14.unr, %vector.body.epil.preheader ]
85 %broadcast.splatinsert.epil = insertelement <16 x i32> undef, i32 %index.epil, i32 0
86 %broadcast.splat.epil = shufflevector <16 x i32> %broadcast.splatinsert.epil, <16 x i32> undef, <16 x i32> zeroinitializer
87 %induction.epil = add <16 x i32> %broadcast.splat.epil, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
88 %tmp.epil = getelementptr inbounds i8, i8* %a, i32 %index.epil
89 %tmp1.epil = icmp ule <16 x i32> %induction.epil, %broadcast.splat11
90 %tmp2.epil = bitcast i8* %tmp.epil to <16 x i8>*
91 %wide.masked.load.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
92 %tmp3.epil = getelementptr inbounds i8, i8* %b, i32 %index.epil
93 %tmp4.epil = bitcast i8* %tmp3.epil to <16 x i8>*
94 %wide.masked.load2.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
95 %mul.epil = mul nsw <16 x i8> %wide.masked.load2.epil, %wide.masked.load.epil
96 %tmp6.epil = getelementptr inbounds i8, i8* %c, i32 %index.epil
97 %tmp7.epil = bitcast i8* %tmp6.epil to <16 x i8>*
98 tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul.epil, <16 x i8>* %tmp7.epil, i32 4, <16 x i1> %tmp1.epil)
; %index.next.epil and %tmp16.epil have no users in the visible lines.
99 %index.next.epil = add i32 %index.epil, 16
100 %tmp15.epil = add nuw nsw i32 %tmp14.epil, -1
101 %tmp16.epil = icmp ne i32 %tmp15.epil, 0
102 br label %for.cond.cleanup.loopexit.epilog-lcssa
104 for.cond.cleanup.loopexit.epilog-lcssa: ; preds = %vector.body.epil
105 br label %for.cond.cleanup.loopexit
107 for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.cond.cleanup.loopexit.epilog-lcssa
108 br label %for.cond.cleanup
; Common exit block.
; NOTE(review): the terminator and closing brace of the function are not visible in this listing.
110 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
; Masked vector load/store intrinsics called by vector.body and vector.body.epil.
; NOTE(review): attribute groups #1, #2 and #3 are referenced here but their
; definitions are not visible in this listing.
114 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1
115 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) #2
; Loop-count intrinsics: start.loop.iterations is called in vector.ph.new,
; loop.decrement.reg at the bottom of vector.body.
116 declare i32 @llvm.start.loop.iterations.i32(i32) #3
117 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3