; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s

; TODO: The unrolled pattern is preventing the transform
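; The vector body has been manually unrolled by two, so each iteration computes
; two independent lane predicates (%tmp1 and %tmp1.1) and counts down through
; @llvm.loop.decrement.reg. The tail-predication pass does not yet recognise
; this shape, so no VCTP intrinsic should be generated here.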
; CHECK-LABEL: mul_v16i8_unroll
; CHECK-NOT: call <16 x i1> @llvm.arm.mve.vctp8
define void @mul_v16i8_unroll(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
  %xtraiter = and i32 %tmp13, 1
  %0 = icmp ult i32 %tmp12, 1
  br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label %vector.ph.new

vector.ph.new:                                    ; preds = %vector.ph
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  %unroll_iter = sub i32 %tmp13, %xtraiter
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph.new
  %index = phi i32 [ 0, %vector.ph.new ], [ %index.next.1, %vector.body ]
  %niter = phi i32 [ %unroll_iter, %vector.ph.new ], [ %niter.nsub.1, %vector.body ]
  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp = getelementptr inbounds i8, ptr %a, i32 %index
  %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, ptr %b, i32 %index
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, ptr %c, i32 %index
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul, ptr %tmp6, i32 4, <16 x i1> %tmp1)
  %index.next = add nuw nsw i32 %index, 16
  %niter.nsub = sub i32 %niter, 1
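  ; Second copy of the unrolled body, guarded by its own predicate %tmp1.1.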
  %broadcast.splatinsert.1 = insertelement <16 x i32> undef, i32 %index.next, i32 0
  %broadcast.splat.1 = shufflevector <16 x i32> %broadcast.splatinsert.1, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction.1 = add <16 x i32> %broadcast.splat.1, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp.1 = getelementptr inbounds i8, ptr %a, i32 %index.next
  %tmp1.1 = icmp ule <16 x i32> %induction.1, %broadcast.splat11
  %wide.masked.load.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
  %tmp3.1 = getelementptr inbounds i8, ptr %b, i32 %index.next
  %wide.masked.load2.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
  %mul.1 = mul nsw <16 x i8> %wide.masked.load2.1, %wide.masked.load.1
  %tmp6.1 = getelementptr inbounds i8, ptr %c, i32 %index.next
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul.1, ptr %tmp6.1, i32 4, <16 x i1> %tmp1.1)
  %index.next.1 = add i32 %index.next, 16
  %niter.nsub.1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %niter.nsub, i32 1)
  %niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0
  br i1 %niter.ncmp.1, label %vector.body, label %for.cond.cleanup.loopexit.unr-lcssa.loopexit

for.cond.cleanup.loopexit.unr-lcssa.loopexit:     ; preds = %vector.body
  %index.unr.ph = phi i32 [ %index.next.1, %vector.body ]
  %tmp14.unr.ph = phi i32 [ -2, %vector.body ]
  br label %for.cond.cleanup.loopexit.unr-lcssa

for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.cond.cleanup.loopexit.unr-lcssa.loopexit, %vector.ph
  %index.unr = phi i32 [ 0, %vector.ph ], [ %index.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
  %tmp14.unr = phi i32 [ %tmp13, %vector.ph ], [ %tmp14.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
  %lcmp.mod = icmp ne i32 %xtraiter, 0
  br i1 %lcmp.mod, label %vector.body.epil.preheader, label %for.cond.cleanup.loopexit

vector.body.epil.preheader:                       ; preds = %for.cond.cleanup.loopexit.unr-lcssa
  br label %vector.body.epil
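
; Epilogue: executes the single leftover vector iteration when %xtraiter is non-zero.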
vector.body.epil:                                 ; preds = %vector.body.epil.preheader
  %index.epil = phi i32 [ %index.unr, %vector.body.epil.preheader ]
  %tmp14.epil = phi i32 [ %tmp14.unr, %vector.body.epil.preheader ]
  %broadcast.splatinsert.epil = insertelement <16 x i32> undef, i32 %index.epil, i32 0
  %broadcast.splat.epil = shufflevector <16 x i32> %broadcast.splatinsert.epil, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction.epil = add <16 x i32> %broadcast.splat.epil, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp.epil = getelementptr inbounds i8, ptr %a, i32 %index.epil
  %tmp1.epil = icmp ule <16 x i32> %induction.epil, %broadcast.splat11
  %wide.masked.load.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
  %tmp3.epil = getelementptr inbounds i8, ptr %b, i32 %index.epil
  %wide.masked.load2.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
  %mul.epil = mul nsw <16 x i8> %wide.masked.load2.epil, %wide.masked.load.epil
  %tmp6.epil = getelementptr inbounds i8, ptr %c, i32 %index.epil
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul.epil, ptr %tmp6.epil, i32 4, <16 x i1> %tmp1.epil)
  %index.next.epil = add i32 %index.epil, 16
  %tmp15.epil = add nuw nsw i32 %tmp14.epil, -1
  %tmp16.epil = icmp ne i32 %tmp15.epil, 0
  br label %for.cond.cleanup.loopexit.epilog-lcssa

for.cond.cleanup.loopexit.epilog-lcssa:           ; preds = %vector.body.epil
  br label %for.cond.cleanup.loopexit

for.cond.cleanup.loopexit:                        ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.cond.cleanup.loopexit.epilog-lcssa
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>) #1
declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>) #2
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3