1 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp < %s -codegenprepare -S | FileCheck -check-prefix=CHECK %s
; Test case: a splat of %x (insertelement into lane 0 + shufflevector with an
; all-zero mask) is built in vector.ph and, in the visible body, is used only
; as the second operand of the `mul` inside vector.body; the product then
; feeds an `add`. The FileCheck directives first forbid the splat pattern and
; then require an insertelement/shufflevector pair of %x in the opt output.
; NOTE(review): presumably this verifies that the splat is sunk from vector.ph
; into the loop body next to its user -- confirm against the complete set of
; directives, not all of which are visible here.
; NOTE(review): the `preds` annotations below mention %for.body.preheader,
; %for.body and %middle.block, none of which is defined in the visible part of
; this function -- presumably stale leftovers; confirm and refresh.
3 define void @sink_add_mul(i32* %s1, i32 %x, i32* %d, i32 %n) {
4 ; CHECK-LABEL: @sink_add_mul(
6 ; CHECK-NOT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
7 ; CHECK-NOT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
9 ; CHECK: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
10 ; CHECK: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
; Bypass the loop when %n is not strictly positive.
13 %cmp6 = icmp sgt i32 %n, 0
14 br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
16 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its two low bits cleared (a multiple of 4).
17 %n.vec = and i32 %n, -4
; Broadcast %x to all four lanes.
18 %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
19 %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
22 vector.body: ; preds = %vector.body, %vector.ph
; Element index, advanced by 4 on every iteration.
23 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
24 %0 = getelementptr inbounds i32, i32* %s1, i32 %index
25 %1 = bitcast i32* %0 to <4 x i32>*
26 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; The splat is the second (right-hand) operand of the multiply.
27 %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat9
28 %3 = getelementptr inbounds i32, i32* %d, i32 %index
29 %4 = bitcast i32* %3 to <4 x i32>*
30 %wide.load10 = load <4 x i32>, <4 x i32>* %4, align 4
; Accumulate the product into the four elements of %d and store them back.
31 %5 = add nsw <4 x i32> %wide.load10, %2
32 %6 = bitcast i32* %3 to <4 x i32>*
33 store <4 x i32> %5, <4 x i32>* %6, align 4
34 %index.next = add i32 %index, 4
; Leave the loop once the index reaches %n.vec.
35 %7 = icmp eq i32 %index.next, %n.vec
36 br i1 %7, label %for.cond.cleanup, label %vector.body
38 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Test case: one splat of %x built in vector.ph is the second operand of two
; separate `mul` instructions in vector.body (one on data loaded from %s1, one
; on data loaded from %s2). The FileCheck directives forbid the splat pattern
; first and then require two insertelement/shufflevector pairs, each followed
; by the `mul` that consumes it (%wide.load and %wide.load18 respectively).
; NOTE(review): presumably this verifies that the splat is duplicated and sunk
; in front of each user -- confirm against the complete set of directives.
; NOTE(review): the `preds` annotations below mention %for.body.preheader,
; %for.body and %middle.block, none of which is defined in the visible part of
; this function -- presumably stale leftovers; confirm and refresh.
42 define void @sink_add_mul_multiple(i32* %s1, i32* %s2, i32 %x, i32* %d, i32* %d2, i32 %n) {
43 ; CHECK-LABEL: @sink_add_mul_multiple(
45 ; CHECK-NOT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
46 ; CHECK-NOT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
48 ; CHECK: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 %x, i32 0
49 ; CHECK: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
50 ; CHECK: mul nsw <4 x i32> %wide.load, [[TMP3]]
51 ; CHECK: [[TMP2b:%.*]] = insertelement <4 x i32> undef, i32 %x, i32 0
52 ; CHECK: [[TMP3b:%.*]] = shufflevector <4 x i32> [[TMP2b]], <4 x i32> undef, <4 x i32> zeroinitializer
53 ; CHECK: mul nsw <4 x i32> %wide.load18, [[TMP3b]]
; Bypass the loop when %n is not strictly positive.
56 %cmp13 = icmp sgt i32 %n, 0
57 br i1 %cmp13, label %vector.ph, label %for.cond.cleanup
59 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its two low bits cleared (a multiple of 4).
60 %n.vec = and i32 %n, -4
; Broadcast %x to all four lanes; this single splat has two users below.
61 %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %x, i32 0
62 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
65 vector.body: ; preds = %vector.body, %vector.ph
; Element index, advanced by 4 on every iteration.
66 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; First user: %d[index..index+3] += %s1[index..index+3] * splat(%x).
67 %0 = getelementptr inbounds i32, i32* %s1, i32 %index
68 %1 = bitcast i32* %0 to <4 x i32>*
69 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
70 %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat16
71 %3 = getelementptr inbounds i32, i32* %d, i32 %index
72 %4 = bitcast i32* %3 to <4 x i32>*
73 %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
74 %5 = add nsw <4 x i32> %wide.load17, %2
75 %6 = bitcast i32* %3 to <4 x i32>*
76 store <4 x i32> %5, <4 x i32>* %6, align 4
; Second user: %d2[index..index+3] += %s2[index..index+3] * splat(%x).
77 %7 = getelementptr inbounds i32, i32* %s2, i32 %index
78 %8 = bitcast i32* %7 to <4 x i32>*
79 %wide.load18 = load <4 x i32>, <4 x i32>* %8, align 4
80 %9 = mul nsw <4 x i32> %wide.load18, %broadcast.splat16
81 %10 = getelementptr inbounds i32, i32* %d2, i32 %index
82 %11 = bitcast i32* %10 to <4 x i32>*
83 %wide.load19 = load <4 x i32>, <4 x i32>* %11, align 4
84 %12 = add nsw <4 x i32> %wide.load19, %9
85 %13 = bitcast i32* %10 to <4 x i32>*
86 store <4 x i32> %12, <4 x i32>* %13, align 4
87 %index.next = add i32 %index, 4
; Leave the loop once the index reaches %n.vec.
88 %14 = icmp eq i32 %index.next, %n.vec
89 br i1 %14, label %for.cond.cleanup, label %vector.body
91 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Test case: the splat of %x built in vector.ph has two users in vector.body:
; a `mul` that takes it as the second operand and a `sub` that takes it as the
; FIRST operand. The FileCheck directives require the insertelement and
; shufflevector to still appear right after the `and` that computes n.vec and
; right before the branch into the loop, i.e. the splat stays where it was.
; NOTE(review): presumably the `sub` with the splat on the left-hand side is
; what prevents sinking -- confirm against the target hook.
; NOTE(review): the `preds` annotations below mention %for.body.preheader,
; %for.body and %middle.block, none of which is defined in the visible part of
; this function -- presumably stale leftovers; confirm and refresh.
96 define void @sink_add_sub_unsinkable(i32* %s1, i32* %s2, i32 %x, i32* %d, i32* %d2, i32 %n) {
97 ; CHECK-LABEL: @sink_add_sub_unsinkable(
99 ; CHECK-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[N:%.*]], 0
100 ; CHECK-NEXT: br i1 [[CMP13]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
102 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
103 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
104 ; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> undef, <4 x i32> zeroinitializer
105 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; Bypass the loop when %n is not strictly positive.
108 %cmp13 = icmp sgt i32 %n, 0
109 br i1 %cmp13, label %vector.ph, label %for.cond.cleanup
111 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its two low bits cleared (a multiple of 4).
112 %n.vec = and i32 %n, -4
; Broadcast %x to all four lanes; this single splat has two users below.
113 %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %x, i32 0
114 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
115 br label %vector.body
117 vector.body: ; preds = %vector.body, %vector.ph
; Element index, advanced by 4 on every iteration.
118 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; First user: the splat is the second operand of a multiply.
119 %0 = getelementptr inbounds i32, i32* %s1, i32 %index
120 %1 = bitcast i32* %0 to <4 x i32>*
121 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
122 %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat16
123 %3 = getelementptr inbounds i32, i32* %d, i32 %index
124 %4 = bitcast i32* %3 to <4 x i32>*
125 %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
126 %5 = add nsw <4 x i32> %wide.load17, %2
127 %6 = bitcast i32* %3 to <4 x i32>*
128 store <4 x i32> %5, <4 x i32>* %6, align 4
; Second user: the splat is the first operand of a subtract (splat - load).
129 %7 = getelementptr inbounds i32, i32* %s2, i32 %index
130 %8 = bitcast i32* %7 to <4 x i32>*
131 %wide.load18 = load <4 x i32>, <4 x i32>* %8, align 4
132 %9 = sub nsw <4 x i32> %broadcast.splat16, %wide.load18
133 %10 = getelementptr inbounds i32, i32* %d2, i32 %index
134 %11 = bitcast i32* %10 to <4 x i32>*
135 %wide.load19 = load <4 x i32>, <4 x i32>* %11, align 4
136 %12 = add nsw <4 x i32> %wide.load19, %9
137 %13 = bitcast i32* %10 to <4 x i32>*
138 store <4 x i32> %12, <4 x i32>* %13, align 4
139 %index.next = add i32 %index, 4
; Leave the loop once the index reaches %n.vec.
140 %14 = icmp eq i32 %index.next, %n.vec
141 br i1 %14, label %for.cond.cleanup, label %vector.body
143 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Test case: the splat of %x built in vector.ph is the SECOND operand of a
; `sub` in vector.body (load - splat). The FileCheck directives forbid the
; splat pattern before the vector.body label and require an
; insertelement/shufflevector pair of %x after that label.
; NOTE(review): presumably this verifies that the splat is sunk into the loop
; when it is the right-hand operand of the subtract -- confirm.
; NOTE(review): the `preds` annotations below mention %for.body.preheader,
; %for.body and %middle.block, none of which is defined in the visible part of
; this function -- presumably stale leftovers; confirm and refresh.
147 define void @sink_sub(i32* %s1, i32 %x, i32* %d, i32 %n) {
148 ; CHECK-LABEL: @sink_sub(
150 ; CHECK-NOT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
151 ; CHECK-NOT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
152 ; CHECK: vector.body:
153 ; CHECK: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
154 ; CHECK: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
; Bypass the loop when %n is not strictly positive.
157 %cmp6 = icmp sgt i32 %n, 0
158 br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
160 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its two low bits cleared (a multiple of 4).
161 %n.vec = and i32 %n, -4
; Broadcast %x to all four lanes.
162 %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
163 %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
164 br label %vector.body
166 vector.body: ; preds = %vector.body, %vector.ph
; Element index, advanced by 4 on every iteration.
167 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
168 %0 = getelementptr inbounds i32, i32* %s1, i32 %index
169 %1 = bitcast i32* %0 to <4 x i32>*
170 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Operand order matters here: the splat is subtracted FROM the loaded vector.
171 %2 = sub nsw <4 x i32> %wide.load, %broadcast.splat9
172 %3 = getelementptr inbounds i32, i32* %d, i32 %index
173 %4 = bitcast i32* %3 to <4 x i32>*
174 store <4 x i32> %2, <4 x i32>* %4, align 4
175 %index.next = add i32 %index, 4
; Leave the loop once the index reaches %n.vec.
176 %5 = icmp eq i32 %index.next, %n.vec
177 br i1 %5, label %for.cond.cleanup, label %vector.body
179 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Test case: the splat of %x built in vector.ph is the FIRST operand of a
; `sub` in vector.body (splat - load). The FileCheck directives require the
; insertelement and shufflevector to still appear right after the `and` that
; computes n.vec and right before the branch into the loop, and forbid an
; insertelement/shufflevector pair of %x after the vector.body label.
; NOTE(review): presumably the splat being the left-hand operand of the
; subtract is what prevents sinking -- confirm against the target hook.
; NOTE(review): the `preds` annotations below mention %for.body.preheader,
; %for.body and %middle.block, none of which is defined in the visible part of
; this function -- presumably stale leftovers; confirm and refresh.
183 define void @sink_sub_unsinkable(i32* %s1, i32 %x, i32* %d, i32 %n) {
185 ; CHECK-LABEL: @sink_sub_unsinkable(
; N is captured on the next directive itself, so this function's directives do
; not depend on a pattern variable bound while matching an earlier function.
187 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N:%.*]], -4
188 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
189 ; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> undef, <4 x i32> zeroinitializer
190 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
191 ; CHECK: vector.body:
192 ; CHECK-NOT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
193 ; CHECK-NOT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
; Bypass the loop when %n is not strictly positive.
195 %cmp6 = icmp sgt i32 %n, 0
196 br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
198 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its two low bits cleared (a multiple of 4).
199 %n.vec = and i32 %n, -4
; Broadcast %x to all four lanes.
200 %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
201 %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
202 br label %vector.body
204 vector.body: ; preds = %vector.body, %vector.ph
; Element index, advanced by 4 on every iteration.
205 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
206 %0 = getelementptr inbounds i32, i32* %s1, i32 %index
207 %1 = bitcast i32* %0 to <4 x i32>*
208 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Operand order matters here: the loaded vector is subtracted FROM the splat.
209 %2 = sub nsw <4 x i32> %broadcast.splat9, %wide.load
210 %3 = getelementptr inbounds i32, i32* %d, i32 %index
211 %4 = bitcast i32* %3 to <4 x i32>*
212 store <4 x i32> %2, <4 x i32>* %4, align 4
213 %index.next = add i32 %index, 4
; Leave the loop once the index reaches %n.vec.
214 %5 = icmp eq i32 %index.next, %n.vec
215 br i1 %5, label %for.cond.cleanup, label %vector.body
217 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry