1 ; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
2 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
3 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
4 ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
5 ; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
6 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2
8 ; Tests to check that post increment addressing modes are used instead of
9 ; updating base pointers with add instructions.
11 ; TODO: I think we should be able to use post inc addressing with VLDM
13 ; CHECK-LABEL: test_fma
16 ; CHECK-DEFAULT: vldr s{{.*}}, #8]
17 ; CHECK-DEFAULT: vldr s{{.*}}, #8]
18 ; CHECK-DEFAULT: vldr s{{.*}}, #12]
19 ; CHECK-DEFAULT: vldr s{{.*}}, #12]
21 ; CHECK-COMPLEX: vldr s{{.*}}, #8]
22 ; CHECK-COMPLEX: vldr s{{.*}}, #8]
23 ; CHECK-COMPLEX: vldr s{{.*}}, #12]
24 ; CHECK-COMPLEX: vldr s{{.*}}, #12]
; Dot-product style reduction, manually unrolled by 2: each iteration loads
; a[idx] and b[idx] for idx and idx|1, multiplies the pairs and accumulates
; into the running sum. Written so LSR can fold the index updates into
; post-increment addressing. NOTE(review): the entry/loop labels and the
; exit block fall outside this excerpt.
26 define float @test_fma(float* %a, float* %b, i32 %N) {
31 %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
32 %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
33 %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ]
34 %gep.a.1 = getelementptr inbounds float, float* %a, i32 %idx.1
35 %a.1 = load float, float* %gep.a.1
36 %gep.b.1 = getelementptr inbounds float, float* %b, i32 %idx.1
37 %b.1 = load float, float* %gep.b.1
38 %fmul.1 = fmul float %a.1, %b.1
39 %fma.1 = fadd float %fmul.1, %res
; Second unrolled element: `or` is equivalent to +1 here because %idx.1
; starts at 0 and steps by 2, so it is always even.
40 %idx.2 = or i32 %idx.1, 1
41 %gep.a.2 = getelementptr inbounds float, float* %a, i32 %idx.2
42 %a.2 = load float, float* %gep.a.2
43 %gep.b.2 = getelementptr inbounds float, float* %b, i32 %idx.2
44 %b.2 = load float, float* %gep.b.2
45 %fmul.2 = fmul float %a.2, %b.2
46 %fma.2 = fadd float %fmul.2, %fma.1
47 %i.next = add nsw nuw i32 %i, -2
48 %idx.next = add nsw nuw i32 %idx.1, 2
; Loop continues while %i.next u< %N.
49 %cmp = icmp ult i32 %i.next, %N
50 br i1 %cmp, label %loop, label %exit
56 ; CHECK-LABEL: convolve_16bit
57 ; TODO: Both arrays should use indexing
58 ; CHECK-DEFAULT: ldr{{.*}}, #8]!
59 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
61 ; CHECK-COMPLEX: ldr{{.*}}, #8]!
62 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
64 ; DISABLED-NOT: ldr{{.*}}]!
65 ; DISABLED-NOT: str{{.*}}]!
; 2-D convolution over i16 data: for each output row (res_y) and column
; (res_x), accumulates filter[fy][fx] * input_image[res_y+fy][res_x+fx]
; into convolved[res_y][res_x]. The innermost fx loop is manually unrolled
; by 4 (unroll_iter = filter_dim & ~3). NOTE(review): the trailing ret/`}`
; fall outside this excerpt.
67 define void @convolve_16bit(i16** nocapture readonly %input_image, i16** nocapture readonly %filter,
68 i32 %filter_dim, i32 %out_width, i32 %out_height,
69 i32** nocapture readonly %convolved) {
71 %cmp92 = icmp eq i32 %out_height, 0
72 br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
; Split filter_dim into a 4x-unrolled count plus a remainder (remainder
; handling is not visible in this excerpt).
74 for.cond1.preheader.lr.ph: ; preds = %entry
75 %xtraiter = and i32 %filter_dim, 3
76 %unroll_iter = sub i32 %filter_dim, %xtraiter
77 br label %for.cond1.preheader
; Outer loop over output rows; loads the row pointer convolved[res_y].
79 for.cond1.preheader: ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
80 %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ]
81 %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093
82 %tmp3 = load i32*, i32** %arrayidx22, align 4
83 br label %for.cond9.preheader.us.us.preheader
; Middle loop over output columns (res_x).
85 for.cond9.preheader.us.us.preheader: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.lr.ph
86 %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ]
87 br label %for.cond9.preheader.us.us
; Loop over filter rows (filter_y); loads the filter row and the matching
; (shifted) input-image row pointers once per filter row.
89 for.cond9.preheader.us.us: ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us, %for.cond9.preheader.us.us.preheader
90 %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
91 %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
92 %add.us.us = add i32 %filter_y.056.us.us, %res_y.093
93 %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us
94 %tmp5 = load i16*, i16** %arrayidx.us.us, align 4
95 %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us
96 %tmp6 = load i16*, i16** %arrayidx15.us.us, align 4
97 br label %for.body12.us.us
; Innermost filter_x loop, unrolled 4x: each step widens an i16 filter tap
; and an i16 image sample to i32, multiplies, and accumulates.
99 for.body12.us.us: ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
100 %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
101 %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ]
102 %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ]
103 %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us
104 %arrayidx14.us.us = getelementptr inbounds i16, i16* %tmp5, i32 %filter_x.053.us.us
105 %tmp9 = load i16, i16* %arrayidx14.us.us, align 2
106 %conv.us.us = sext i16 %tmp9 to i32
107 %arrayidx16.us.us = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us
108 %tmp10 = load i16, i16* %arrayidx16.us.us, align 2
109 %conv17.us.us = sext i16 %tmp10 to i32
110 %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us
111 %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us
; `or` substitutes for +1/+2/+3 because filter_x steps by 4 from 0.
112 %inc.us.us = or i32 %filter_x.053.us.us, 1
113 %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us
114 %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us
115 %tmp11 = load i16, i16* %arrayidx14.us.us.1, align 2
116 %conv.us.us.1 = sext i16 %tmp11 to i32
117 %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.1
118 %tmp12 = load i16, i16* %arrayidx16.us.us.1, align 2
119 %conv17.us.us.1 = sext i16 %tmp12 to i32
120 %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1
121 %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us
122 %inc.us.us.1 = or i32 %filter_x.053.us.us, 2
123 %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us
124 %arrayidx14.us.us.2 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.1
125 %tmp13 = load i16, i16* %arrayidx14.us.us.2, align 2
126 %conv.us.us.2 = sext i16 %tmp13 to i32
127 %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.2
128 %tmp14 = load i16, i16* %arrayidx16.us.us.2, align 2
129 %conv17.us.us.2 = sext i16 %tmp14 to i32
130 %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2
131 %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1
132 %inc.us.us.2 = or i32 %filter_x.053.us.us, 3
133 %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us
134 %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.2
135 %tmp15 = load i16, i16* %arrayidx14.us.us.3, align 2
136 %conv.us.us.3 = sext i16 %tmp15 to i32
137 %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.3
138 %tmp16 = load i16, i16* %arrayidx16.us.us.3, align 2
139 %conv17.us.us.3 = sext i16 %tmp16 to i32
140 %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3
141 %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2
142 %inc.us.us.3 = add i32 %filter_x.053.us.us, 4
143 %niter.nsub.3 = add i32 %niter, -4
144 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
145 br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us
; Advance to the next filter row until filter_dim rows are done.
147 for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
148 %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1
149 %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim
150 br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us
; Store the accumulated element and advance to the next output column.
152 for.cond5.for.cond.cleanup7_crit_edge.us: ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us
153 %arrayidx23.us = getelementptr inbounds i32, i32* %tmp3, i32 %res_x.060.us
154 store i32 %add18.us.us.3, i32* %arrayidx23.us, align 4
155 %add25.us = add nuw i32 %res_x.060.us, 1
156 %exitcond99 = icmp eq i32 %add25.us, %out_width
157 br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader
; Advance to the next output row.
159 for.cond.cleanup3: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.preheader, %for.cond1.preheader
160 %add28 = add nuw i32 %res_y.093, 1
161 %exitcond100 = icmp eq i32 %add28, %out_height
162 br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader
164 for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry
168 ; CHECK-LABEL: mul_8x8
171 ; CHECK-DEFAULT: str{{.*}}, #16]!
172 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
173 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
175 ; CHECK-COMPLEX: str{{.*}}, #16]!
176 ; CHECK-COMPLEX: ldrb{{.*}}, #4]!
177 ; CHECK-COMPLEX: ldrb{{.*}}, #4]!
179 ; DISABLED-NOT: ldr{{.*}}]!
180 ; DISABLED-NOT: str{{.*}}]!
182 ; CHECK-T2: @ %for.body.epil
183 ; CHECK-T2: ldrb{{.*}}, #1]!
184 ; CHECK-T2: ldrb{{.*}}, #1]!
185 ; CHECK-T2: str{{.*}}, #4]!
; Elementwise widening multiply: C[i] = zext(A[i]) * zext(B[i]) for i8
; operands, i = 0..N-1. Main loop is manually unrolled by 4 with a scalar
; epilogue (for.body.epil) for N % 4 leftover iterations. NOTE(review):
; the trailing `}` falls outside this excerpt.
187 define void @mul_8x8(i8* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
189 %cmp9 = icmp eq i32 %N, 0
190 br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
; If N < 4 there is no unrolled work; go straight to the epilogue check.
192 for.body.preheader: ; preds = %entry
193 %tmp = add i32 %N, -1
194 %xtraiter = and i32 %N, 3
195 %tmp1 = icmp ult i32 %tmp, 3
196 br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
198 for.body.preheader.new: ; preds = %for.body.preheader
199 %unroll_iter = sub i32 %N, %xtraiter
; Decide whether any remainder iterations are needed.
202 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
203 %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
204 %lcmp.mod = icmp eq i32 %xtraiter, 0
205 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Remainder loop: one element per iteration, N % 4 times.
207 for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
208 %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
209 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
210 %arrayidx.epil = getelementptr inbounds i8, i8* %A, i32 %i.010.epil
211 %tmp2 = load i8, i8* %arrayidx.epil, align 1
212 %conv.epil = zext i8 %tmp2 to i32
213 %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
214 %tmp3 = load i8, i8* %arrayidx1.epil, align 1
215 %conv2.epil = zext i8 %tmp3 to i32
216 %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
217 %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
218 store i32 %mul.epil, i32* %arrayidx3.epil, align 4
219 %inc.epil = add nuw i32 %i.010.epil, 1
220 %epil.iter.sub = add i32 %epil.iter, -1
221 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
222 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
224 for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
; Main loop: four elements per iteration. The +1/+2/+3 offsets use `or`
; because %i.010 steps by 4 from 0 (low two bits are always clear).
227 for.body: ; preds = %for.body, %for.body.preheader.new
228 %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
229 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
230 %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.010
231 %tmp4 = load i8, i8* %arrayidx, align 1
232 %conv = zext i8 %tmp4 to i32
233 %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
234 %tmp5 = load i8, i8* %arrayidx1, align 1
235 %conv2 = zext i8 %tmp5 to i32
236 %mul = mul nuw nsw i32 %conv2, %conv
237 %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
238 store i32 %mul, i32* %arrayidx3, align 4
239 %inc = or i32 %i.010, 1
240 %arrayidx.1 = getelementptr inbounds i8, i8* %A, i32 %inc
241 %tmp6 = load i8, i8* %arrayidx.1, align 1
242 %conv.1 = zext i8 %tmp6 to i32
243 %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
244 %tmp7 = load i8, i8* %arrayidx1.1, align 1
245 %conv2.1 = zext i8 %tmp7 to i32
246 %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
247 %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
248 store i32 %mul.1, i32* %arrayidx3.1, align 4
249 %inc.1 = or i32 %i.010, 2
250 %arrayidx.2 = getelementptr inbounds i8, i8* %A, i32 %inc.1
251 %tmp8 = load i8, i8* %arrayidx.2, align 1
252 %conv.2 = zext i8 %tmp8 to i32
253 %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
254 %tmp9 = load i8, i8* %arrayidx1.2, align 1
255 %conv2.2 = zext i8 %tmp9 to i32
256 %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
257 %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
258 store i32 %mul.2, i32* %arrayidx3.2, align 4
259 %inc.2 = or i32 %i.010, 3
260 %arrayidx.3 = getelementptr inbounds i8, i8* %A, i32 %inc.2
261 %tmp10 = load i8, i8* %arrayidx.3, align 1
262 %conv.3 = zext i8 %tmp10 to i32
263 %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
264 %tmp11 = load i8, i8* %arrayidx1.3, align 1
265 %conv2.3 = zext i8 %tmp11 to i32
266 %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
267 %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
268 store i32 %mul.3, i32* %arrayidx3.3, align 4
269 %inc.3 = add i32 %i.010, 4
270 %niter.nsub.3 = add i32 %niter, -4
271 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
272 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
275 ; CHECK-LABEL: mul_16x8
278 ; CHECK-DEFAULT: str{{.*}}, #16]!
279 ; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
281 ; CHECK-COMPLEX: ldrsh{{.*}}, #8]!
282 ; CHECK-COMPLEX: str{{.*}}, #16]!
283 ; CHECK-COMPLEX: ldrb{{.*}}, #4]!
285 ; DISABLED-NOT: ldr{{.*}}]!
286 ; DISABLED-NOT: str{{.*}}]!
288 ; CHECK-T2: @ %for.body.epil
289 ; CHECK-T2: ldrsh{{.*}}, #2]!
290 ; CHECK-T2: ldrb{{.*}}, #1]!
291 ; CHECK-T2: str{{.*}}, #4]!
; Elementwise widening multiply with mixed operand widths:
; C[i] = sext(A[i] as i16) * zext(B[i] as i8), i = 0..N-1. Same structure
; as @mul_8x8: 4x-unrolled main loop plus a scalar epilogue for N % 4.
; NOTE(review): the trailing `}` falls outside this excerpt.
293 define void @mul_16x8(i16* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
295 %cmp9 = icmp eq i32 %N, 0
296 br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
; If N < 4 there is no unrolled work; go straight to the epilogue check.
298 for.body.preheader: ; preds = %entry
299 %tmp = add i32 %N, -1
300 %xtraiter = and i32 %N, 3
301 %tmp1 = icmp ult i32 %tmp, 3
302 br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
304 for.body.preheader.new: ; preds = %for.body.preheader
305 %unroll_iter = sub i32 %N, %xtraiter
; Decide whether any remainder iterations are needed.
308 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
309 %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
310 %lcmp.mod = icmp eq i32 %xtraiter, 0
311 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Remainder loop: one element per iteration, N % 4 times.
313 for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
314 %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
315 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
316 %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
317 %tmp2 = load i16, i16* %arrayidx.epil, align 2
318 %conv.epil = sext i16 %tmp2 to i32
319 %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
320 %tmp3 = load i8, i8* %arrayidx1.epil, align 1
321 %conv2.epil = zext i8 %tmp3 to i32
322 %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
323 %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
324 store i32 %mul.epil, i32* %arrayidx3.epil, align 4
325 %inc.epil = add nuw i32 %i.010.epil, 1
326 %epil.iter.sub = add i32 %epil.iter, -1
327 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
328 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
330 for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
; Main loop: four elements per iteration; `or` stands in for +1/+2/+3
; because %i.010 steps by 4 from 0.
333 for.body: ; preds = %for.body, %for.body.preheader.new
334 %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
335 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
336 %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
337 %tmp4 = load i16, i16* %arrayidx, align 2
338 %conv = sext i16 %tmp4 to i32
339 %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
340 %tmp5 = load i8, i8* %arrayidx1, align 1
341 %conv2 = zext i8 %tmp5 to i32
342 %mul = mul nsw i32 %conv2, %conv
343 %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
344 store i32 %mul, i32* %arrayidx3, align 4
345 %inc = or i32 %i.010, 1
346 %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
347 %tmp6 = load i16, i16* %arrayidx.1, align 2
348 %conv.1 = sext i16 %tmp6 to i32
349 %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
350 %tmp7 = load i8, i8* %arrayidx1.1, align 1
351 %conv2.1 = zext i8 %tmp7 to i32
352 %mul.1 = mul nsw i32 %conv2.1, %conv.1
353 %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
354 store i32 %mul.1, i32* %arrayidx3.1, align 4
355 %inc.1 = or i32 %i.010, 2
356 %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
357 %tmp8 = load i16, i16* %arrayidx.2, align 2
358 %conv.2 = sext i16 %tmp8 to i32
359 %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
360 %tmp9 = load i8, i8* %arrayidx1.2, align 1
361 %conv2.2 = zext i8 %tmp9 to i32
362 %mul.2 = mul nsw i32 %conv2.2, %conv.2
363 %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
364 store i32 %mul.2, i32* %arrayidx3.2, align 4
365 %inc.2 = or i32 %i.010, 3
366 %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
367 %tmp10 = load i16, i16* %arrayidx.3, align 2
368 %conv.3 = sext i16 %tmp10 to i32
369 %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
370 %tmp11 = load i8, i8* %arrayidx1.3, align 1
371 %conv2.3 = zext i8 %tmp11 to i32
372 %mul.3 = mul nsw i32 %conv2.3, %conv.3
373 %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
374 store i32 %mul.3, i32* %arrayidx3.3, align 4
375 %inc.3 = add i32 %i.010, 4
376 %niter.nsub.3 = add i32 %niter, -4
377 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
378 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
381 ; CHECK-LABEL: mul_16x16
384 ; TODO: pre-indexed loads
385 ; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
386 ; CHECK-DEFAULT: str{{.*}}, #16]!
387 ; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
389 ; CHECK-COMPLEX: ldrsh{{.*}}]!
390 ; CHECK-COMPLEX: ldrsh{{.*}}]!
391 ; CHECK-COMPLEX: str{{.*}}]!
393 ; DISABLED-NOT: ldr{{.*}}]!
394 ; DISABLED-NOT: str{{.*}}]!
396 ; CHECK-T2: @ %for.body.epil
397 ; CHECK-T2: ldrsh{{.*}}, #2]!
398 ; CHECK-T2: ldrsh{{.*}}, #2]!
399 ; CHECK-T2: str{{.*}}, #4]!
; Elementwise widening multiply: C[i] = sext(A[i]) * sext(B[i]) for i16
; operands, i = 0..N-1. Same structure as @mul_8x8: 4x-unrolled main loop
; plus a scalar epilogue for N % 4. NOTE(review): the trailing `}` falls
; outside this excerpt.
401 define void @mul_16x16(i16* nocapture readonly %A, i16* nocapture readonly %B, i32* nocapture %C, i32 %N) {
403 %cmp9 = icmp eq i32 %N, 0
404 br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
; If N < 4 there is no unrolled work; go straight to the epilogue check.
406 for.body.preheader: ; preds = %entry
407 %tmp = add i32 %N, -1
408 %xtraiter = and i32 %N, 3
409 %tmp1 = icmp ult i32 %tmp, 3
410 br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
412 for.body.preheader.new: ; preds = %for.body.preheader
413 %unroll_iter = sub i32 %N, %xtraiter
; Decide whether any remainder iterations are needed.
416 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
417 %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
418 %lcmp.mod = icmp eq i32 %xtraiter, 0
419 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Remainder loop: one element per iteration, N % 4 times.
421 for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
422 %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
423 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
424 %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
425 %tmp2 = load i16, i16* %arrayidx.epil, align 2
426 %conv.epil = sext i16 %tmp2 to i32
427 %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.010.epil
428 %tmp3 = load i16, i16* %arrayidx1.epil, align 2
429 %conv2.epil = sext i16 %tmp3 to i32
430 %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
431 %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
432 store i32 %mul.epil, i32* %arrayidx3.epil, align 4
433 %inc.epil = add nuw i32 %i.010.epil, 1
434 %epil.iter.sub = add i32 %epil.iter, -1
435 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
436 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
438 for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
; Main loop: four elements per iteration; `or` stands in for +1/+2/+3
; because %i.010 steps by 4 from 0.
441 for.body: ; preds = %for.body, %for.body.preheader.new
442 %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
443 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
444 %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
445 %tmp4 = load i16, i16* %arrayidx, align 2
446 %conv = sext i16 %tmp4 to i32
447 %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010
448 %tmp5 = load i16, i16* %arrayidx1, align 2
449 %conv2 = sext i16 %tmp5 to i32
450 %mul = mul nsw i32 %conv2, %conv
451 %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
452 store i32 %mul, i32* %arrayidx3, align 4
453 %inc = or i32 %i.010, 1
454 %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
455 %tmp6 = load i16, i16* %arrayidx.1, align 2
456 %conv.1 = sext i16 %tmp6 to i32
457 %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc
458 %tmp7 = load i16, i16* %arrayidx1.1, align 2
459 %conv2.1 = sext i16 %tmp7 to i32
460 %mul.1 = mul nsw i32 %conv2.1, %conv.1
461 %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
462 store i32 %mul.1, i32* %arrayidx3.1, align 4
463 %inc.1 = or i32 %i.010, 2
464 %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
465 %tmp8 = load i16, i16* %arrayidx.2, align 2
466 %conv.2 = sext i16 %tmp8 to i32
467 %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1
468 %tmp9 = load i16, i16* %arrayidx1.2, align 2
469 %conv2.2 = sext i16 %tmp9 to i32
470 %mul.2 = mul nsw i32 %conv2.2, %conv.2
471 %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
472 store i32 %mul.2, i32* %arrayidx3.2, align 4
473 %inc.2 = or i32 %i.010, 3
474 %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
475 %tmp10 = load i16, i16* %arrayidx.3, align 2
476 %conv.3 = sext i16 %tmp10 to i32
477 %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2
478 %tmp11 = load i16, i16* %arrayidx1.3, align 2
479 %conv2.3 = sext i16 %tmp11 to i32
480 %mul.3 = mul nsw i32 %conv2.3, %conv.3
481 %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
482 store i32 %mul.3, i32* %arrayidx3.3, align 4
483 %inc.3 = add i32 %i.010, 4
484 %niter.nsub.3 = add i32 %niter, -4
485 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
486 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
489 ; CHECK-LABEL: mul_8x8_2d
490 ; CHECK: @ %for.body4.us
492 ; CHECK-DEFAULT: ldr{{.*}}, #16]!
493 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
495 ; DISABLED-NOT: ldr{{.*}}]!
496 ; DISABLED-NOT: str{{.*}}]!
498 ; CHECK-T2: @ %for.body4.us.epil
499 ; CHECK-T2: ldrb{{.*}}, #1]!
500 ; CHECK-T2: ldr{{.*}}, #4]!
; 2-D accumulate: for each row i and column j,
; C[i][j] += zext(A[i]) * zext(B[i][j]) with i8 operands. The inner j loop
; is 4x-unrolled with a scalar epilogue for M % 4. A[i] is reloaded inside
; the loop on every step (it is not hoisted in the IR). NOTE(review): the
; trailing ret/`}` fall outside this excerpt.
502 define void @mul_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) {
504 %cmp24 = icmp eq i32 %N, 0
505 %cmp222 = icmp eq i32 %M, 0
506 %or.cond = or i1 %cmp24, %cmp222
507 br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
; Unroll bookkeeping for the inner loop is computed once here.
509 for.cond1.preheader.us.preheader: ; preds = %entry
510 %tmp = add i32 %M, -1
511 %xtraiter = and i32 %M, 3
512 %tmp1 = icmp ult i32 %tmp, 3
513 %unroll_iter = sub i32 %M, %xtraiter
514 %lcmp.mod = icmp eq i32 %xtraiter, 0
515 br label %for.cond1.preheader.us
; Outer loop over rows: loads the row pointers B[i] and C[i].
517 for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
518 %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
519 %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.025.us
520 %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.025.us
521 %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
522 %.pre = load i8*, i8** %arrayidx5.us, align 4
523 %.pre30 = load i32*, i32** %arrayidx8.us, align 4
524 br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
; Inner loop, 4x-unrolled: read-modify-write of C[i][j..j+3]. The `or`
; offsets rely on %j.023.us stepping by 4 from 0.
526 for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
527 %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
528 %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
529 %tmp2 = load i8, i8* %arrayidx.us, align 1
530 %conv.us = zext i8 %tmp2 to i32
531 %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us
532 %tmp3 = load i8, i8* %arrayidx6.us, align 1
533 %conv7.us = zext i8 %tmp3 to i32
534 %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
535 %arrayidx9.us = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us
536 %tmp4 = load i32, i32* %arrayidx9.us, align 4
537 %add.us = add nsw i32 %tmp4, %mul.us
538 store i32 %add.us, i32* %arrayidx9.us, align 4
539 %inc.us = or i32 %j.023.us, 1
540 %tmp5 = load i8, i8* %arrayidx.us, align 1
541 %conv.us.1 = zext i8 %tmp5 to i32
542 %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
543 %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
544 %conv7.us.1 = zext i8 %tmp6 to i32
545 %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
546 %arrayidx9.us.1 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us
547 %tmp7 = load i32, i32* %arrayidx9.us.1, align 4
548 %add.us.1 = add nsw i32 %tmp7, %mul.us.1
549 store i32 %add.us.1, i32* %arrayidx9.us.1, align 4
550 %inc.us.1 = or i32 %j.023.us, 2
551 %tmp8 = load i8, i8* %arrayidx.us, align 1
552 %conv.us.2 = zext i8 %tmp8 to i32
553 %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
554 %tmp9 = load i8, i8* %arrayidx6.us.2, align 1
555 %conv7.us.2 = zext i8 %tmp9 to i32
556 %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
557 %arrayidx9.us.2 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.1
558 %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
559 %add.us.2 = add nsw i32 %tmp10, %mul.us.2
560 store i32 %add.us.2, i32* %arrayidx9.us.2, align 4
561 %inc.us.2 = or i32 %j.023.us, 3
562 %tmp11 = load i8, i8* %arrayidx.us, align 1
563 %conv.us.3 = zext i8 %tmp11 to i32
564 %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
565 %tmp12 = load i8, i8* %arrayidx6.us.3, align 1
566 %conv7.us.3 = zext i8 %tmp12 to i32
567 %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
568 %arrayidx9.us.3 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.2
569 %tmp13 = load i32, i32* %arrayidx9.us.3, align 4
570 %add.us.3 = add nsw i32 %tmp13, %mul.us.3
571 store i32 %add.us.3, i32* %arrayidx9.us.3, align 4
572 %inc.us.3 = add i32 %j.023.us, 4
573 %niter.nsub.3 = add i32 %niter, -4
574 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
575 br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
; Decide whether remainder (M % 4) iterations are needed for this row.
577 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
578 %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
579 br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
; Remainder loop: one column per iteration.
581 for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
582 %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
583 %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
584 %tmp14 = load i8, i8* %arrayidx.us, align 1
585 %conv.us.epil = zext i8 %tmp14 to i32
586 %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us.epil
587 %tmp15 = load i8, i8* %arrayidx6.us.epil, align 1
588 %conv7.us.epil = zext i8 %tmp15 to i32
589 %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
590 %arrayidx9.us.epil = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us.epil
591 %tmp16 = load i32, i32* %arrayidx9.us.epil, align 4
592 %add.us.epil = add nsw i32 %tmp16, %mul.us.epil
593 store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
594 %inc.us.epil = add nuw i32 %j.023.us.epil, 1
595 %epil.iter.sub = add i32 %epil.iter, -1
596 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
597 br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
; Advance to the next row until N rows are done.
599 for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
600 %inc11.us = add nuw i32 %i.025.us, 1
601 %exitcond28 = icmp eq i32 %inc11.us, %N
602 br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us
604 for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
608 ; CHECK-LABEL: mul_16x16_2d
609 ; CHECK: @ %for.body4.us
611 ; CHECK-DEFAULT: ldr{{.*}}, #16]!
612 ; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
614 ; DISABLED-NOT: ldr{{.*}}]!
615 ; DISABLED-NOT: str{{.*}}]!
617 ; CHECK-T2: @ %for.body4.us.epil
618 ; CHECK-T2: ldrsh{{.*}}, #2]!
619 ; CHECK-T2: ldr{{.*}}, #4]!
621 define void @mul_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) {
623 %cmp24 = icmp eq i32 %N, 0
624 %cmp222 = icmp eq i32 %M, 0
625 %or.cond = or i1 %cmp24, %cmp222
626 br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
628 for.cond1.preheader.us.preheader: ; preds = %entry
629 %tmp = add i32 %M, -1
630 %xtraiter = and i32 %M, 3
631 %tmp1 = icmp ult i32 %tmp, 3
632 %unroll_iter = sub i32 %M, %xtraiter
633 %lcmp.mod = icmp eq i32 %xtraiter, 0
634 br label %for.cond1.preheader.us
636 for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
637 %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
638 %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.025.us
639 %tmp2 = load i16, i16* %arrayidx.us, align 2
640 %conv.us = sext i16 %tmp2 to i32
641 %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.025.us
642 %tmp3 = load i16*, i16** %arrayidx5.us, align 4
643 %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
644 %tmp4 = load i32*, i32** %arrayidx8.us, align 4
645 br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
647 for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
648 %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
649 %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
650 %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us
651 %tmp5 = load i16, i16* %arrayidx6.us, align 2
652 %conv7.us = sext i16 %tmp5 to i32
653 %mul.us = mul nsw i32 %conv7.us, %conv.us
654 %arrayidx9.us = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us
655 %tmp6 = load i32, i32* %arrayidx9.us, align 4
656 %add.us = add nsw i32 %tmp6, %mul.us
657 store i32 %add.us, i32* %arrayidx9.us, align 4
658 %inc.us = or i32 %j.023.us, 1
659 %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
660 %tmp7 = load i16, i16* %arrayidx6.us.1, align 2
661 %conv7.us.1 = sext i16 %tmp7 to i32
662 %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
663 %arrayidx9.us.1 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us
664 %tmp8 = load i32, i32* %arrayidx9.us.1, align 4
665 %add.us.1 = add nsw i32 %tmp8, %mul.us.1
666 store i32 %add.us.1, i32* %arrayidx9.us.1, align 4
667 %inc.us.1 = or i32 %j.023.us, 2
668 %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
669 %tmp9 = load i16, i16* %arrayidx6.us.2, align 2
670 %conv7.us.2 = sext i16 %tmp9 to i32
671 %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
672 %arrayidx9.us.2 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.1
673 %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
674 %add.us.2 = add nsw i32 %tmp10, %mul.us.2
675 store i32 %add.us.2, i32* %arrayidx9.us.2, align 4
676 %inc.us.2 = or i32 %j.023.us, 3
677 %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
678 %tmp11 = load i16, i16* %arrayidx6.us.3, align 2
679 %conv7.us.3 = sext i16 %tmp11 to i32
680 %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
681 %arrayidx9.us.3 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.2
682 %tmp12 = load i32, i32* %arrayidx9.us.3, align 4
683 %add.us.3 = add nsw i32 %tmp12, %mul.us.3
684 store i32 %add.us.3, i32* %arrayidx9.us.3, align 4
685 %inc.us.3 = add i32 %j.023.us, 4
686 %niter.nsub.3 = add i32 %niter, -4
687 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
688 br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
690 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
691 %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
692 br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
694 for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
695 %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
696 %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
697 %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us.epil
698 %tmp13 = load i16, i16* %arrayidx6.us.epil, align 2
699 %conv7.us.epil = sext i16 %tmp13 to i32
700 %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
701 %arrayidx9.us.epil = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us.epil
702 %tmp14 = load i32, i32* %arrayidx9.us.epil, align 4
703 %add.us.epil = add nsw i32 %tmp14, %mul.us.epil
704 store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
705 %inc.us.epil = add nuw i32 %j.023.us.epil, 1
706 %epil.iter.sub = add i32 %epil.iter, -1
707 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
708 br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
710 for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
711 %inc11.us = add nuw i32 %i.025.us, 1
712 %exitcond28 = icmp eq i32 %inc11.us, %N
713 br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us
715 for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
719 ; CHECK-LABEL: mac_8x8_2d
720 ; CHECK: @ %for.body4.us
722 ; TODO: Both input arrays could use pre-indexed loads.
723 ; TODO: pre-indexed stores.
724 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
725 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
726 ; CHECK-DEFAULT-NOT: str{{.*}}]!
728 ; TODO: Increased complexity shouldn't prevent indexed accesses.
729 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
730 ; CHECK-COMPLEX-NOT: str{{.*}}]!
732 ; DISABLED-NOT: ldr{{.*}}]!
733 ; DISABLED-NOT: str{{.*}}]!
735 ; CHECK-T2: @ %for.body4.us.epil
736 ; CHECK-T2: ldrb{{.*}}, #1]!
; void mac_8x8_2d(A, B, C, N, M):
;   for (i = 0; i < N; i++)
;     for (j = 0; j < M; j++)
;       C[i] += (i32)A[i] * (i32)B[i][j];   // u8 operands zero-extended
; Inner loop is manually unrolled by 4 with a scalar epilogue for the
; M % 4 remainder. Note: A[i] is re-loaded and C[i] is re-stored in every
; unrolled step, while the running sum is also carried in a phi.
; NOTE(review): this listing omits some original lines (entry labels,
; ret, closing braces) — dropped by the extraction, not by this edit.
738 define void @mac_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
; Entry: bail out if either trip count is zero.
740 %cmp22 = icmp eq i32 %N, 0
741 %cmp220 = icmp eq i32 %M, 0
742 %or.cond = or i1 %cmp22, %cmp220
743 br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
; Unroll bookkeeping: xtraiter = M % 4 (epilogue trips),
; unroll_iter = M - xtraiter (unrolled trips), tmp1 = (M-1 < 3) meaning
; no full unrolled iteration exists, lcmp.mod = no epilogue needed.
745 for.cond1.preheader.us.preheader: ; preds = %entry
746 %tmp = add i32 %M, -1
747 %xtraiter = and i32 %M, 3
748 %tmp1 = icmp ult i32 %tmp, 3
749 %unroll_iter = sub i32 %M, %xtraiter
750 %lcmp.mod = icmp eq i32 %xtraiter, 0
751 br label %for.cond1.preheader.us
; Outer loop header: compute &A[i], &B[i], &C[i]; hoisted loads of the
; row pointer B[i] (%.pre) and the initial accumulator C[i] (%.pre28).
753 for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
754 %i.023.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
755 %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.023.us
756 %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.023.us
757 %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.023.us
758 %.pre = load i8*, i8** %arrayidx5.us, align 4
759 %.pre28 = load i32, i32* %arrayidx8.us, align 4
760 br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
; Inner loop, unrolled 4x: each step loads A[i] and B[i][j+k], multiplies
; (zero-extended to i32), accumulates, and stores the partial sum back to
; C[i]. j advances via `or` (low bits known zero), counter %niter by -4.
762 for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
763 %tmp2 = phi i32 [ %add.us.3, %for.body4.us ], [ %.pre28, %for.cond1.preheader.us ]
764 %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
765 %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
766 %tmp3 = load i8, i8* %arrayidx.us, align 1
767 %conv.us = zext i8 %tmp3 to i32
768 %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us
769 %tmp4 = load i8, i8* %arrayidx6.us, align 1
770 %conv7.us = zext i8 %tmp4 to i32
771 %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
772 %add.us = add nsw i32 %mul.us, %tmp2
773 store i32 %add.us, i32* %arrayidx8.us, align 4
774 %inc.us = or i32 %j.021.us, 1
775 %tmp5 = load i8, i8* %arrayidx.us, align 1
776 %conv.us.1 = zext i8 %tmp5 to i32
777 %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
778 %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
779 %conv7.us.1 = zext i8 %tmp6 to i32
780 %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
781 %add.us.1 = add nsw i32 %mul.us.1, %add.us
782 store i32 %add.us.1, i32* %arrayidx8.us, align 4
783 %inc.us.1 = or i32 %j.021.us, 2
784 %tmp7 = load i8, i8* %arrayidx.us, align 1
785 %conv.us.2 = zext i8 %tmp7 to i32
786 %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
787 %tmp8 = load i8, i8* %arrayidx6.us.2, align 1
788 %conv7.us.2 = zext i8 %tmp8 to i32
789 %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
790 %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
791 store i32 %add.us.2, i32* %arrayidx8.us, align 4
792 %inc.us.2 = or i32 %j.021.us, 3
793 %tmp9 = load i8, i8* %arrayidx.us, align 1
794 %conv.us.3 = zext i8 %tmp9 to i32
795 %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
796 %tmp10 = load i8, i8* %arrayidx6.us.3, align 1
797 %conv7.us.3 = zext i8 %tmp10 to i32
798 %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
799 %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
800 store i32 %add.us.3, i32* %arrayidx8.us, align 4
801 %inc.us.3 = add i32 %j.021.us, 4
802 %niter.nsub.3 = add i32 %niter, -4
803 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
804 br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
; Merge point after the unrolled body: carry the accumulator and j into
; the epilogue (or straight to the outer latch if M % 4 == 0).
806 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
807 %.unr = phi i32 [ %.pre28, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
808 %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
809 br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
; Scalar epilogue: one MAC per iteration for the remaining M % 4 trips.
811 for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
812 %tmp11 = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
813 %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
814 %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
815 %tmp12 = load i8, i8* %arrayidx.us, align 1
816 %conv.us.epil = zext i8 %tmp12 to i32
817 %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us.epil
818 %tmp13 = load i8, i8* %arrayidx6.us.epil, align 1
819 %conv7.us.epil = zext i8 %tmp13 to i32
820 %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
821 %add.us.epil = add nsw i32 %mul.us.epil, %tmp11
822 store i32 %add.us.epil, i32* %arrayidx8.us, align 4
823 %inc.us.epil = add nuw i32 %j.021.us.epil, 1
824 %epil.iter.sub = add i32 %epil.iter, -1
825 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
826 br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
; Outer loop latch: advance i, exit when i == N.
828 for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
829 %inc10.us = add nuw i32 %i.023.us, 1
830 %exitcond26 = icmp eq i32 %inc10.us, %N
831 br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us
833 for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
837 ; CHECK-LABEL: mac_16x16_2d
838 ; CHECK: @ %for.body4.us
840 ; TODO: pre-indexed loads for both input arrays.
841 ; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
842 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
844 ; TODO: increased complexity should lead to better codegen.
845 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
847 ; DISABLED-NOT: ldr{{.*}}]!
849 ; CHECK-T2: @ %for.body4.us.epil
850 ; CHECK-T2: ldrsh{{.*}}, #2]!
; void mac_16x16_2d(A, B, C, N, M):
;   for (i = 0; i < N; i++)
;     for (j = 0; j < M; j++)
;       C[i] += (i32)A[i] * (i32)B[i][j];   // i16 operands sign-extended
; Unlike mac_8x8_2d, A[i] is loaded once per outer iteration and the
; accumulator is register-promoted (%arrayidx8.promoted.us); C[i] is
; stored once, at the outer-loop latch. Inner loop unrolled by 4 with a
; scalar epilogue for the M % 4 remainder.
852 define void @mac_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
; Entry: bail out if either trip count is zero.
854 %cmp23 = icmp eq i32 %N, 0
855 %cmp220 = icmp eq i32 %M, 0
856 %or.cond = or i1 %cmp23, %cmp220
857 br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
; Unroll bookkeeping: xtraiter = M % 4, unroll_iter = M - xtraiter,
; tmp1 = no full unrolled iteration exists, lcmp.mod = no epilogue needed.
859 for.cond1.preheader.us.preheader: ; preds = %entry
860 %tmp = add i32 %M, -1
861 %xtraiter = and i32 %M, 3
862 %tmp1 = icmp ult i32 %tmp, 3
863 %unroll_iter = sub i32 %M, %xtraiter
864 %lcmp.mod = icmp eq i32 %xtraiter, 0
865 br label %for.cond1.preheader.us
; Outer loop header: load A[i] (sign-extended once), row pointer B[i],
; and the promoted accumulator C[i].
867 for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
868 %i.024.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
869 %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.024.us
870 %tmp2 = load i16, i16* %arrayidx.us, align 2
871 %conv.us = sext i16 %tmp2 to i32
872 %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.024.us
873 %tmp3 = load i16*, i16** %arrayidx5.us, align 4
874 %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us
875 %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4
876 br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
; Inner loop, unrolled 4x: sext(B[i][j+k]) * sext(A[i]), accumulated in
; registers only — no stores inside the loop.
878 for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
879 %add22.us = phi i32 [ %add.us.3, %for.body4.us ], [ %arrayidx8.promoted.us, %for.cond1.preheader.us ]
880 %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
881 %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
882 %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us
883 %tmp4 = load i16, i16* %arrayidx6.us, align 2
884 %conv7.us = sext i16 %tmp4 to i32
885 %mul.us = mul nsw i32 %conv7.us, %conv.us
886 %add.us = add nsw i32 %mul.us, %add22.us
887 %inc.us = or i32 %j.021.us, 1
888 %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
889 %tmp5 = load i16, i16* %arrayidx6.us.1, align 2
890 %conv7.us.1 = sext i16 %tmp5 to i32
891 %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
892 %add.us.1 = add nsw i32 %mul.us.1, %add.us
893 %inc.us.1 = or i32 %j.021.us, 2
894 %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
895 %tmp6 = load i16, i16* %arrayidx6.us.2, align 2
896 %conv7.us.2 = sext i16 %tmp6 to i32
897 %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
898 %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
899 %inc.us.2 = or i32 %j.021.us, 3
900 %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
901 %tmp7 = load i16, i16* %arrayidx6.us.3, align 2
902 %conv7.us.3 = sext i16 %tmp7 to i32
903 %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
904 %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
905 %inc.us.3 = add i32 %j.021.us, 4
906 %niter.nsub.3 = add i32 %niter, -4
907 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
908 br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
; Merge point: carry the accumulator and j into the epilogue (or to the
; outer latch when M % 4 == 0). %add.us.lcssa.ph is undef on the
; no-unrolled-trips path but is only consumed when the loop actually ran.
910 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
911 %add.us.lcssa.ph = phi i32 [ undef, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
912 %add22.us.unr = phi i32 [ %arrayidx8.promoted.us, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
913 %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
914 br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
; Scalar epilogue: one MAC per iteration for the remaining M % 4 trips.
916 for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
917 %add22.us.epil = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %add22.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
918 %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
919 %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
920 %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us.epil
921 %tmp8 = load i16, i16* %arrayidx6.us.epil, align 2
922 %conv7.us.epil = sext i16 %tmp8 to i32
923 %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
924 %add.us.epil = add nsw i32 %mul.us.epil, %add22.us.epil
925 %inc.us.epil = add nuw i32 %j.021.us.epil, 1
926 %epil.iter.sub = add i32 %epil.iter, -1
927 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
928 br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
; Outer loop latch: the single store of the accumulated C[i], then
; advance i and exit when i == N.
930 for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
931 %add.us.lcssa = phi i32 [ %add.us.lcssa.ph, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ], [ %add.us.epil, %for.body4.us.epil ]
932 store i32 %add.us.lcssa, i32* %arrayidx8.us, align 4
933 %inc10.us = add nuw i32 %i.024.us, 1
934 %exitcond27 = icmp eq i32 %inc10.us, %N
935 br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us
937 for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
941 ; CHECK-LABEL: mul32x32_backwards
944 ; TODO: post increments for decreasing addresses
945 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
946 ; CHECK-DEFAULT-NOT: str{{.*}}]!
948 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
949 ; CHECK-COMPLEX-NOT: str{{.*}}]!
; void mul32x32_backwards(a, b, c, N):
;   for (i = N-1; i >= 0; i--) a[i] = b[i] * c[i];
; The loop walks addresses DOWNWARD (index decrements), which is why the
; CHECK lines above expect no pre-indexed forms. Unroll-by-4 with a
; PROLOGUE (not epilogue): the N % 4 remainder iterations run first,
; then the unrolled body handles the rest.
951 define void @mul32x32_backwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
; Entry: start at i = N-1; nothing to do unless N >= 1.
953 %i.08 = add i32 %N, -1
954 %cmp9 = icmp sgt i32 %i.08, -1
955 br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
; xtraiter = N % 4 prologue trips; skip the prologue if none.
957 for.body.preheader: ; preds = %entry
958 %xtraiter = and i32 %N, 3
959 %lcmp.mod = icmp eq i32 %xtraiter, 0
960 br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
; Prologue: scalar a[i] = b[i] * c[i], decrementing i, for N % 4 trips.
962 for.body.prol: ; preds = %for.body.prol, %for.body.preheader
963 %i.010.prol = phi i32 [ %i.0.prol, %for.body.prol ], [ %i.08, %for.body.preheader ]
964 %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader ]
965 %arrayidx.prol = getelementptr inbounds i32, i32* %b, i32 %i.010.prol
966 %tmp = load i32, i32* %arrayidx.prol, align 4
967 %arrayidx1.prol = getelementptr inbounds i32, i32* %c, i32 %i.010.prol
968 %tmp1 = load i32, i32* %arrayidx1.prol, align 4
969 %mul.prol = mul nsw i32 %tmp1, %tmp
970 %arrayidx2.prol = getelementptr inbounds i32, i32* %a, i32 %i.010.prol
971 store i32 %mul.prol, i32* %arrayidx2.prol, align 4
972 %i.0.prol = add i32 %i.010.prol, -1
973 %prol.iter.sub = add i32 %prol.iter, -1
974 %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
975 br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
; After the prologue: enter the unrolled body only if at least 4 full
; iterations remain (i.e. N-1 >= 3).
977 for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader
978 %i.010.unr = phi i32 [ %i.08, %for.body.preheader ], [ %i.0.prol, %for.body.prol ]
979 %tmp2 = icmp ult i32 %i.08, 3
980 br i1 %tmp2, label %for.cond.cleanup, label %for.body
982 for.cond.cleanup: ; preds = %for.body, %for.body.prol.loopexit, %entry
; Main body, unrolled 4x: a[i-k] = b[i-k] * c[i-k] for k = 0..3, then
; i -= 4; exit once i goes negative.
985 for.body: ; preds = %for.body, %for.body.prol.loopexit
986 %i.010 = phi i32 [ %i.0.3, %for.body ], [ %i.010.unr, %for.body.prol.loopexit ]
987 %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.010
988 %tmp3 = load i32, i32* %arrayidx, align 4
989 %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.010
990 %tmp4 = load i32, i32* %arrayidx1, align 4
991 %mul = mul nsw i32 %tmp4, %tmp3
992 %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.010
993 store i32 %mul, i32* %arrayidx2, align 4
994 %i.0 = add i32 %i.010, -1
995 %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %i.0
996 %tmp5 = load i32, i32* %arrayidx.1, align 4
997 %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %i.0
998 %tmp6 = load i32, i32* %arrayidx1.1, align 4
999 %mul.1 = mul nsw i32 %tmp6, %tmp5
1000 %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %i.0
1001 store i32 %mul.1, i32* %arrayidx2.1, align 4
1002 %i.0.1 = add i32 %i.010, -2
1003 %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %i.0.1
1004 %tmp7 = load i32, i32* %arrayidx.2, align 4
1005 %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %i.0.1
1006 %tmp8 = load i32, i32* %arrayidx1.2, align 4
1007 %mul.2 = mul nsw i32 %tmp8, %tmp7
1008 %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %i.0.1
1009 store i32 %mul.2, i32* %arrayidx2.2, align 4
1010 %i.0.2 = add i32 %i.010, -3
1011 %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %i.0.2
1012 %tmp9 = load i32, i32* %arrayidx.3, align 4
1013 %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %i.0.2
1014 %tmp10 = load i32, i32* %arrayidx1.3, align 4
1015 %mul.3 = mul nsw i32 %tmp10, %tmp9
1016 %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %i.0.2
1017 store i32 %mul.3, i32* %arrayidx2.3, align 4
1018 %i.0.3 = add i32 %i.010, -4
1019 %cmp.3 = icmp sgt i32 %i.0.3, -1
1020 br i1 %cmp.3, label %for.body, label %for.cond.cleanup
1023 ; CHECK-LABEL: mul32x32_forwards
1024 ; CHECK: @ %for.body
1026 ; TODO: Would be good if the complexity limit didn't have to be increased to
1027 ; enable the pre-indexed accesses.
1029 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
1030 ; CHECK-DEFAULT-NOT: str{{.*}}]!
1032 ; CHECK-COMPLEX: ldr{{.*}}, #16]!
1033 ; CHECK-COMPLEX: ldr{{.*}}, #16]!
1034 ; CHECK-COMPLEX: str{{.*}}, #16]!
1036 ; CHECK-T2: @ %for.body.epil
1037 ; CHECK-T2: ldr{{.*}}, #4]!
1038 ; CHECK-T2: ldr{{.*}}, #4]!
1039 ; CHECK-T2: str{{.*}}, #4]!
; void mul32x32_forwards(a, b, c, N):
;   for (i = 0; i < N; i++) a[i] = b[i] * c[i];
; Forward (increasing-address) counterpart of mul32x32_backwards.
; Unrolled by 4 with a scalar epilogue for the N % 4 remainder; the
; CHECK-COMPLEX/CHECK-T2 lines above expect pre-indexed #16/#4 accesses.
1041 define void @mul32x32_forwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
; Entry: nothing to do when N == 0.
1043 %cmp8 = icmp eq i32 %N, 0
1044 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
; xtraiter = N % 4; tmp1 = (N-1 < 3), i.e. no full unrolled iteration.
1046 for.body.preheader: ; preds = %entry
1047 %tmp = add i32 %N, -1
1048 %xtraiter = and i32 %N, 3
1049 %tmp1 = icmp ult i32 %tmp, 3
1050 br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1052 for.body.preheader.new: ; preds = %for.body.preheader
1053 %unroll_iter = sub i32 %N, %xtraiter
; Merge point after the unrolled body: branch to the epilogue for the
; remaining N % 4 iterations, or finish when there are none.
1056 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1057 %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1058 %lcmp.mod = icmp eq i32 %xtraiter, 0
1059 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Scalar epilogue: a[i] = b[i] * c[i], one element per trip.
1061 for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
1062 %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1063 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1064 %arrayidx.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil
1065 %tmp2 = load i32, i32* %arrayidx.epil, align 4
1066 %arrayidx1.epil = getelementptr inbounds i32, i32* %c, i32 %i.09.epil
1067 %tmp3 = load i32, i32* %arrayidx1.epil, align 4
1068 %mul.epil = mul nsw i32 %tmp3, %tmp2
1069 %arrayidx2.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil
1070 store i32 %mul.epil, i32* %arrayidx2.epil, align 4
1071 %inc.epil = add nuw nsw i32 %i.09.epil, 1
1072 %epil.iter.sub = add i32 %epil.iter, -1
1073 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1074 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1076 for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
; Main body, unrolled 4x: a[i+k] = b[i+k] * c[i+k] for k = 0..3; the
; intermediate indices use `or` (low bits of i known zero), i += 4.
1079 for.body: ; preds = %for.body, %for.body.preheader.new
1080 %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1081 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1082 %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.09
1083 %tmp4 = load i32, i32* %arrayidx, align 4
1084 %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.09
1085 %tmp5 = load i32, i32* %arrayidx1, align 4
1086 %mul = mul nsw i32 %tmp5, %tmp4
1087 %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.09
1088 store i32 %mul, i32* %arrayidx2, align 4
1089 %inc = or i32 %i.09, 1
1090 %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %inc
1091 %tmp6 = load i32, i32* %arrayidx.1, align 4
1092 %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %inc
1093 %tmp7 = load i32, i32* %arrayidx1.1, align 4
1094 %mul.1 = mul nsw i32 %tmp7, %tmp6
1095 %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %inc
1096 store i32 %mul.1, i32* %arrayidx2.1, align 4
1097 %inc.1 = or i32 %i.09, 2
1098 %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
1099 %tmp8 = load i32, i32* %arrayidx.2, align 4
1100 %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %inc.1
1101 %tmp9 = load i32, i32* %arrayidx1.2, align 4
1102 %mul.2 = mul nsw i32 %tmp9, %tmp8
1103 %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %inc.1
1104 store i32 %mul.2, i32* %arrayidx2.2, align 4
1105 %inc.2 = or i32 %i.09, 3
1106 %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
1107 %tmp10 = load i32, i32* %arrayidx.3, align 4
1108 %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %inc.2
1109 %tmp11 = load i32, i32* %arrayidx1.3, align 4
1110 %mul.3 = mul nsw i32 %tmp11, %tmp10
1111 %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2
1112 store i32 %mul.3, i32* %arrayidx2.3, align 4
1113 %inc.3 = add nuw nsw i32 %i.09, 4
1114 %niter.nsub.3 = add i32 %niter, -4
1115 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1116 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body