; RUN: llc --mtriple=thumbv7em -mattr=+fp-armv8 -O3 %s -o - | \
; RUN: FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2

; RUN: llc --mtriple=thumbv7em -mattr=+fp-armv8 -O3 -lsr-preferred-addressing-mode=none %s -o - | \
; RUN: FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED

; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 %s -o - | \
; RUN: FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2

; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-preferred-addressing-mode=postindexed %s -o - | \
; RUN: FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED

; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-preferred-addressing-mode=preindexed %s -o - | \
; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-T2

; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED

; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 -lsr-complexity-limit=2147483647 %s -o - | \
; RUN: FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2

; Tests to check that post increment addressing modes are used instead of
; updating base pointers with add instructions.

; TODO: I think we should be able to use post inc addressing with VLDM

; CHECK-LABEL: test_fma
; CHECK-DEFAULT: vldr s{{.*}}, #8]
; CHECK-DEFAULT: vldr s{{.*}}, #8]
; CHECK-DEFAULT: vldr s{{.*}}, #12]
; CHECK-DEFAULT: vldr s{{.*}}, #12]

; CHECK-COMPLEX: vldr s{{.*}}, #8]
; CHECK-COMPLEX: vldr s{{.*}}, #8]
; CHECK-COMPLEX: vldr s{{.*}}, #12]
; CHECK-COMPLEX: vldr s{{.*}}, #12]
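
; An unrolled-by-two f32 multiply-add reduction over %a and %b. The checks
; above expect plain immediate-offset vldr loads (no writeback); the TODO at
; the top of the file notes that post-inc VLDM would be an improvement.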
define float @test_fma(float* %a, float* %b, i32 %N) {
entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ]
  %gep.a.1 = getelementptr inbounds float, float* %a, i32 %idx.1
  %a.1 = load float, float* %gep.a.1
  %gep.b.1 = getelementptr inbounds float, float* %b, i32 %idx.1
  %b.1 = load float, float* %gep.b.1
  %fmul.1 = fmul float %a.1, %b.1
  %fma.1 = fadd float %fmul.1, %res
  %idx.2 = or i32 %idx.1, 1
  %gep.a.2 = getelementptr inbounds float, float* %a, i32 %idx.2
  %a.2 = load float, float* %gep.a.2
  %gep.b.2 = getelementptr inbounds float, float* %b, i32 %idx.2
  %b.2 = load float, float* %gep.b.2
  %fmul.2 = fmul float %a.2, %b.2
  %fma.2 = fadd float %fmul.2, %fma.1
  %i.next = add nsw nuw i32 %i, -2
  %idx.next = add nsw nuw i32 %idx.1, 2
  %cmp = icmp ult i32 %i.next, %N
  br i1 %cmp, label %loop, label %exit

exit:
  ret float %fma.2
}

; CHECK-LABEL: convolve_16bit
; TODO: Both arrays should use indexing
; CHECK-DEFAULT: ldr{{.*}}, #8]!
; CHECK-DEFAULT-NOT: ldr{{.*}}]!

; CHECK-COMPLEX: ldr{{.*}}, #8]!
; CHECK-COMPLEX-NOT: ldr{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
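
; Nested 2D convolution over i16 input and filter arrays, with the innermost
; loop unrolled by 4. Only one of the two input streams currently gets a
; pre-indexed (writeback) load, as the TODO above notes.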
define void @convolve_16bit(i16** nocapture readonly %input_image, i16** nocapture readonly %filter,
                            i32 %filter_dim, i32 %out_width, i32 %out_height,
                            i32** nocapture readonly %convolved) {
entry:
  %cmp92 = icmp eq i32 %out_height, 0
  br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph

for.cond1.preheader.lr.ph: ; preds = %entry
  %xtraiter = and i32 %filter_dim, 3
  %unroll_iter = sub i32 %filter_dim, %xtraiter
  br label %for.cond1.preheader

for.cond1.preheader: ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
  %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ]
  %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093
  %tmp3 = load i32*, i32** %arrayidx22, align 4
  br label %for.cond9.preheader.us.us.preheader

for.cond9.preheader.us.us.preheader: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.lr.ph
  %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ]
  br label %for.cond9.preheader.us.us

for.cond9.preheader.us.us: ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us, %for.cond9.preheader.us.us.preheader
  %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
  %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
  %add.us.us = add i32 %filter_y.056.us.us, %res_y.093
  %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us
  %tmp5 = load i16*, i16** %arrayidx.us.us, align 4
  %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us
  %tmp6 = load i16*, i16** %arrayidx15.us.us, align 4
  br label %for.body12.us.us

for.body12.us.us: ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
  %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
  %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ]
  %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us
  %arrayidx14.us.us = getelementptr inbounds i16, i16* %tmp5, i32 %filter_x.053.us.us
  %tmp9 = load i16, i16* %arrayidx14.us.us, align 2
  %conv.us.us = sext i16 %tmp9 to i32
  %arrayidx16.us.us = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us
  %tmp10 = load i16, i16* %arrayidx16.us.us, align 2
  %conv17.us.us = sext i16 %tmp10 to i32
  %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us
  %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us
  %inc.us.us = or i32 %filter_x.053.us.us, 1
  %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us
  %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us
  %tmp11 = load i16, i16* %arrayidx14.us.us.1, align 2
  %conv.us.us.1 = sext i16 %tmp11 to i32
  %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.1
  %tmp12 = load i16, i16* %arrayidx16.us.us.1, align 2
  %conv17.us.us.1 = sext i16 %tmp12 to i32
  %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1
  %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us
  %inc.us.us.1 = or i32 %filter_x.053.us.us, 2
  %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us
  %arrayidx14.us.us.2 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.1
  %tmp13 = load i16, i16* %arrayidx14.us.us.2, align 2
  %conv.us.us.2 = sext i16 %tmp13 to i32
  %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.2
  %tmp14 = load i16, i16* %arrayidx16.us.us.2, align 2
  %conv17.us.us.2 = sext i16 %tmp14 to i32
  %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2
  %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1
  %inc.us.us.2 = or i32 %filter_x.053.us.us, 3
  %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us
  %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.2
  %tmp15 = load i16, i16* %arrayidx14.us.us.3, align 2
  %conv.us.us.3 = sext i16 %tmp15 to i32
  %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.3
  %tmp16 = load i16, i16* %arrayidx16.us.us.3, align 2
  %conv17.us.us.3 = sext i16 %tmp16 to i32
  %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3
  %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2
  %inc.us.us.3 = add i32 %filter_x.053.us.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us

for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
  %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1
  %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim
  br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us

for.cond5.for.cond.cleanup7_crit_edge.us: ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us
  %arrayidx23.us = getelementptr inbounds i32, i32* %tmp3, i32 %res_x.060.us
  store i32 %add18.us.us.3, i32* %arrayidx23.us, align 4
  %add25.us = add nuw i32 %res_x.060.us, 1
  %exitcond99 = icmp eq i32 %add25.us, %out_width
  br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader

for.cond.cleanup3: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.preheader, %for.cond1.preheader
  %add28 = add nuw i32 %res_y.093, 1
  %exitcond100 = icmp eq i32 %add28, %out_height
  br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader

for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry
  ret void
}

; CHECK-LABEL: mul_8x8

; CHECK-DEFAULT: str{{.*}}, #16]!
; CHECK-DEFAULT: ldrb{{.*}}, #4]!
; CHECK-DEFAULT: ldrb{{.*}}, #4]!

; CHECK-COMPLEX: str{{.*}}, #16]!
; CHECK-COMPLEX: ldrb{{.*}}, #4]!
; CHECK-COMPLEX: ldrb{{.*}}, #4]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: str{{.*}}, #4]!
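
; Element-wise product of two i8 arrays stored to an i32 array, unrolled by 4
; with a scalar epilogue; both the unrolled body and the epilogue are expected
; to use writeback loads and stores.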
define void @mul_8x8(i8* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
entry:
  %cmp9 = icmp eq i32 %N, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new: ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i8, i8* %A, i32 %i.010.epil
  %tmp2 = load i8, i8* %arrayidx.epil, align 1
  %conv.epil = zext i8 %tmp2 to i32
  %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
  %tmp3 = load i8, i8* %arrayidx1.epil, align 1
  %conv2.epil = zext i8 %tmp3 to i32
  %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
  %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
  store i32 %mul.epil, i32* %arrayidx3.epil, align 4
  %inc.epil = add nuw i32 %i.010.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

for.body: ; preds = %for.body, %for.body.preheader.new
  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.010
  %tmp4 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %tmp4 to i32
  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
  %tmp5 = load i8, i8* %arrayidx1, align 1
  %conv2 = zext i8 %tmp5 to i32
  %mul = mul nuw nsw i32 %conv2, %conv
  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
  store i32 %mul, i32* %arrayidx3, align 4
  %inc = or i32 %i.010, 1
  %arrayidx.1 = getelementptr inbounds i8, i8* %A, i32 %inc
  %tmp6 = load i8, i8* %arrayidx.1, align 1
  %conv.1 = zext i8 %tmp6 to i32
  %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
  %tmp7 = load i8, i8* %arrayidx1.1, align 1
  %conv2.1 = zext i8 %tmp7 to i32
  %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
  %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
  store i32 %mul.1, i32* %arrayidx3.1, align 4
  %inc.1 = or i32 %i.010, 2
  %arrayidx.2 = getelementptr inbounds i8, i8* %A, i32 %inc.1
  %tmp8 = load i8, i8* %arrayidx.2, align 1
  %conv.2 = zext i8 %tmp8 to i32
  %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
  %tmp9 = load i8, i8* %arrayidx1.2, align 1
  %conv2.2 = zext i8 %tmp9 to i32
  %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
  %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
  store i32 %mul.2, i32* %arrayidx3.2, align 4
  %inc.2 = or i32 %i.010, 3
  %arrayidx.3 = getelementptr inbounds i8, i8* %A, i32 %inc.2
  %tmp10 = load i8, i8* %arrayidx.3, align 1
  %conv.3 = zext i8 %tmp10 to i32
  %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
  %tmp11 = load i8, i8* %arrayidx1.3, align 1
  %conv2.3 = zext i8 %tmp11 to i32
  %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
  %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
  store i32 %mul.3, i32* %arrayidx3.3, align 4
  %inc.3 = add i32 %i.010, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

; CHECK-LABEL: mul_16x8

; CHECK-DEFAULT: str{{.*}}, #16]!
; CHECK-DEFAULT: ldrsh{{.*}}, #8]!

; CHECK-COMPLEX: ldrsh{{.*}}, #8]!
; CHECK-COMPLEX: str{{.*}}, #16]!
; CHECK-COMPLEX: ldrb{{.*}}, #4]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: str{{.*}}, #4]!
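
; Same pattern as mul_8x8 but with an i16 input for %A, so the checks look for
; a writeback ldrsh alongside the ldrb and str forms.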
define void @mul_16x8(i16* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
entry:
  %cmp9 = icmp eq i32 %N, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new: ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
  %tmp2 = load i16, i16* %arrayidx.epil, align 2
  %conv.epil = sext i16 %tmp2 to i32
  %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
  %tmp3 = load i8, i8* %arrayidx1.epil, align 1
  %conv2.epil = zext i8 %tmp3 to i32
  %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
  %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
  store i32 %mul.epil, i32* %arrayidx3.epil, align 4
  %inc.epil = add nuw i32 %i.010.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

for.body: ; preds = %for.body, %for.body.preheader.new
  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
  %tmp4 = load i16, i16* %arrayidx, align 2
  %conv = sext i16 %tmp4 to i32
  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
  %tmp5 = load i8, i8* %arrayidx1, align 1
  %conv2 = zext i8 %tmp5 to i32
  %mul = mul nsw i32 %conv2, %conv
  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
  store i32 %mul, i32* %arrayidx3, align 4
  %inc = or i32 %i.010, 1
  %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
  %tmp6 = load i16, i16* %arrayidx.1, align 2
  %conv.1 = sext i16 %tmp6 to i32
  %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
  %tmp7 = load i8, i8* %arrayidx1.1, align 1
  %conv2.1 = zext i8 %tmp7 to i32
  %mul.1 = mul nsw i32 %conv2.1, %conv.1
  %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
  store i32 %mul.1, i32* %arrayidx3.1, align 4
  %inc.1 = or i32 %i.010, 2
  %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
  %tmp8 = load i16, i16* %arrayidx.2, align 2
  %conv.2 = sext i16 %tmp8 to i32
  %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
  %tmp9 = load i8, i8* %arrayidx1.2, align 1
  %conv2.2 = zext i8 %tmp9 to i32
  %mul.2 = mul nsw i32 %conv2.2, %conv.2
  %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
  store i32 %mul.2, i32* %arrayidx3.2, align 4
  %inc.2 = or i32 %i.010, 3
  %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
  %tmp10 = load i16, i16* %arrayidx.3, align 2
  %conv.3 = sext i16 %tmp10 to i32
  %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
  %tmp11 = load i8, i8* %arrayidx1.3, align 1
  %conv2.3 = zext i8 %tmp11 to i32
  %mul.3 = mul nsw i32 %conv2.3, %conv.3
  %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
  store i32 %mul.3, i32* %arrayidx3.3, align 4
  %inc.3 = add i32 %i.010, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

; CHECK-LABEL: mul_16x16

; TODO: pre-indexed loads
; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
; CHECK-DEFAULT: str{{.*}}, #16]!
; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!

; CHECK-COMPLEX: ldrsh{{.*}}]!
; CHECK-COMPLEX: ldrsh{{.*}}]!
; CHECK-COMPLEX: str{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: str{{.*}}, #4]!
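
; i16 * i16 products stored to i32; in the default main loop only the store is
; expected to use a writeback form, per the TODO above.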
define void @mul_16x16(i16* nocapture readonly %A, i16* nocapture readonly %B, i32* nocapture %C, i32 %N) {
entry:
  %cmp9 = icmp eq i32 %N, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new: ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
  %tmp2 = load i16, i16* %arrayidx.epil, align 2
  %conv.epil = sext i16 %tmp2 to i32
  %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.010.epil
  %tmp3 = load i16, i16* %arrayidx1.epil, align 2
  %conv2.epil = sext i16 %tmp3 to i32
  %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
  %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
  store i32 %mul.epil, i32* %arrayidx3.epil, align 4
  %inc.epil = add nuw i32 %i.010.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

for.body: ; preds = %for.body, %for.body.preheader.new
  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
  %tmp4 = load i16, i16* %arrayidx, align 2
  %conv = sext i16 %tmp4 to i32
  %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010
  %tmp5 = load i16, i16* %arrayidx1, align 2
  %conv2 = sext i16 %tmp5 to i32
  %mul = mul nsw i32 %conv2, %conv
  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
  store i32 %mul, i32* %arrayidx3, align 4
  %inc = or i32 %i.010, 1
  %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
  %tmp6 = load i16, i16* %arrayidx.1, align 2
  %conv.1 = sext i16 %tmp6 to i32
  %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc
  %tmp7 = load i16, i16* %arrayidx1.1, align 2
  %conv2.1 = sext i16 %tmp7 to i32
  %mul.1 = mul nsw i32 %conv2.1, %conv.1
  %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
  store i32 %mul.1, i32* %arrayidx3.1, align 4
  %inc.1 = or i32 %i.010, 2
  %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
  %tmp8 = load i16, i16* %arrayidx.2, align 2
  %conv.2 = sext i16 %tmp8 to i32
  %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1
  %tmp9 = load i16, i16* %arrayidx1.2, align 2
  %conv2.2 = sext i16 %tmp9 to i32
  %mul.2 = mul nsw i32 %conv2.2, %conv.2
  %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
  store i32 %mul.2, i32* %arrayidx3.2, align 4
  %inc.2 = or i32 %i.010, 3
  %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
  %tmp10 = load i16, i16* %arrayidx.3, align 2
  %conv.3 = sext i16 %tmp10 to i32
  %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2
  %tmp11 = load i16, i16* %arrayidx1.3, align 2
  %conv2.3 = sext i16 %tmp11 to i32
  %mul.3 = mul nsw i32 %conv2.3, %conv.3
  %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
  store i32 %mul.3, i32* %arrayidx3.3, align 4
  %inc.3 = add i32 %i.010, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

; CHECK-LABEL: mul_8x8_2d
; CHECK: @ %for.body4.us

; CHECK-DEFAULT: ldr{{.*}}, #16]!
; CHECK-DEFAULT: ldrb{{.*}}, #4]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: ldr{{.*}}, #4]!
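
; 2D variant: for each outer row, the inner loop multiplies an i8 value from
; %A by an i8 row of %B and accumulates into a row of %C in memory.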
define void @mul_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) {
entry:
  %cmp24 = icmp eq i32 %N, 0
  %cmp222 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp24, %cmp222
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.025.us
  %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.025.us
  %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
  %.pre = load i8*, i8** %arrayidx5.us, align 4
  %.pre30 = load i32*, i32** %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %tmp2 = load i8, i8* %arrayidx.us, align 1
  %conv.us = zext i8 %tmp2 to i32
  %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us
  %tmp3 = load i8, i8* %arrayidx6.us, align 1
  %conv7.us = zext i8 %tmp3 to i32
  %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
  %arrayidx9.us = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us
  %tmp4 = load i32, i32* %arrayidx9.us, align 4
  %add.us = add nsw i32 %tmp4, %mul.us
  store i32 %add.us, i32* %arrayidx9.us, align 4
  %inc.us = or i32 %j.023.us, 1
  %tmp5 = load i8, i8* %arrayidx.us, align 1
  %conv.us.1 = zext i8 %tmp5 to i32
  %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
  %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
  %conv7.us.1 = zext i8 %tmp6 to i32
  %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
  %arrayidx9.us.1 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us
  %tmp7 = load i32, i32* %arrayidx9.us.1, align 4
  %add.us.1 = add nsw i32 %tmp7, %mul.us.1
  store i32 %add.us.1, i32* %arrayidx9.us.1, align 4
  %inc.us.1 = or i32 %j.023.us, 2
  %tmp8 = load i8, i8* %arrayidx.us, align 1
  %conv.us.2 = zext i8 %tmp8 to i32
  %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
  %tmp9 = load i8, i8* %arrayidx6.us.2, align 1
  %conv7.us.2 = zext i8 %tmp9 to i32
  %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
  %arrayidx9.us.2 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.1
  %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
  %add.us.2 = add nsw i32 %tmp10, %mul.us.2
  store i32 %add.us.2, i32* %arrayidx9.us.2, align 4
  %inc.us.2 = or i32 %j.023.us, 3
  %tmp11 = load i8, i8* %arrayidx.us, align 1
  %conv.us.3 = zext i8 %tmp11 to i32
  %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
  %tmp12 = load i8, i8* %arrayidx6.us.3, align 1
  %conv7.us.3 = zext i8 %tmp12 to i32
  %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
  %arrayidx9.us.3 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.2
  %tmp13 = load i32, i32* %arrayidx9.us.3, align 4
  %add.us.3 = add nsw i32 %tmp13, %mul.us.3
  store i32 %add.us.3, i32* %arrayidx9.us.3, align 4
  %inc.us.3 = add i32 %j.023.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %tmp14 = load i8, i8* %arrayidx.us, align 1
  %conv.us.epil = zext i8 %tmp14 to i32
  %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us.epil
  %tmp15 = load i8, i8* %arrayidx6.us.epil, align 1
  %conv7.us.epil = zext i8 %tmp15 to i32
  %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
  %arrayidx9.us.epil = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us.epil
  %tmp16 = load i32, i32* %arrayidx9.us.epil, align 4
  %add.us.epil = add nsw i32 %tmp16, %mul.us.epil
  store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
  %inc.us.epil = add nuw i32 %j.023.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %inc11.us = add nuw i32 %i.025.us, 1
  %exitcond28 = icmp eq i32 %inc11.us, %N
  br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mul_16x16_2d
; CHECK: @ %for.body4.us

; CHECK-DEFAULT: ldr{{.*}}, #16]!
; CHECK-DEFAULT: ldrsh{{.*}}, #8]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: ldr{{.*}}, #4]!
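
; 2D i16 variant of the above; the row scalar from %A is loaded once in the
; outer loop and the inner loop streams through the %B and %C rows.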
define void @mul_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) {
entry:
  %cmp24 = icmp eq i32 %N, 0
  %cmp222 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp24, %cmp222
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.025.us
  %tmp2 = load i16, i16* %arrayidx.us, align 2
  %conv.us = sext i16 %tmp2 to i32
  %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.025.us
  %tmp3 = load i16*, i16** %arrayidx5.us, align 4
  %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
  %tmp4 = load i32*, i32** %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us
  %tmp5 = load i16, i16* %arrayidx6.us, align 2
  %conv7.us = sext i16 %tmp5 to i32
  %mul.us = mul nsw i32 %conv7.us, %conv.us
  %arrayidx9.us = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us
  %tmp6 = load i32, i32* %arrayidx9.us, align 4
  %add.us = add nsw i32 %tmp6, %mul.us
  store i32 %add.us, i32* %arrayidx9.us, align 4
  %inc.us = or i32 %j.023.us, 1
  %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
  %tmp7 = load i16, i16* %arrayidx6.us.1, align 2
  %conv7.us.1 = sext i16 %tmp7 to i32
  %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
  %arrayidx9.us.1 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us
  %tmp8 = load i32, i32* %arrayidx9.us.1, align 4
  %add.us.1 = add nsw i32 %tmp8, %mul.us.1
  store i32 %add.us.1, i32* %arrayidx9.us.1, align 4
  %inc.us.1 = or i32 %j.023.us, 2
  %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
  %tmp9 = load i16, i16* %arrayidx6.us.2, align 2
  %conv7.us.2 = sext i16 %tmp9 to i32
  %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
  %arrayidx9.us.2 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.1
  %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
  %add.us.2 = add nsw i32 %tmp10, %mul.us.2
  store i32 %add.us.2, i32* %arrayidx9.us.2, align 4
  %inc.us.2 = or i32 %j.023.us, 3
  %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
  %tmp11 = load i16, i16* %arrayidx6.us.3, align 2
  %conv7.us.3 = sext i16 %tmp11 to i32
  %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
  %arrayidx9.us.3 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.2
  %tmp12 = load i32, i32* %arrayidx9.us.3, align 4
  %add.us.3 = add nsw i32 %tmp12, %mul.us.3
  store i32 %add.us.3, i32* %arrayidx9.us.3, align 4
  %inc.us.3 = add i32 %j.023.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us.epil
  %tmp13 = load i16, i16* %arrayidx6.us.epil, align 2
  %conv7.us.epil = sext i16 %tmp13 to i32
  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
  %arrayidx9.us.epil = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us.epil
  %tmp14 = load i32, i32* %arrayidx9.us.epil, align 4
  %add.us.epil = add nsw i32 %tmp14, %mul.us.epil
  store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
  %inc.us.epil = add nuw i32 %j.023.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %inc11.us = add nuw i32 %i.025.us, 1
  %exitcond28 = icmp eq i32 %inc11.us, %N
  br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mac_8x8_2d
; CHECK: @ %for.body4.us

; TODO: Both input arrays could use pre-indexed loads.
; TODO: pre-indexed stores.
; CHECK-DEFAULT: ldrb{{.*}}, #4]!
; CHECK-DEFAULT-NOT: ldr{{.*}}]!
; CHECK-DEFAULT-NOT: str{{.*}}]!

; TODO: Increased complexity shouldn't prevent indexed accesses.
; CHECK-COMPLEX-NOT: ldr{{.*}}]!
; CHECK-COMPLEX-NOT: str{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrb{{.*}}, #1]!
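
; Multiply-accumulate into a single i32 element of %C per outer iteration; the
; running sum is written back to memory on every inner iteration.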
define void @mac_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
entry:
  %cmp22 = icmp eq i32 %N, 0
  %cmp220 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp22, %cmp220
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.023.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.023.us
  %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.023.us
  %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.023.us
  %.pre = load i8*, i8** %arrayidx5.us, align 4
  %.pre28 = load i32, i32* %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
  %tmp2 = phi i32 [ %add.us.3, %for.body4.us ], [ %.pre28, %for.cond1.preheader.us ]
  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %tmp3 = load i8, i8* %arrayidx.us, align 1
  %conv.us = zext i8 %tmp3 to i32
  %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us
  %tmp4 = load i8, i8* %arrayidx6.us, align 1
  %conv7.us = zext i8 %tmp4 to i32
  %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
  %add.us = add nsw i32 %mul.us, %tmp2
  store i32 %add.us, i32* %arrayidx8.us, align 4
  %inc.us = or i32 %j.021.us, 1
  %tmp5 = load i8, i8* %arrayidx.us, align 1
  %conv.us.1 = zext i8 %tmp5 to i32
  %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
  %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
  %conv7.us.1 = zext i8 %tmp6 to i32
  %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
  %add.us.1 = add nsw i32 %mul.us.1, %add.us
  store i32 %add.us.1, i32* %arrayidx8.us, align 4
  %inc.us.1 = or i32 %j.021.us, 2
  %tmp7 = load i8, i8* %arrayidx.us, align 1
  %conv.us.2 = zext i8 %tmp7 to i32
  %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
  %tmp8 = load i8, i8* %arrayidx6.us.2, align 1
  %conv7.us.2 = zext i8 %tmp8 to i32
  %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
  store i32 %add.us.2, i32* %arrayidx8.us, align 4
  %inc.us.2 = or i32 %j.021.us, 3
  %tmp9 = load i8, i8* %arrayidx.us, align 1
  %conv.us.3 = zext i8 %tmp9 to i32
  %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
  %tmp10 = load i8, i8* %arrayidx6.us.3, align 1
  %conv7.us.3 = zext i8 %tmp10 to i32
  %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
  store i32 %add.us.3, i32* %arrayidx8.us, align 4
  %inc.us.3 = add i32 %j.021.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %.unr = phi i32 [ %.pre28, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %tmp11 = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %tmp12 = load i8, i8* %arrayidx.us, align 1
  %conv.us.epil = zext i8 %tmp12 to i32
  %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us.epil
  %tmp13 = load i8, i8* %arrayidx6.us.epil, align 1
  %conv7.us.epil = zext i8 %tmp13 to i32
  %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
  %add.us.epil = add nsw i32 %mul.us.epil, %tmp11
  store i32 %add.us.epil, i32* %arrayidx8.us, align 4
  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %inc10.us = add nuw i32 %i.023.us, 1
  %exitcond26 = icmp eq i32 %inc10.us, %N
  br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mac_16x16_2d
; CHECK: @ %for.body4.us

; TODO: pre-indexed loads for both input arrays.
; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
; CHECK-DEFAULT-NOT: ldr{{.*}}]!

; TODO: increased complexity should lead to better codegen.
; CHECK-COMPLEX-NOT: ldr{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrsh{{.*}}, #2]!
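
; As mac_8x8_2d but with i16 inputs; here the accumulator stays in a register
; and is only stored back to %C once per outer iteration.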
define void @mac_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
entry:
  %cmp23 = icmp eq i32 %N, 0
  %cmp220 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp23, %cmp220
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.024.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.024.us
  %tmp2 = load i16, i16* %arrayidx.us, align 2
  %conv.us = sext i16 %tmp2 to i32
  %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.024.us
  %tmp3 = load i16*, i16** %arrayidx5.us, align 4
  %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us
  %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
  %add22.us = phi i32 [ %add.us.3, %for.body4.us ], [ %arrayidx8.promoted.us, %for.cond1.preheader.us ]
  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us
  %tmp4 = load i16, i16* %arrayidx6.us, align 2
  %conv7.us = sext i16 %tmp4 to i32
  %mul.us = mul nsw i32 %conv7.us, %conv.us
  %add.us = add nsw i32 %mul.us, %add22.us
  %inc.us = or i32 %j.021.us, 1
  %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
  %tmp5 = load i16, i16* %arrayidx6.us.1, align 2
  %conv7.us.1 = sext i16 %tmp5 to i32
  %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
  %add.us.1 = add nsw i32 %mul.us.1, %add.us
  %inc.us.1 = or i32 %j.021.us, 2
  %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
  %tmp6 = load i16, i16* %arrayidx6.us.2, align 2
  %conv7.us.2 = sext i16 %tmp6 to i32
  %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
  %inc.us.2 = or i32 %j.021.us, 3
  %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
  %tmp7 = load i16, i16* %arrayidx6.us.3, align 2
  %conv7.us.3 = sext i16 %tmp7 to i32
  %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
  %inc.us.3 = add i32 %j.021.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %add.us.lcssa.ph = phi i32 [ undef, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
  %add22.us.unr = phi i32 [ %arrayidx8.promoted.us, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %add22.us.epil = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %add22.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us.epil
  %tmp8 = load i16, i16* %arrayidx6.us.epil, align 2
  %conv7.us.epil = sext i16 %tmp8 to i32
  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
  %add.us.epil = add nsw i32 %mul.us.epil, %add22.us.epil
  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %add.us.lcssa = phi i32 [ %add.us.lcssa.ph, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ], [ %add.us.epil, %for.body4.us.epil ]
  store i32 %add.us.lcssa, i32* %arrayidx8.us, align 4
  %inc10.us = add nuw i32 %i.024.us, 1
  %exitcond27 = icmp eq i32 %inc10.us, %N
  br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mul32x32_backwards

; TODO: post increments for decreasing addresses
; CHECK-DEFAULT-NOT: ldr{{.*}}]!
; CHECK-DEFAULT-NOT: str{{.*}}]!

; CHECK-COMPLEX-NOT: ldr{{.*}}]!
; CHECK-COMPLEX-NOT: str{{.*}}]!
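
; The induction variable counts down, so the addresses decrease; as the TODO
; above notes, no writeback forms are expected here yet.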
define void @mul32x32_backwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
entry:
  %i.08 = add i32 %N, -1
  %cmp9 = icmp sgt i32 %i.08, -1
  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol: ; preds = %for.body.prol, %for.body.preheader
  %i.010.prol = phi i32 [ %i.0.prol, %for.body.prol ], [ %i.08, %for.body.preheader ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader ]
  %arrayidx.prol = getelementptr inbounds i32, i32* %b, i32 %i.010.prol
  %tmp = load i32, i32* %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds i32, i32* %c, i32 %i.010.prol
  %tmp1 = load i32, i32* %arrayidx1.prol, align 4
  %mul.prol = mul nsw i32 %tmp1, %tmp
  %arrayidx2.prol = getelementptr inbounds i32, i32* %a, i32 %i.010.prol
  store i32 %mul.prol, i32* %arrayidx2.prol, align 4
  %i.0.prol = add i32 %i.010.prol, -1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader
  %i.010.unr = phi i32 [ %i.08, %for.body.preheader ], [ %i.0.prol, %for.body.prol ]
  %tmp2 = icmp ult i32 %i.08, 3
  br i1 %tmp2, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %for.body.prol.loopexit, %entry
  ret void

for.body: ; preds = %for.body, %for.body.prol.loopexit
  %i.010 = phi i32 [ %i.0.3, %for.body ], [ %i.010.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.010
  %tmp3 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.010
  %tmp4 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %tmp4, %tmp3
  %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.010
  store i32 %mul, i32* %arrayidx2, align 4
  %i.0 = add i32 %i.010, -1
  %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %i.0
  %tmp5 = load i32, i32* %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %i.0
  %tmp6 = load i32, i32* %arrayidx1.1, align 4
  %mul.1 = mul nsw i32 %tmp6, %tmp5
  %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %i.0
  store i32 %mul.1, i32* %arrayidx2.1, align 4
  %i.0.1 = add i32 %i.010, -2
  %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %i.0.1
  %tmp7 = load i32, i32* %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %i.0.1
  %tmp8 = load i32, i32* %arrayidx1.2, align 4
  %mul.2 = mul nsw i32 %tmp8, %tmp7
  %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %i.0.1
  store i32 %mul.2, i32* %arrayidx2.2, align 4
  %i.0.2 = add i32 %i.010, -3
  %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %i.0.2
  %tmp9 = load i32, i32* %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %i.0.2
  %tmp10 = load i32, i32* %arrayidx1.3, align 4
  %mul.3 = mul nsw i32 %tmp10, %tmp9
  %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %i.0.2
  store i32 %mul.3, i32* %arrayidx2.3, align 4
  %i.0.3 = add i32 %i.010, -4
  %cmp.3 = icmp sgt i32 %i.0.3, -1
  br i1 %cmp.3, label %for.body, label %for.cond.cleanup
}

; CHECK-LABEL: mul32x32_forwards
; CHECK: @ %for.body

; TODO: It would be good if the complexity limit didn't have to be increased
; to enable the pre-indexed accesses.

; CHECK-DEFAULT-NOT: ldr{{.*}}]!
; CHECK-DEFAULT-NOT: str{{.*}}]!

; CHECK-COMPLEX: ldr{{.*}}, #16]!
; CHECK-COMPLEX: ldr{{.*}}, #16]!
; CHECK-COMPLEX: str{{.*}}, #16]!

; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldr{{.*}}, #4]!
; CHECK-T2: ldr{{.*}}, #4]!
; CHECK-T2: str{{.*}}, #4]!
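
; Forward-counting i32 version; pre-indexed forms only show up with the raised
; LSR complexity limit (CHECK-COMPLEX) or in the epilogue (CHECK-T2).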
define void @mul32x32_forwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new: ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
  %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil
  %tmp2 = load i32, i32* %arrayidx.epil, align 4
  %arrayidx1.epil = getelementptr inbounds i32, i32* %c, i32 %i.09.epil
  %tmp3 = load i32, i32* %arrayidx1.epil, align 4
  %mul.epil = mul nsw i32 %tmp3, %tmp2
  %arrayidx2.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil
  store i32 %mul.epil, i32* %arrayidx2.epil, align 4
  %inc.epil = add nuw nsw i32 %i.09.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

for.body: ; preds = %for.body, %for.body.preheader.new
  %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.09
  %tmp4 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.09
  %tmp5 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %tmp5, %tmp4
  %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.09
  store i32 %mul, i32* %arrayidx2, align 4
  %inc = or i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %inc
  %tmp6 = load i32, i32* %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %inc
  %tmp7 = load i32, i32* %arrayidx1.1, align 4
  %mul.1 = mul nsw i32 %tmp7, %tmp6
  %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %inc
  store i32 %mul.1, i32* %arrayidx2.1, align 4
  %inc.1 = or i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
  %tmp8 = load i32, i32* %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %inc.1
  %tmp9 = load i32, i32* %arrayidx1.2, align 4
  %mul.2 = mul nsw i32 %tmp9, %tmp8
  %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %inc.1
  store i32 %mul.2, i32* %arrayidx2.2, align 4
  %inc.2 = or i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
  %tmp10 = load i32, i32* %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %inc.2
  %tmp11 = load i32, i32* %arrayidx1.3, align 4
  %mul.3 = mul nsw i32 %tmp11, %tmp10
  %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2
  store i32 %mul.3, i32* %arrayidx2.3, align 4
  %inc.3 = add nuw nsw i32 %i.09, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}