1 ; RUN: opt -arm-parallel-dsp -dce -mtriple=armv7-a -S %s -o - | FileCheck %s
; single_block: computes a[0]*b[0] + a[1]*b[1] + %acc, where the i16 elements
; are sign-extended to i32 before multiplying.
; The directives below expect the output to contain an i32 load through a
; bitcast of %a, an i32 load through a bitcast of %b, and a call to
; llvm.arm.smlad taking those two loads with %acc as the accumulator operand.
3 ; CHECK-LABEL: single_block
4 ; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
5 ; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
6 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
7 ; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
8 ; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)
9 define i32 @single_block(i16* %a, i16* %b, i32 %acc) {
; Element 0 of each array: load i16, sign-extend to i32, multiply.
11 %ld.a.0 = load i16, i16* %a
12 %sext.a.0 = sext i16 %ld.a.0 to i32
13 %ld.b.0 = load i16, i16* %b
14 %sext.b.0 = sext i16 %ld.b.0 to i32
15 %mul.0 = mul i32 %sext.a.0, %sext.b.0
; Element 1 (the i16 immediately after %a / %b): same load/sext/mul pattern.
16 %addr.a.1 = getelementptr i16, i16* %a, i32 1
17 %addr.b.1 = getelementptr i16, i16* %b, i32 1
18 %ld.a.1 = load i16, i16* %addr.a.1
19 %sext.a.1 = sext i16 %ld.a.1 to i32
20 %ld.b.1 = load i16, i16* %addr.b.1
21 %sext.b.1 = sext i16 %ld.b.1 to i32
22 %mul.1 = mul i32 %sext.a.1, %sext.b.1
; Sum the two products, then add the incoming accumulator.
23 %add = add i32 %mul.0, %mul.1
24 %res = add i32 %add, %acc
; single_block_64: 64-bit accumulator variant. The two i32 products
; a[0]*b[0] and a[1]*b[1] are each sign-extended to i64, summed, and added to
; the i64 %acc.
; The directives below expect the same bitcast + i32 load pairs for %a and %b,
; feeding a call to llvm.arm.smlald (i64 result) with %acc as the accumulator.
28 ; CHECK-LABEL: single_block_64
29 ; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
30 ; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
31 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
32 ; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
33 ; CHECK: call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 %acc)
34 define i64 @single_block_64(i16* %a, i16* %b, i64 %acc) {
; Element 0 of each array: load i16, sign-extend to i32, multiply.
36 %ld.a.0 = load i16, i16* %a
37 %sext.a.0 = sext i16 %ld.a.0 to i32
38 %ld.b.0 = load i16, i16* %b
39 %sext.b.0 = sext i16 %ld.b.0 to i32
40 %mul.0 = mul i32 %sext.a.0, %sext.b.0
; Element 1 of each array: same load/sext/mul pattern.
41 %addr.a.1 = getelementptr i16, i16* %a, i32 1
42 %addr.b.1 = getelementptr i16, i16* %b, i32 1
43 %ld.a.1 = load i16, i16* %addr.a.1
44 %sext.a.1 = sext i16 %ld.a.1 to i32
45 %ld.b.1 = load i16, i16* %addr.b.1
46 %sext.b.1 = sext i16 %ld.b.1 to i32
47 %mul.1 = mul i32 %sext.a.1, %sext.b.1
; Widen both i32 products to i64 before the additions.
48 %sext.mul.0 = sext i32 %mul.0 to i64
49 %sext.mul.1 = sext i32 %mul.1 to i64
50 %add = add i64 %sext.mul.0, %sext.mul.1
51 %res = add i64 %add, %acc
; multi_block: same two-element multiply/accumulate arithmetic as
; single_block, but the directives below expect the llvm.arm.smlad call to take
; the constant 0 as its accumulator operand instead of %acc.
; NOTE(review): presumably the final add with %acc lives in a different basic
; block from %add (hence the function name), so %acc is not folded into the
; intrinsic call - confirm against the full function body.
55 ; CHECK-LABEL: multi_block
56 ; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
57 ; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
58 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
59 ; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
60 ; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
61 define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {
; Element 0 of each array: load i16, sign-extend to i32, multiply.
63 %ld.a.0 = load i16, i16* %a
64 %sext.a.0 = sext i16 %ld.a.0 to i32
65 %ld.b.0 = load i16, i16* %b
66 %sext.b.0 = sext i16 %ld.b.0 to i32
67 %mul.0 = mul i32 %sext.a.0, %sext.b.0
; Element 1 of each array: same load/sext/mul pattern.
68 %addr.a.1 = getelementptr i16, i16* %a, i32 1
69 %addr.b.1 = getelementptr i16, i16* %b, i32 1
70 %ld.a.1 = load i16, i16* %addr.a.1
71 %sext.a.1 = sext i16 %ld.a.1 to i32
72 %ld.b.1 = load i16, i16* %addr.b.1
73 %sext.b.1 = sext i16 %ld.b.1 to i32
74 %mul.1 = mul i32 %sext.a.1, %sext.b.1
; Sum of the two products.
75 %add = add i32 %mul.0, %mul.1
; Accumulator is added to the sum of products here.
79 %res = add i32 %add, %acc
; multi_block_64: 64-bit accumulator variant of multi_block. The two i32
; products are sign-extended to i64 and summed; %acc is added afterwards.
; The directives below expect a call to llvm.arm.smlald whose accumulator
; operand is the constant 0 rather than %acc.
; NOTE(review): presumably the final add with %acc lives in a different basic
; block from %add (hence the function name) - confirm against the full
; function body.
83 ; CHECK-LABEL: multi_block_64
84 ; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
85 ; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
86 ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
87 ; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
88 ; CHECK: call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 0)
89 define i64 @multi_block_64(i16* %a, i16* %b, i64 %acc) {
; Element 0 of each array: load i16, sign-extend to i32, multiply.
91 %ld.a.0 = load i16, i16* %a
92 %sext.a.0 = sext i16 %ld.a.0 to i32
93 %ld.b.0 = load i16, i16* %b
94 %sext.b.0 = sext i16 %ld.b.0 to i32
95 %mul.0 = mul i32 %sext.a.0, %sext.b.0
; Element 1 of each array: same load/sext/mul pattern.
96 %addr.a.1 = getelementptr i16, i16* %a, i32 1
97 %addr.b.1 = getelementptr i16, i16* %b, i32 1
98 %ld.a.1 = load i16, i16* %addr.a.1
99 %sext.a.1 = sext i16 %ld.a.1 to i32
100 %ld.b.1 = load i16, i16* %addr.b.1
101 %sext.b.1 = sext i16 %ld.b.1 to i32
102 %mul.1 = mul i32 %sext.a.1, %sext.b.1
; Widen both i32 products to i64 before summing them.
103 %sext.mul.0 = sext i32 %mul.0 to i64
104 %sext.mul.1 = sext i32 %mul.1 to i64
105 %add = add i64 %sext.mul.0, %sext.mul.1
; Accumulator is added to the sum of products here.
109 %res = add i64 %add, %acc
; multi_block_1: negative test. The arithmetic is the same
; a[0]*b[0] + a[1]*b[1] + %acc as in single_block, but the directive below
; requires that no call to llvm.arm.smlad appears in the output.
; NOTE(review): presumably %mul.0 and %mul.1 are computed in different basic
; blocks, which is what prevents the transformation - confirm against the full
; function body.
113 ; CHECK-LABEL: multi_block_1
114 ; CHECK-NOT: call i32 @llvm.arm.smlad
115 define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) {
; Element 0 of each array: load i16, sign-extend to i32, multiply.
117 %ld.a.0 = load i16, i16* %a
118 %sext.a.0 = sext i16 %ld.a.0 to i32
119 %ld.b.0 = load i16, i16* %b
120 %sext.b.0 = sext i16 %ld.b.0 to i32
121 %mul.0 = mul i32 %sext.a.0, %sext.b.0
; Element 1 of each array: same load/sext/mul pattern.
125 %addr.a.1 = getelementptr i16, i16* %a, i32 1
126 %addr.b.1 = getelementptr i16, i16* %b, i32 1
127 %ld.a.1 = load i16, i16* %addr.a.1
128 %sext.a.1 = sext i16 %ld.a.1 to i32
129 %ld.b.1 = load i16, i16* %addr.b.1
130 %sext.b.1 = sext i16 %ld.b.1 to i32
131 %mul.1 = mul i32 %sext.a.1, %sext.b.1
; Sum the two products, then add the incoming accumulator.
132 %add = add i32 %mul.0, %mul.1
133 %res = add i32 %add, %acc
137 ; TODO: Four smlads should be generated here, but mul.0 and mul.3 remain as scalars.
; num_load_limit: performs 16 i16 loads (indices 0..7 of both %a and %b),
; forms eight i32 products, and reduces them together with %acc into %res.
; The directives below expect exactly three calls to llvm.arm.smlad in the
; output for this function: three matches, then no further match.
139 ; CHECK-LABEL: num_load_limit
140 ; CHECK: call i32 @llvm.arm.smlad
141 ; CHECK: call i32 @llvm.arm.smlad
142 ; CHECK: call i32 @llvm.arm.smlad
143 ; CHECK-NOT: call i32 @llvm.arm.smlad
144 define i32 @num_load_limit(i16* %a, i16* %b, i32 %acc) {
; Pair 0/1: a[0]*b[0] + a[1]*b[1].
146 %ld.a.0 = load i16, i16* %a
147 %sext.a.0 = sext i16 %ld.a.0 to i32
148 %ld.b.0 = load i16, i16* %b
149 %sext.b.0 = sext i16 %ld.b.0 to i32
150 %mul.0 = mul i32 %sext.a.0, %sext.b.0
151 %addr.a.1 = getelementptr i16, i16* %a, i32 1
152 %addr.b.1 = getelementptr i16, i16* %b, i32 1
153 %ld.a.1 = load i16, i16* %addr.a.1
154 %sext.a.1 = sext i16 %ld.a.1 to i32
155 %ld.b.1 = load i16, i16* %addr.b.1
156 %sext.b.1 = sext i16 %ld.b.1 to i32
157 %mul.1 = mul i32 %sext.a.1, %sext.b.1
158 %add.0 = add i32 %mul.0, %mul.1
; Pair 2/3: elements 2 and 3 are loaded and sign-extended, but %mul.2 reuses
; %sext.a.0 and %sext.b.0 (the same operands as %mul.0) and %mul.3 multiplies
; %sext.a.1 by %sext.b.3. %sext.a.2, %sext.b.2 and %sext.a.3 have no user in
; the visible instructions.
; NOTE(review): confirm whether these operands are intentional before changing
; them; the expected number of smlad calls may depend on this exact shape.
160 %addr.a.2 = getelementptr i16, i16* %a, i32 2
161 %addr.b.2 = getelementptr i16, i16* %b, i32 2
162 %ld.a.2 = load i16, i16* %addr.a.2
163 %sext.a.2 = sext i16 %ld.a.2 to i32
164 %ld.b.2 = load i16, i16* %addr.b.2
165 %sext.b.2 = sext i16 %ld.b.2 to i32
166 %mul.2 = mul i32 %sext.a.0, %sext.b.0
167 %addr.a.3 = getelementptr i16, i16* %a, i32 3
168 %addr.b.3 = getelementptr i16, i16* %b, i32 3
169 %ld.a.3 = load i16, i16* %addr.a.3
170 %sext.a.3 = sext i16 %ld.a.3 to i32
171 %ld.b.3 = load i16, i16* %addr.b.3
172 %sext.b.3 = sext i16 %ld.b.3 to i32
173 %mul.3 = mul i32 %sext.a.1, %sext.b.3
174 %add.3 = add i32 %mul.2, %mul.3
; Pair 4/5: a[4]*b[4] + a[5]*b[5].
176 %addr.a.4 = getelementptr i16, i16* %a, i32 4
177 %addr.b.4 = getelementptr i16, i16* %b, i32 4
178 %ld.a.4 = load i16, i16* %addr.a.4
179 %sext.a.4 = sext i16 %ld.a.4 to i32
180 %ld.b.4 = load i16, i16* %addr.b.4
181 %sext.b.4 = sext i16 %ld.b.4 to i32
182 %mul.4 = mul i32 %sext.a.4, %sext.b.4
183 %addr.a.5 = getelementptr i16, i16* %a, i32 5
184 %addr.b.5 = getelementptr i16, i16* %b, i32 5
185 %ld.a.5 = load i16, i16* %addr.a.5
186 %sext.a.5 = sext i16 %ld.a.5 to i32
187 %ld.b.5 = load i16, i16* %addr.b.5
188 %sext.b.5 = sext i16 %ld.b.5 to i32
189 %mul.5 = mul i32 %sext.a.5, %sext.b.5
190 %add.5 = add i32 %mul.4, %mul.5
; Pair 6/7: a[6]*b[6] + a[7]*b[7].
192 %addr.a.6 = getelementptr i16, i16* %a, i32 6
193 %addr.b.6 = getelementptr i16, i16* %b, i32 6
194 %ld.a.6 = load i16, i16* %addr.a.6
195 %sext.a.6 = sext i16 %ld.a.6 to i32
196 %ld.b.6 = load i16, i16* %addr.b.6
197 %sext.b.6 = sext i16 %ld.b.6 to i32
198 %mul.6 = mul i32 %sext.a.6, %sext.b.6
199 %addr.a.7 = getelementptr i16, i16* %a, i32 7
200 %addr.b.7 = getelementptr i16, i16* %b, i32 7
201 %ld.a.7 = load i16, i16* %addr.a.7
202 %sext.a.7 = sext i16 %ld.a.7 to i32
203 %ld.b.7 = load i16, i16* %addr.b.7
204 %sext.b.7 = sext i16 %ld.b.7 to i32
205 %mul.7 = mul i32 %sext.a.7, %sext.b.7
206 %add.7 = add i32 %mul.6, %mul.7
; Reduction: (add.7 + add.5) + (add.3 + add.0), then add the accumulator.
208 %add.10 = add i32 %add.7, %add.5
209 %add.11 = add i32 %add.3, %add.0
210 %add.12 = add i32 %add.10, %add.11
211 %res = add i32 %add.12, %acc
; too_many_loads: negative test. Performs 18 i16 loads (nine from each of %a
; and %b), forms nine i32 products, and reduces them together with %acc.
; The directive below requires that no call to llvm.arm.smlad appears in the
; output for this function.
; NOTE(review): presumably the number of loads exceeds a limit in the pass,
; which is why nothing is transformed - confirm against the pass source.
215 ; CHECK-LABEL: too_many_loads
216 ; CHECK-NOT: call i32 @llvm.arm.smlad
217 define i32 @too_many_loads(i16* %a, i16* %b, i32 %acc) {
; Pair 0/1: a[0]*b[0] + a[1]*b[1].
219 %ld.a.0 = load i16, i16* %a
220 %sext.a.0 = sext i16 %ld.a.0 to i32
221 %ld.b.0 = load i16, i16* %b
222 %sext.b.0 = sext i16 %ld.b.0 to i32
223 %mul.0 = mul i32 %sext.a.0, %sext.b.0
224 %addr.a.1 = getelementptr i16, i16* %a, i32 1
225 %addr.b.1 = getelementptr i16, i16* %b, i32 1
226 %ld.a.1 = load i16, i16* %addr.a.1
227 %sext.a.1 = sext i16 %ld.a.1 to i32
228 %ld.b.1 = load i16, i16* %addr.b.1
229 %sext.b.1 = sext i16 %ld.b.1 to i32
230 %mul.1 = mul i32 %sext.a.1, %sext.b.1
231 %add.0 = add i32 %mul.0, %mul.1
; Pair 2/3: elements 2 and 3 are loaded and sign-extended, but %mul.2 reuses
; %sext.a.0 and %sext.b.0 (the same operands as %mul.0) and %mul.3 multiplies
; %sext.a.1 by %sext.b.3. %sext.a.2, %sext.b.2 and %sext.a.3 have no user in
; the visible instructions.
; NOTE(review): confirm whether these operands are intentional.
233 %addr.a.2 = getelementptr i16, i16* %a, i32 2
234 %addr.b.2 = getelementptr i16, i16* %b, i32 2
235 %ld.a.2 = load i16, i16* %addr.a.2
236 %sext.a.2 = sext i16 %ld.a.2 to i32
237 %ld.b.2 = load i16, i16* %addr.b.2
238 %sext.b.2 = sext i16 %ld.b.2 to i32
239 %mul.2 = mul i32 %sext.a.0, %sext.b.0
240 %addr.a.3 = getelementptr i16, i16* %a, i32 3
241 %addr.b.3 = getelementptr i16, i16* %b, i32 3
242 %ld.a.3 = load i16, i16* %addr.a.3
243 %sext.a.3 = sext i16 %ld.a.3 to i32
244 %ld.b.3 = load i16, i16* %addr.b.3
245 %sext.b.3 = sext i16 %ld.b.3 to i32
246 %mul.3 = mul i32 %sext.a.1, %sext.b.3
247 %add.3 = add i32 %mul.2, %mul.3
; Pair 4/5: a[4]*b[4] + a[5]*b[5].
249 %addr.a.4 = getelementptr i16, i16* %a, i32 4
250 %addr.b.4 = getelementptr i16, i16* %b, i32 4
251 %ld.a.4 = load i16, i16* %addr.a.4
252 %sext.a.4 = sext i16 %ld.a.4 to i32
253 %ld.b.4 = load i16, i16* %addr.b.4
254 %sext.b.4 = sext i16 %ld.b.4 to i32
255 %mul.4 = mul i32 %sext.a.4, %sext.b.4
256 %addr.a.5 = getelementptr i16, i16* %a, i32 5
257 %addr.b.5 = getelementptr i16, i16* %b, i32 5
258 %ld.a.5 = load i16, i16* %addr.a.5
259 %sext.a.5 = sext i16 %ld.a.5 to i32
260 %ld.b.5 = load i16, i16* %addr.b.5
261 %sext.b.5 = sext i16 %ld.b.5 to i32
262 %mul.5 = mul i32 %sext.a.5, %sext.b.5
263 %add.5 = add i32 %mul.4, %mul.5
; Pair 6/7: a[6]*b[6] + a[7]*b[7].
265 %addr.a.6 = getelementptr i16, i16* %a, i32 6
266 %addr.b.6 = getelementptr i16, i16* %b, i32 6
267 %ld.a.6 = load i16, i16* %addr.a.6
268 %sext.a.6 = sext i16 %ld.a.6 to i32
269 %ld.b.6 = load i16, i16* %addr.b.6
270 %sext.b.6 = sext i16 %ld.b.6 to i32
271 %mul.6 = mul i32 %sext.a.6, %sext.b.6
272 %addr.a.7 = getelementptr i16, i16* %a, i32 7
273 %addr.b.7 = getelementptr i16, i16* %b, i32 7
274 %ld.a.7 = load i16, i16* %addr.a.7
275 %sext.a.7 = sext i16 %ld.a.7 to i32
276 %ld.b.7 = load i16, i16* %addr.b.7
277 %sext.b.7 = sext i16 %ld.b.7 to i32
278 %mul.7 = mul i32 %sext.a.7, %sext.b.7
279 %add.7 = add i32 %mul.6, %mul.7
; Ninth, unpaired product.
; NOTE(review): %addr.a.8 and %addr.b.8 use index 7, the same offset as
; %addr.a.7 and %addr.b.7 - confirm whether index 8 was intended.
281 %addr.a.8 = getelementptr i16, i16* %a, i32 7
282 %addr.b.8 = getelementptr i16, i16* %b, i32 7
283 %ld.a.8 = load i16, i16* %addr.a.8
284 %sext.a.8 = sext i16 %ld.a.8 to i32
285 %ld.b.8 = load i16, i16* %addr.b.8
286 %sext.b.8 = sext i16 %ld.b.8 to i32
287 %mul.8 = mul i32 %sext.a.8, %sext.b.8
; Reduction: (add.7 + add.5) + (add.3 + add.0), plus %acc, plus %mul.8.
289 %add.10 = add i32 %add.7, %add.5
290 %add.11 = add i32 %add.3, %add.0
291 %add.12 = add i32 %add.10, %add.11
292 %add.13 = add i32 %add.12, %acc
293 %res = add i32 %add.13, %mul.8