; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

; Various reductions generated from SLP vectorizing unrolled loops. Generated
; from https://godbolt.org/z/ebxdPh1Kz with some less interesting cases removed.
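;
; As a rough illustration only (an assumption about the source shape, not part
; of the test; see the godbolt link above for the exact code), the i32 add
; reductions correspond to scalar loops of roughly this form, fully unrolled
; so that the SLP vectorizer can fold the chain of adds into vector reductions:
;
;   int addv(int *x) {
;     int s = 0;
;     for (int i = 0; i < N; i++) /* N = 2, 4, 8, ..., 128 below */
;       s += x[i];
;     return s;
;   }
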
define i32 @addv2i32i32(ptr %x) {
; CHECK-LABEL: addv2i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrd r0, r1, [r0]
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: bx lr
entry:
%0 = load i32, ptr %x, align 4
%arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1
%1 = load i32, ptr %arrayidx.1, align 4
%add.1 = add nsw i32 %1, %0
ret i32 %add.1
}

define i32 @addv4i32i32(ptr %x) {
; CHECK-LABEL: addv4i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, ptr %x, align 4
%1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
ret i32 %1
}

define i32 @addv8i32i32(ptr %x) {
; CHECK-LABEL: addv8i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vaddv.u32 r0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i32>, ptr %x, align 4
%1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
ret i32 %1
}

define i32 @addv16i32i32(ptr %x) {
; CHECK-LABEL: addv16i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i32>, ptr %x, align 4
%1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0)
ret i32 %1
}

define i32 @addv24i32i32(ptr %x) {
; CHECK-LABEL: addv24i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i32>, ptr %x, align 4
%arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8
%1 = load <16 x i32>, ptr %arrayidx.8, align 4
%2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
%op.rdx = add nsw i32 %2, %3
ret i32 %op.rdx
}

define i32 @addv32i32i32(ptr %x) {
; CHECK-LABEL: addv32i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: vaddv.u32 r0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #32]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #48]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #64]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #80]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #96]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #112]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i32>, ptr %x, align 4
%1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %0)
ret i32 %1
}

define i32 @addv64i32i32(ptr %x) {
; CHECK-LABEL: addv64i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #96]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #144]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #160]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #176]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #192]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #208]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #224]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #240]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <64 x i32>, ptr %x, align 4
%1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %0)
ret i32 %1
}

define i32 @addv128i32i32(ptr %x) {
; CHECK-LABEL: addv128i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #96]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #144]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #160]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #176]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #192]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #208]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #224]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #240]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #256]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #272]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #288]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #304]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #320]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #336]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #352]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #368]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #384]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #400]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #416]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #432]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #448]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #464]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #480]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #496]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%wide.load = load <4 x i32>, ptr %x, align 4
%0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
%1 = getelementptr inbounds i32, ptr %x, i32 4
%wide.load.1 = load <4 x i32>, ptr %1, align 4
%2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.1)
%3 = add i32 %2, %0
%4 = getelementptr inbounds i32, ptr %x, i32 8
%wide.load.2 = load <4 x i32>, ptr %4, align 4
%5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.2)
%6 = add i32 %5, %3
%7 = getelementptr inbounds i32, ptr %x, i32 12
%wide.load.3 = load <4 x i32>, ptr %7, align 4
%8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.3)
%9 = add i32 %8, %6
%10 = getelementptr inbounds i32, ptr %x, i32 16
%wide.load.4 = load <4 x i32>, ptr %10, align 4
%11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.4)
%12 = add i32 %11, %9
%13 = getelementptr inbounds i32, ptr %x, i32 20
%wide.load.5 = load <4 x i32>, ptr %13, align 4
%14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.5)
%15 = add i32 %14, %12
%16 = getelementptr inbounds i32, ptr %x, i32 24
%wide.load.6 = load <4 x i32>, ptr %16, align 4
%17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.6)
%18 = add i32 %17, %15
%19 = getelementptr inbounds i32, ptr %x, i32 28
%wide.load.7 = load <4 x i32>, ptr %19, align 4
%20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.7)
%21 = add i32 %20, %18
%22 = getelementptr inbounds i32, ptr %x, i32 32
%wide.load.8 = load <4 x i32>, ptr %22, align 4
%23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.8)
%24 = add i32 %23, %21
%25 = getelementptr inbounds i32, ptr %x, i32 36
%wide.load.9 = load <4 x i32>, ptr %25, align 4
%26 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.9)
%27 = add i32 %26, %24
%28 = getelementptr inbounds i32, ptr %x, i32 40
%wide.load.10 = load <4 x i32>, ptr %28, align 4
%29 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.10)
%30 = add i32 %29, %27
%31 = getelementptr inbounds i32, ptr %x, i32 44
%wide.load.11 = load <4 x i32>, ptr %31, align 4
%32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.11)
%33 = add i32 %32, %30
%34 = getelementptr inbounds i32, ptr %x, i32 48
%wide.load.12 = load <4 x i32>, ptr %34, align 4
%35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.12)
%36 = add i32 %35, %33
%37 = getelementptr inbounds i32, ptr %x, i32 52
%wide.load.13 = load <4 x i32>, ptr %37, align 4
%38 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.13)
%39 = add i32 %38, %36
%40 = getelementptr inbounds i32, ptr %x, i32 56
%wide.load.14 = load <4 x i32>, ptr %40, align 4
%41 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.14)
%42 = add i32 %41, %39
%43 = getelementptr inbounds i32, ptr %x, i32 60
%wide.load.15 = load <4 x i32>, ptr %43, align 4
%44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.15)
%45 = add i32 %44, %42
%46 = getelementptr inbounds i32, ptr %x, i32 64
%wide.load.16 = load <4 x i32>, ptr %46, align 4
%47 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.16)
%48 = add i32 %47, %45
%49 = getelementptr inbounds i32, ptr %x, i32 68
%wide.load.17 = load <4 x i32>, ptr %49, align 4
%50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.17)
%51 = add i32 %50, %48
%52 = getelementptr inbounds i32, ptr %x, i32 72
%wide.load.18 = load <4 x i32>, ptr %52, align 4
%53 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.18)
%54 = add i32 %53, %51
%55 = getelementptr inbounds i32, ptr %x, i32 76
%wide.load.19 = load <4 x i32>, ptr %55, align 4
%56 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.19)
%57 = add i32 %56, %54
%58 = getelementptr inbounds i32, ptr %x, i32 80
%wide.load.20 = load <4 x i32>, ptr %58, align 4
%59 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.20)
%60 = add i32 %59, %57
%61 = getelementptr inbounds i32, ptr %x, i32 84
%wide.load.21 = load <4 x i32>, ptr %61, align 4
%62 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.21)
%63 = add i32 %62, %60
%64 = getelementptr inbounds i32, ptr %x, i32 88
%wide.load.22 = load <4 x i32>, ptr %64, align 4
%65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.22)
%66 = add i32 %65, %63
%67 = getelementptr inbounds i32, ptr %x, i32 92
%wide.load.23 = load <4 x i32>, ptr %67, align 4
%68 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.23)
%69 = add i32 %68, %66
%70 = getelementptr inbounds i32, ptr %x, i32 96
%wide.load.24 = load <4 x i32>, ptr %70, align 4
%71 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.24)
%72 = add i32 %71, %69
%73 = getelementptr inbounds i32, ptr %x, i32 100
%wide.load.25 = load <4 x i32>, ptr %73, align 4
%74 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.25)
%75 = add i32 %74, %72
%76 = getelementptr inbounds i32, ptr %x, i32 104
%wide.load.26 = load <4 x i32>, ptr %76, align 4
%77 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.26)
%78 = add i32 %77, %75
%79 = getelementptr inbounds i32, ptr %x, i32 108
%wide.load.27 = load <4 x i32>, ptr %79, align 4
%80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.27)
%81 = add i32 %80, %78
%82 = getelementptr inbounds i32, ptr %x, i32 112
%wide.load.28 = load <4 x i32>, ptr %82, align 4
%83 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.28)
%84 = add i32 %83, %81
%85 = getelementptr inbounds i32, ptr %x, i32 116
%wide.load.29 = load <4 x i32>, ptr %85, align 4
%86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.29)
%87 = add i32 %86, %84
%88 = getelementptr inbounds i32, ptr %x, i32 120
%wide.load.30 = load <4 x i32>, ptr %88, align 4
%89 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.30)
%90 = add i32 %89, %87
%91 = getelementptr inbounds i32, ptr %x, i32 124
%wide.load.31 = load <4 x i32>, ptr %91, align 4
%92 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.31)
%93 = add i32 %92, %90
ret i32 %93
}

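; The tests below sum i16 elements into an i32 result. The sign extends fold
; into the loads (vldrh.s32) or into a single vaddv.s16/vaddva.s16 reduction.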
define i32 @addv2i32i16(ptr %x) {
; CHECK-LABEL: addv2i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrsh.w r1, [r0]
; CHECK-NEXT: ldrsh.w r0, [r0, #2]
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: bx lr
entry:
%0 = load i16, ptr %x, align 2
%conv = sext i16 %0 to i32
%arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
%1 = load i16, ptr %arrayidx.1, align 2
%conv.1 = sext i16 %1 to i32
%add.1 = add nsw i32 %conv, %conv.1
ret i32 %add.1
}

define i32 @addv4i32i16(ptr %x) {
; CHECK-LABEL: addv4i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i16>, ptr %x, align 2
%1 = sext <4 x i16> %0 to <4 x i32>
%2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
ret i32 %2
}

define i32 @addv8i32i16(ptr %x) {
; CHECK-LABEL: addv8i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vaddv.s16 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, ptr %x, align 2
%1 = sext <8 x i16> %0 to <8 x i32>
%2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
ret i32 %2
}

define i32 @addv16i32i16(ptr %x) {
; CHECK-LABEL: addv16i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r0]
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i16>, ptr %x, align 2
%1 = sext <16 x i16> %0 to <16 x i32>
%2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
ret i32 %2
}

define i32 @addv24i32i16(ptr %x) {
; CHECK-LABEL: addv24i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r0]
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i16>, ptr %x, align 2
%1 = sext <16 x i16> %0 to <16 x i32>
%arrayidx.16 = getelementptr inbounds i16, ptr %x, i32 16
%2 = load <8 x i16>, ptr %arrayidx.16, align 2
%3 = sext <8 x i16> %2 to <8 x i32>
%4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
%5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
%op.rdx = add nsw i32 %4, %5
ret i32 %op.rdx
}

define i32 @addv32i32i16(ptr %x) {
; CHECK-LABEL: addv32i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r0]
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #40]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #56]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i16>, ptr %x, align 2
%1 = sext <32 x i16> %0 to <32 x i32>
%2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
ret i32 %2
}

define i32 @addv64i32i16(ptr %x) {
; CHECK-LABEL: addv64i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r0]
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: ldrsh.w r1, [r0, #120]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: ldrsh.w r3, [r0, #122]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: ldrsh.w r12, [r0, #124]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #40]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #56]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #64]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #72]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #80]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #88]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #112]
; CHECK-NEXT: ldrsh.w r0, [r0, #126]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: add r1, r2
; CHECK-NEXT: add r1, r3
; CHECK-NEXT: add r1, r12
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i16>, ptr %x, align 2
%1 = sext <32 x i16> %0 to <32 x i32>
%arrayidx.32 = getelementptr inbounds i16, ptr %x, i32 32
%2 = load <16 x i16>, ptr %arrayidx.32, align 2
%3 = sext <16 x i16> %2 to <16 x i32>
%arrayidx.48 = getelementptr inbounds i16, ptr %x, i32 48
%4 = load <8 x i16>, ptr %arrayidx.48, align 2
%5 = sext <8 x i16> %4 to <8 x i32>
%arrayidx.56 = getelementptr inbounds i16, ptr %x, i32 56
%6 = load <4 x i16>, ptr %arrayidx.56, align 2
%7 = sext <4 x i16> %6 to <4 x i32>
%arrayidx.60 = getelementptr inbounds i16, ptr %x, i32 60
%8 = load i16, ptr %arrayidx.60, align 2
%conv.60 = sext i16 %8 to i32
%arrayidx.61 = getelementptr inbounds i16, ptr %x, i32 61
%9 = load i16, ptr %arrayidx.61, align 2
%conv.61 = sext i16 %9 to i32
%arrayidx.62 = getelementptr inbounds i16, ptr %x, i32 62
%10 = load i16, ptr %arrayidx.62, align 2
%conv.62 = sext i16 %10 to i32
%11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
%12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
%op.rdx = add nsw i32 %11, %12
%13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
%op.rdx8 = add nsw i32 %op.rdx, %13
%14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
%op.rdx9 = add nsw i32 %op.rdx8, %14
%15 = add nsw i32 %op.rdx9, %conv.60
%16 = add nsw i32 %15, %conv.61
%17 = add nsw i32 %16, %conv.62
%arrayidx.63 = getelementptr inbounds i16, ptr %x, i32 63
%18 = load i16, ptr %arrayidx.63, align 2
%conv.63 = sext i16 %18 to i32
%add.63 = add nsw i32 %17, %conv.63
ret i32 %add.63
}

define i32 @addv128i32i16(ptr %x) {
; CHECK-LABEL: addv128i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.s16 r2, q1
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #128]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #144]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #160]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #176]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #192]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #208]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #224]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #240]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%wide.load = load <8 x i16>, ptr %x, align 2
%0 = sext <8 x i16> %wide.load to <8 x i32>
%1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
%2 = getelementptr inbounds i16, ptr %x, i32 8
%wide.load.1 = load <8 x i16>, ptr %2, align 2
%3 = sext <8 x i16> %wide.load.1 to <8 x i32>
%4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
%5 = add i32 %4, %1
%6 = getelementptr inbounds i16, ptr %x, i32 16
%wide.load.2 = load <8 x i16>, ptr %6, align 2
%7 = sext <8 x i16> %wide.load.2 to <8 x i32>
%8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
%9 = add i32 %8, %5
%10 = getelementptr inbounds i16, ptr %x, i32 24
%wide.load.3 = load <8 x i16>, ptr %10, align 2
%11 = sext <8 x i16> %wide.load.3 to <8 x i32>
%12 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %11)
%13 = add i32 %12, %9
%14 = getelementptr inbounds i16, ptr %x, i32 32
%wide.load.4 = load <8 x i16>, ptr %14, align 2
%15 = sext <8 x i16> %wide.load.4 to <8 x i32>
%16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
%17 = add i32 %16, %13
%18 = getelementptr inbounds i16, ptr %x, i32 40
%wide.load.5 = load <8 x i16>, ptr %18, align 2
%19 = sext <8 x i16> %wide.load.5 to <8 x i32>
%20 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
%21 = add i32 %20, %17
%22 = getelementptr inbounds i16, ptr %x, i32 48
%wide.load.6 = load <8 x i16>, ptr %22, align 2
%23 = sext <8 x i16> %wide.load.6 to <8 x i32>
%24 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %23)
%25 = add i32 %24, %21
%26 = getelementptr inbounds i16, ptr %x, i32 56
%wide.load.7 = load <8 x i16>, ptr %26, align 2
%27 = sext <8 x i16> %wide.load.7 to <8 x i32>
%28 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %27)
%29 = add i32 %28, %25
%30 = getelementptr inbounds i16, ptr %x, i32 64
%wide.load.8 = load <8 x i16>, ptr %30, align 2
%31 = sext <8 x i16> %wide.load.8 to <8 x i32>
%32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %31)
%33 = add i32 %32, %29
%34 = getelementptr inbounds i16, ptr %x, i32 72
%wide.load.9 = load <8 x i16>, ptr %34, align 2
%35 = sext <8 x i16> %wide.load.9 to <8 x i32>
%36 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %35)
%37 = add i32 %36, %33
%38 = getelementptr inbounds i16, ptr %x, i32 80
%wide.load.10 = load <8 x i16>, ptr %38, align 2
%39 = sext <8 x i16> %wide.load.10 to <8 x i32>
%40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39)
%41 = add i32 %40, %37
%42 = getelementptr inbounds i16, ptr %x, i32 88
%wide.load.11 = load <8 x i16>, ptr %42, align 2
%43 = sext <8 x i16> %wide.load.11 to <8 x i32>
%44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
%45 = add i32 %44, %41
%46 = getelementptr inbounds i16, ptr %x, i32 96
%wide.load.12 = load <8 x i16>, ptr %46, align 2
%47 = sext <8 x i16> %wide.load.12 to <8 x i32>
%48 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
%49 = add i32 %48, %45
%50 = getelementptr inbounds i16, ptr %x, i32 104
%wide.load.13 = load <8 x i16>, ptr %50, align 2
%51 = sext <8 x i16> %wide.load.13 to <8 x i32>
%52 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %51)
%53 = add i32 %52, %49
%54 = getelementptr inbounds i16, ptr %x, i32 112
%wide.load.14 = load <8 x i16>, ptr %54, align 2
%55 = sext <8 x i16> %wide.load.14 to <8 x i32>
%56 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %55)
%57 = add i32 %56, %53
%58 = getelementptr inbounds i16, ptr %x, i32 120
%wide.load.15 = load <8 x i16>, ptr %58, align 2
%59 = sext <8 x i16> %wide.load.15 to <8 x i32>
%60 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %59)
%61 = add i32 %60, %57
ret i32 %61
}

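; Same patterns with zero-extended i8 elements: the extends fold into
; vldrb.u32/vldrb.u16 loads or into vaddv.u8/vaddva.u8 reductions.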
define i32 @addv2i32i8(ptr %x) {
; CHECK-LABEL: addv2i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r1, [r0]
; CHECK-NEXT: ldrb r0, [r0, #1]
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: bx lr
entry:
%0 = load i8, ptr %x, align 1
%conv = zext i8 %0 to i32
%arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
%1 = load i8, ptr %arrayidx.1, align 1
%conv.1 = zext i8 %1 to i32
%add.1 = add nuw nsw i32 %conv, %conv.1
ret i32 %add.1
}

define i32 @addv4i32i8(ptr %x) {
; CHECK-LABEL: addv4i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i8>, ptr %x, align 1
%1 = zext <4 x i8> %0 to <4 x i32>
%2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
ret i32 %2
}

define i32 @addv8i32i8(ptr %x) {
; CHECK-LABEL: addv8i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i8>, ptr %x, align 1
%1 = zext <8 x i8> %0 to <8 x i32>
%2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
ret i32 %2
}

define i32 @addv16i32i8(ptr %x) {
; CHECK-LABEL: addv16i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vaddv.u8 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i8>, ptr %x, align 1
%1 = zext <16 x i8> %0 to <16 x i32>
%2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
ret i32 %2
}

define i32 @addv24i32i8(ptr %x) {
; CHECK-LABEL: addv24i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u8 r0, q1
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i8>, ptr %x, align 1
%1 = zext <16 x i8> %0 to <16 x i32>
%arrayidx.16 = getelementptr inbounds i8, ptr %x, i32 16
%2 = load <8 x i8>, ptr %arrayidx.16, align 1
%3 = zext <8 x i8> %2 to <8 x i32>
%4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
%5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
%op.rdx = add nuw nsw i32 %4, %5
ret i32 %op.rdx
}

define i32 @addv32i32i8(ptr %x) {
; CHECK-LABEL: addv32i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q1, [r0]
; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #8]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #12]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #16]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #20]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #28]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i8>, ptr %x, align 1
%1 = zext <32 x i8> %0 to <32 x i32>
%2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
ret i32 %2
}

define i32 @addv64i32i8(ptr %x) {
; CHECK-LABEL: addv64i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q1, [r0]
; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
; CHECK-NEXT: ldrb.w r1, [r0, #60]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: ldrb.w r3, [r0, #61]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #8]
; CHECK-NEXT: ldrb.w r12, [r0, #62]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #12]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #16]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #20]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #28]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #56]
; CHECK-NEXT: ldrb.w r0, [r0, #63]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: add r1, r2
; CHECK-NEXT: add r1, r3
; CHECK-NEXT: add r1, r12
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i8>, ptr %x, align 1
%1 = zext <32 x i8> %0 to <32 x i32>
%arrayidx.32 = getelementptr inbounds i8, ptr %x, i32 32
%2 = load <16 x i8>, ptr %arrayidx.32, align 1
%3 = zext <16 x i8> %2 to <16 x i32>
%arrayidx.48 = getelementptr inbounds i8, ptr %x, i32 48
%4 = load <8 x i8>, ptr %arrayidx.48, align 1
%5 = zext <8 x i8> %4 to <8 x i32>
%arrayidx.56 = getelementptr inbounds i8, ptr %x, i32 56
%6 = load <4 x i8>, ptr %arrayidx.56, align 1
%7 = zext <4 x i8> %6 to <4 x i32>
%arrayidx.60 = getelementptr inbounds i8, ptr %x, i32 60
%8 = load i8, ptr %arrayidx.60, align 1
%conv.60 = zext i8 %8 to i32
%arrayidx.61 = getelementptr inbounds i8, ptr %x, i32 61
%9 = load i8, ptr %arrayidx.61, align 1
%conv.61 = zext i8 %9 to i32
%arrayidx.62 = getelementptr inbounds i8, ptr %x, i32 62
%10 = load i8, ptr %arrayidx.62, align 1
%conv.62 = zext i8 %10 to i32
%11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
%12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
%op.rdx = add nuw nsw i32 %11, %12
%13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
%op.rdx8 = add nuw nsw i32 %op.rdx, %13
%14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
%op.rdx9 = add nuw nsw i32 %op.rdx8, %14
%15 = add nuw nsw i32 %op.rdx9, %conv.60
%16 = add nuw nsw i32 %15, %conv.61
%17 = add nuw nsw i32 %16, %conv.62
%arrayidx.63 = getelementptr inbounds i8, ptr %x, i32 63
%18 = load i8, ptr %arrayidx.63, align 1
%conv.63 = zext i8 %18 to i32
%add.63 = add nuw nsw i32 %17, %conv.63
ret i32 %add.63
}

define i32 @addv128i32i8(ptr %x) {
; CHECK-LABEL: addv128i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: vaddv.u8 r0, q1
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #32]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #48]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #64]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #80]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #96]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #112]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: bx lr
entry:
%wide.load = load <16 x i8>, ptr %x, align 1
%0 = zext <16 x i8> %wide.load to <16 x i32>
%1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0)
%2 = getelementptr inbounds i8, ptr %x, i32 16
%wide.load.1 = load <16 x i8>, ptr %2, align 1
%3 = zext <16 x i8> %wide.load.1 to <16 x i32>
%4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
%5 = add i32 %4, %1
%6 = getelementptr inbounds i8, ptr %x, i32 32
%wide.load.2 = load <16 x i8>, ptr %6, align 1
%7 = zext <16 x i8> %wide.load.2 to <16 x i32>
%8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
%9 = add i32 %8, %5
%10 = getelementptr inbounds i8, ptr %x, i32 48
%wide.load.3 = load <16 x i8>, ptr %10, align 1
%11 = zext <16 x i8> %wide.load.3 to <16 x i32>
%12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %11)
%13 = add i32 %12, %9
%14 = getelementptr inbounds i8, ptr %x, i32 64
%wide.load.4 = load <16 x i8>, ptr %14, align 1
%15 = zext <16 x i8> %wide.load.4 to <16 x i32>
%16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
%17 = add i32 %16, %13
%18 = getelementptr inbounds i8, ptr %x, i32 80
%wide.load.5 = load <16 x i8>, ptr %18, align 1
%19 = zext <16 x i8> %wide.load.5 to <16 x i32>
%20 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19)
%21 = add i32 %20, %17
%22 = getelementptr inbounds i8, ptr %x, i32 96
%wide.load.6 = load <16 x i8>, ptr %22, align 1
%23 = zext <16 x i8> %wide.load.6 to <16 x i32>
%24 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %23)
%25 = add i32 %24, %21
%26 = getelementptr inbounds i8, ptr %x, i32 112
%wide.load.7 = load <16 x i8>, ptr %26, align 1
%27 = zext <16 x i8> %wide.load.7 to <16 x i32>
%28 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %27)
%29 = add i32 %28, %25
ret i32 %29
}

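; Reductions that accumulate in i16: the whole sum is done in vector lanes and
; only the final scalar result is sign extended (sxth) for the return.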
define signext i16 @addv2i16i16(ptr %x) {
; CHECK-LABEL: addv2i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrh r1, [r0]
; CHECK-NEXT: ldrh r0, [r0, #2]
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load i16, ptr %x, align 2
%arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
%1 = load i16, ptr %arrayidx.1, align 2
%add.1 = add i16 %1, %0
ret i16 %add.1
}

define signext i16 @addv4i16i16(ptr %x) {
; CHECK-LABEL: addv4i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q0, [r0]
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i16>, ptr %x, align 2
%1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %0)
ret i16 %1
}

define signext i16 @addv8i16i16(ptr %x) {
; CHECK-LABEL: addv8i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, ptr %x, align 2
%1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
ret i16 %1
}

define signext i16 @addv16i16i16(ptr %x) {
; CHECK-LABEL: addv16i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u16 r0, q1
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i16>, ptr %x, align 2
%1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %0)
ret i16 %1
}

define signext i16 @addv24i16i16(ptr %x) {
; CHECK-LABEL: addv24i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u16 r2, q1
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, ptr %x, align 2
%arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8
%1 = load <16 x i16>, ptr %arrayidx.8, align 2
%2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
%3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
%op.rdx = add i16 %2, %3
ret i16 %op.rdx
}

define signext i16 @addv32i16i16(ptr %x) {
; CHECK-LABEL: addv32i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u16 r2, q1
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i16>, ptr %x, align 2
%1 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %0)
ret i16 %1
}

define signext i16 @addv64i16i16(ptr %x) {
; CHECK-LABEL: addv64i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u16 r2, q1
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <64 x i16>, ptr %x, align 2
%1 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %0)
ret i16 %1
}

define signext i16 @addv128i16i16(ptr %x) {
; CHECK-LABEL: addv128i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u16 r2, q1
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #128]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #144]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #160]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #176]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #192]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #208]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #224]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #240]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%wide.load = load <8 x i16>, ptr %x, align 2
%0 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load)
%1 = getelementptr inbounds i16, ptr %x, i32 8
%wide.load.1 = load <8 x i16>, ptr %1, align 2
%2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.1)
%3 = add i16 %2, %0
%4 = getelementptr inbounds i16, ptr %x, i32 16
%wide.load.2 = load <8 x i16>, ptr %4, align 2
%5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.2)
%6 = add i16 %5, %3
%7 = getelementptr inbounds i16, ptr %x, i32 24
%wide.load.3 = load <8 x i16>, ptr %7, align 2
%8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.3)
%9 = add i16 %8, %6
%10 = getelementptr inbounds i16, ptr %x, i32 32
%wide.load.4 = load <8 x i16>, ptr %10, align 2
%11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.4)
%12 = add i16 %11, %9
%13 = getelementptr inbounds i16, ptr %x, i32 40
%wide.load.5 = load <8 x i16>, ptr %13, align 2
%14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.5)
%15 = add i16 %14, %12
%16 = getelementptr inbounds i16, ptr %x, i32 48
%wide.load.6 = load <8 x i16>, ptr %16, align 2
%17 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.6)
%18 = add i16 %17, %15
%19 = getelementptr inbounds i16, ptr %x, i32 56
%wide.load.7 = load <8 x i16>, ptr %19, align 2
%20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.7)
%21 = add i16 %20, %18
%22 = getelementptr inbounds i16, ptr %x, i32 64
%wide.load.8 = load <8 x i16>, ptr %22, align 2
%23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.8)
%24 = add i16 %23, %21
%25 = getelementptr inbounds i16, ptr %x, i32 72
%wide.load.9 = load <8 x i16>, ptr %25, align 2
%26 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.9)
%27 = add i16 %26, %24
%28 = getelementptr inbounds i16, ptr %x, i32 80
%wide.load.10 = load <8 x i16>, ptr %28, align 2
%29 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.10)
%30 = add i16 %29, %27
%31 = getelementptr inbounds i16, ptr %x, i32 88
%wide.load.11 = load <8 x i16>, ptr %31, align 2
%32 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.11)
%33 = add i16 %32, %30
%34 = getelementptr inbounds i16, ptr %x, i32 96
%wide.load.12 = load <8 x i16>, ptr %34, align 2
%35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.12)
%36 = add i16 %35, %33
%37 = getelementptr inbounds i16, ptr %x, i32 104
%wide.load.13 = load <8 x i16>, ptr %37, align 2
%38 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.13)
%39 = add i16 %38, %36
%40 = getelementptr inbounds i16, ptr %x, i32 112
%wide.load.14 = load <8 x i16>, ptr %40, align 2
%41 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.14)
%42 = add i16 %41, %39
%43 = getelementptr inbounds i16, ptr %x, i32 120
%wide.load.15 = load <8 x i16>, ptr %43, align 2
%44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.15)
%45 = add i16 %44, %42
ret i16 %45
}

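; i8 accumulator variants of the same patterns, zero extending (uxtb) only the
; final result.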
define zeroext i8 @addv2i8i8(ptr %x) {
; CHECK-LABEL: addv2i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r1, [r0]
; CHECK-NEXT: ldrb r0, [r0, #1]
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load i8, ptr %x, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
%1 = load i8, ptr %arrayidx.1, align 1
%add.1 = add i8 %1, %0
ret i8 %add.1
}

define zeroext i8 @addv4i8i8(ptr %x) {
; CHECK-LABEL: addv4i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i8>, ptr %x, align 1
%1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %0)
ret i8 %1
}

define zeroext i8 @addv8i8i8(ptr %x) {
; CHECK-LABEL: addv8i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i8>, ptr %x, align 1
%1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0)
ret i8 %1
}

define zeroext i8 @addv16i8i8(ptr %x) {
; CHECK-LABEL: addv16i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vaddv.u8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i8>, ptr %x, align 1
%1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0)
ret i8 %1
}

define zeroext i8 @addv24i8i8(ptr %x) {
; CHECK-LABEL: addv24i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #8]
; CHECK-NEXT: vaddv.u16 r0, q1
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i8>, ptr %x, align 1
%arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8
%1 = load <16 x i8>, ptr %arrayidx.8, align 1
%2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %1)
%3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0)
%op.rdx = add i8 %2, %3
ret i8 %op.rdx
}

define zeroext i8 @addv32i8i8(ptr %x) {
; CHECK-LABEL: addv32i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vaddv.u8 r0, q1
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i8>, ptr %x, align 1
%1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %0)
ret i8 %1
}

define zeroext i8 @addv64i8i8(ptr %x) {
; CHECK-LABEL: addv64i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vaddv.u8 r2, q1
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <64 x i8>, ptr %x, align 1
%1 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %0)
ret i8 %1
}

define zeroext i8 @addv128i8i8(ptr %x) {
; CHECK-LABEL: addv128i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vaddv.u8 r2, q1
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #64]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #80]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #96]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #112]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
%wide.load = load <16 x i8>, ptr %x, align 1
%0 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load)
%1 = getelementptr inbounds i8, ptr %x, i32 16
%wide.load.1 = load <16 x i8>, ptr %1, align 1
%2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.1)
%3 = add i8 %2, %0
%4 = getelementptr inbounds i8, ptr %x, i32 32
%wide.load.2 = load <16 x i8>, ptr %4, align 1
%5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.2)
%6 = add i8 %5, %3
%7 = getelementptr inbounds i8, ptr %x, i32 48
%wide.load.3 = load <16 x i8>, ptr %7, align 1
%8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.3)
%9 = add i8 %8, %6
%10 = getelementptr inbounds i8, ptr %x, i32 64
%wide.load.4 = load <16 x i8>, ptr %10, align 1
%11 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.4)
%12 = add i8 %11, %9
%13 = getelementptr inbounds i8, ptr %x, i32 80
%wide.load.5 = load <16 x i8>, ptr %13, align 1
%14 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.5)
%15 = add i8 %14, %12
%16 = getelementptr inbounds i8, ptr %x, i32 96
%wide.load.6 = load <16 x i8>, ptr %16, align 1
%17 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.6)
%18 = add i8 %17, %15
%19 = getelementptr inbounds i8, ptr %x, i32 112
%wide.load.7 = load <16 x i8>, ptr %19, align 1
%20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.7)
%21 = add i8 %20, %18
ret i8 %21
}

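; Multiply-accumulate reductions, summing x[i]*y[i]. These should select the
; vmlav/vmlava multiply-accumulate instructions.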
define i32 @mlav2i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrd r0, r2, [r0]
; CHECK-NEXT: ldrd r1, r3, [r1]
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: mla r0, r3, r2, r0
; CHECK-NEXT: bx lr
entry:
%0 = load i32, ptr %x, align 4
%1 = load i32, ptr %y, align 4
%mul = mul nsw i32 %1, %0
%arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1
%2 = load i32, ptr %arrayidx.1, align 4
%arrayidx1.1 = getelementptr inbounds i32, ptr %y, i32 1
%3 = load i32, ptr %arrayidx1.1, align 4
%mul.1 = mul nsw i32 %3, %2
%add.1 = add nsw i32 %mul.1, %mul
ret i32 %add.1
}

define i32 @mlav4i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, ptr %x, align 4
%1 = load <4 x i32>, ptr %y, align 4
%2 = mul nsw <4 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
ret i32 %3
}

define i32 @mlav8i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r2, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i32>, ptr %x, align 4
%1 = load <8 x i32>, ptr %y, align 4
%2 = mul nsw <8 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
ret i32 %3
}

define i32 @mlav16i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r2, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i32>, ptr %x, align 4
%1 = load <16 x i32>, ptr %y, align 4
%2 = mul nsw <16 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
ret i32 %3
}

define i32 @mlav24i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i32>, ptr %x, align 4
%1 = load <8 x i32>, ptr %y, align 4
%2 = mul nsw <8 x i32> %1, %0
%arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8
%arrayidx1.8 = getelementptr inbounds i32, ptr %y, i32 8
%3 = load <16 x i32>, ptr %arrayidx.8, align 4
%4 = load <16 x i32>, ptr %arrayidx1.8, align 4
%5 = mul nsw <16 x i32> %4, %3
%6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
%7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
%op.rdx = add nsw i32 %6, %7
ret i32 %op.rdx
}

define i32 @mlav32i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #96]
; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #112]
; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i32>, ptr %x, align 4
%1 = load <32 x i32>, ptr %y, align 4
%2 = mul nsw <32 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
ret i32 %3
}

1463 define i32 @mlav64i32i32(ptr %x, ptr %y) {
1464 ; CHECK-LABEL: mlav64i32i32:
1465 ; CHECK: @ %bb.0: @ %entry
1466 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1467 ; CHECK-NEXT: vldrw.u32 q1, [r1]
1468 ; CHECK-NEXT: mov r2, r0
1469 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
1470 ; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
1471 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
1472 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1473 ; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
1474 ; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
1475 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1476 ; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
1477 ; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
1478 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1479 ; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
1480 ; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
1481 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1482 ; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
1483 ; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
1484 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1485 ; CHECK-NEXT: vldrw.u32 q0, [r2, #96]
1486 ; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
1487 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1488 ; CHECK-NEXT: vldrw.u32 q0, [r2, #112]
1489 ; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
1490 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1491 ; CHECK-NEXT: vldrw.u32 q0, [r2, #128]
1492 ; CHECK-NEXT: vldrw.u32 q1, [r1, #128]
1493 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1494 ; CHECK-NEXT: vldrw.u32 q0, [r2, #144]
1495 ; CHECK-NEXT: vldrw.u32 q1, [r1, #144]
1496 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1497 ; CHECK-NEXT: vldrw.u32 q0, [r2, #160]
1498 ; CHECK-NEXT: vldrw.u32 q1, [r1, #160]
1499 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1500 ; CHECK-NEXT: vldrw.u32 q0, [r2, #176]
1501 ; CHECK-NEXT: vldrw.u32 q1, [r1, #176]
1502 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1503 ; CHECK-NEXT: vldrw.u32 q0, [r2, #192]
1504 ; CHECK-NEXT: vldrw.u32 q1, [r1, #192]
1505 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1506 ; CHECK-NEXT: vldrw.u32 q0, [r2, #208]
1507 ; CHECK-NEXT: vldrw.u32 q1, [r1, #208]
1508 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1509 ; CHECK-NEXT: vldrw.u32 q0, [r2, #224]
1510 ; CHECK-NEXT: vldrw.u32 q1, [r1, #224]
1511 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1512 ; CHECK-NEXT: vldrw.u32 q0, [r2, #240]
1513 ; CHECK-NEXT: vldrw.u32 q1, [r1, #240]
1514 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <4 x i32>, ptr %x, align 4
  %wide.load10 = load <4 x i32>, ptr %y, align 4
  %0 = mul nsw <4 x i32> %wide.load10, %wide.load
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
  %2 = getelementptr inbounds i32, ptr %x, i32 4
  %wide.load.1 = load <4 x i32>, ptr %2, align 4
  %3 = getelementptr inbounds i32, ptr %y, i32 4
  %wide.load10.1 = load <4 x i32>, ptr %3, align 4
  %4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %6 = add i32 %5, %1
  %7 = getelementptr inbounds i32, ptr %x, i32 8
  %wide.load.2 = load <4 x i32>, ptr %7, align 4
  %8 = getelementptr inbounds i32, ptr %y, i32 8
  %wide.load10.2 = load <4 x i32>, ptr %8, align 4
  %9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
  %11 = add i32 %10, %6
  %12 = getelementptr inbounds i32, ptr %x, i32 12
  %wide.load.3 = load <4 x i32>, ptr %12, align 4
  %13 = getelementptr inbounds i32, ptr %y, i32 12
  %wide.load10.3 = load <4 x i32>, ptr %13, align 4
  %14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
  %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
  %16 = add i32 %15, %11
  %17 = getelementptr inbounds i32, ptr %x, i32 16
  %wide.load.4 = load <4 x i32>, ptr %17, align 4
  %18 = getelementptr inbounds i32, ptr %y, i32 16
  %wide.load10.4 = load <4 x i32>, ptr %18, align 4
  %19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19)
  %21 = add i32 %20, %16
  %22 = getelementptr inbounds i32, ptr %x, i32 20
  %wide.load.5 = load <4 x i32>, ptr %22, align 4
  %23 = getelementptr inbounds i32, ptr %y, i32 20
  %wide.load10.5 = load <4 x i32>, ptr %23, align 4
  %24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
  %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24)
  %26 = add i32 %25, %21
  %27 = getelementptr inbounds i32, ptr %x, i32 24
  %wide.load.6 = load <4 x i32>, ptr %27, align 4
  %28 = getelementptr inbounds i32, ptr %y, i32 24
  %wide.load10.6 = load <4 x i32>, ptr %28, align 4
  %29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
  %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
  %31 = add i32 %30, %26
  %32 = getelementptr inbounds i32, ptr %x, i32 28
  %wide.load.7 = load <4 x i32>, ptr %32, align 4
  %33 = getelementptr inbounds i32, ptr %y, i32 28
  %wide.load10.7 = load <4 x i32>, ptr %33, align 4
  %34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
  %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34)
  %36 = add i32 %35, %31
  %37 = getelementptr inbounds i32, ptr %x, i32 32
  %wide.load.8 = load <4 x i32>, ptr %37, align 4
  %38 = getelementptr inbounds i32, ptr %y, i32 32
  %wide.load10.8 = load <4 x i32>, ptr %38, align 4
  %39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
  %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39)
  %41 = add i32 %40, %36
  %42 = getelementptr inbounds i32, ptr %x, i32 36
  %wide.load.9 = load <4 x i32>, ptr %42, align 4
  %43 = getelementptr inbounds i32, ptr %y, i32 36
  %wide.load10.9 = load <4 x i32>, ptr %43, align 4
  %44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
  %45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44)
  %46 = add i32 %45, %41
  %47 = getelementptr inbounds i32, ptr %x, i32 40
  %wide.load.10 = load <4 x i32>, ptr %47, align 4
  %48 = getelementptr inbounds i32, ptr %y, i32 40
  %wide.load10.10 = load <4 x i32>, ptr %48, align 4
  %49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
  %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49)
  %51 = add i32 %50, %46
  %52 = getelementptr inbounds i32, ptr %x, i32 44
  %wide.load.11 = load <4 x i32>, ptr %52, align 4
  %53 = getelementptr inbounds i32, ptr %y, i32 44
  %wide.load10.11 = load <4 x i32>, ptr %53, align 4
  %54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
  %55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54)
  %56 = add i32 %55, %51
  %57 = getelementptr inbounds i32, ptr %x, i32 48
  %wide.load.12 = load <4 x i32>, ptr %57, align 4
  %58 = getelementptr inbounds i32, ptr %y, i32 48
  %wide.load10.12 = load <4 x i32>, ptr %58, align 4
  %59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
  %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59)
  %61 = add i32 %60, %56
  %62 = getelementptr inbounds i32, ptr %x, i32 52
  %wide.load.13 = load <4 x i32>, ptr %62, align 4
  %63 = getelementptr inbounds i32, ptr %y, i32 52
  %wide.load10.13 = load <4 x i32>, ptr %63, align 4
  %64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
  %66 = add i32 %65, %61
  %67 = getelementptr inbounds i32, ptr %x, i32 56
  %wide.load.14 = load <4 x i32>, ptr %67, align 4
  %68 = getelementptr inbounds i32, ptr %y, i32 56
  %wide.load10.14 = load <4 x i32>, ptr %68, align 4
  %69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
  %70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69)
  %71 = add i32 %70, %66
  %72 = getelementptr inbounds i32, ptr %x, i32 60
  %wide.load.15 = load <4 x i32>, ptr %72, align 4
  %73 = getelementptr inbounds i32, ptr %y, i32 60
  %wide.load10.15 = load <4 x i32>, ptr %73, align 4
  %74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
  %75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74)
  %76 = add i32 %75, %71
  ret i32 %76
}

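; The 128 x i32 case below follows the same pattern at twice the length: one
; vmlav.u32 starts the accumulator and each further pair of 16-byte loads is
; folded in with vmlava.u32.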
define i32 @mlav128i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #96]
; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #112]
; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #128]
; CHECK-NEXT: vldrw.u32 q1, [r1, #128]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #144]
; CHECK-NEXT: vldrw.u32 q1, [r1, #144]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #160]
; CHECK-NEXT: vldrw.u32 q1, [r1, #160]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #176]
; CHECK-NEXT: vldrw.u32 q1, [r1, #176]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #192]
; CHECK-NEXT: vldrw.u32 q1, [r1, #192]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #208]
; CHECK-NEXT: vldrw.u32 q1, [r1, #208]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #224]
; CHECK-NEXT: vldrw.u32 q1, [r1, #224]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #240]
; CHECK-NEXT: vldrw.u32 q1, [r1, #240]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #256]
; CHECK-NEXT: vldrw.u32 q1, [r1, #256]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #272]
; CHECK-NEXT: vldrw.u32 q1, [r1, #272]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #288]
; CHECK-NEXT: vldrw.u32 q1, [r1, #288]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #304]
; CHECK-NEXT: vldrw.u32 q1, [r1, #304]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #320]
; CHECK-NEXT: vldrw.u32 q1, [r1, #320]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #336]
; CHECK-NEXT: vldrw.u32 q1, [r1, #336]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #352]
; CHECK-NEXT: vldrw.u32 q1, [r1, #352]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #368]
; CHECK-NEXT: vldrw.u32 q1, [r1, #368]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #384]
; CHECK-NEXT: vldrw.u32 q1, [r1, #384]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #400]
; CHECK-NEXT: vldrw.u32 q1, [r1, #400]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #416]
; CHECK-NEXT: vldrw.u32 q1, [r1, #416]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #432]
; CHECK-NEXT: vldrw.u32 q1, [r1, #432]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #448]
; CHECK-NEXT: vldrw.u32 q1, [r1, #448]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #464]
; CHECK-NEXT: vldrw.u32 q1, [r1, #464]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #480]
; CHECK-NEXT: vldrw.u32 q1, [r1, #480]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #496]
; CHECK-NEXT: vldrw.u32 q1, [r1, #496]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <4 x i32>, ptr %x, align 4
  %wide.load10 = load <4 x i32>, ptr %y, align 4
  %0 = mul nsw <4 x i32> %wide.load10, %wide.load
  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
  %2 = getelementptr inbounds i32, ptr %x, i32 4
  %wide.load.1 = load <4 x i32>, ptr %2, align 4
  %3 = getelementptr inbounds i32, ptr %y, i32 4
  %wide.load10.1 = load <4 x i32>, ptr %3, align 4
  %4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  %6 = add i32 %5, %1
  %7 = getelementptr inbounds i32, ptr %x, i32 8
  %wide.load.2 = load <4 x i32>, ptr %7, align 4
  %8 = getelementptr inbounds i32, ptr %y, i32 8
  %wide.load10.2 = load <4 x i32>, ptr %8, align 4
  %9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
  %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
  %11 = add i32 %10, %6
  %12 = getelementptr inbounds i32, ptr %x, i32 12
  %wide.load.3 = load <4 x i32>, ptr %12, align 4
  %13 = getelementptr inbounds i32, ptr %y, i32 12
  %wide.load10.3 = load <4 x i32>, ptr %13, align 4
  %14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
  %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
  %16 = add i32 %15, %11
  %17 = getelementptr inbounds i32, ptr %x, i32 16
  %wide.load.4 = load <4 x i32>, ptr %17, align 4
  %18 = getelementptr inbounds i32, ptr %y, i32 16
  %wide.load10.4 = load <4 x i32>, ptr %18, align 4
  %19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19)
  %21 = add i32 %20, %16
  %22 = getelementptr inbounds i32, ptr %x, i32 20
  %wide.load.5 = load <4 x i32>, ptr %22, align 4
  %23 = getelementptr inbounds i32, ptr %y, i32 20
  %wide.load10.5 = load <4 x i32>, ptr %23, align 4
  %24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
  %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24)
  %26 = add i32 %25, %21
  %27 = getelementptr inbounds i32, ptr %x, i32 24
  %wide.load.6 = load <4 x i32>, ptr %27, align 4
  %28 = getelementptr inbounds i32, ptr %y, i32 24
  %wide.load10.6 = load <4 x i32>, ptr %28, align 4
  %29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
  %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
  %31 = add i32 %30, %26
  %32 = getelementptr inbounds i32, ptr %x, i32 28
  %wide.load.7 = load <4 x i32>, ptr %32, align 4
  %33 = getelementptr inbounds i32, ptr %y, i32 28
  %wide.load10.7 = load <4 x i32>, ptr %33, align 4
  %34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
  %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34)
  %36 = add i32 %35, %31
  %37 = getelementptr inbounds i32, ptr %x, i32 32
  %wide.load.8 = load <4 x i32>, ptr %37, align 4
  %38 = getelementptr inbounds i32, ptr %y, i32 32
  %wide.load10.8 = load <4 x i32>, ptr %38, align 4
  %39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
  %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39)
  %41 = add i32 %40, %36
  %42 = getelementptr inbounds i32, ptr %x, i32 36
  %wide.load.9 = load <4 x i32>, ptr %42, align 4
  %43 = getelementptr inbounds i32, ptr %y, i32 36
  %wide.load10.9 = load <4 x i32>, ptr %43, align 4
  %44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
  %45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44)
  %46 = add i32 %45, %41
  %47 = getelementptr inbounds i32, ptr %x, i32 40
  %wide.load.10 = load <4 x i32>, ptr %47, align 4
  %48 = getelementptr inbounds i32, ptr %y, i32 40
  %wide.load10.10 = load <4 x i32>, ptr %48, align 4
  %49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
  %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49)
  %51 = add i32 %50, %46
  %52 = getelementptr inbounds i32, ptr %x, i32 44
  %wide.load.11 = load <4 x i32>, ptr %52, align 4
  %53 = getelementptr inbounds i32, ptr %y, i32 44
  %wide.load10.11 = load <4 x i32>, ptr %53, align 4
  %54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
  %55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54)
  %56 = add i32 %55, %51
  %57 = getelementptr inbounds i32, ptr %x, i32 48
  %wide.load.12 = load <4 x i32>, ptr %57, align 4
  %58 = getelementptr inbounds i32, ptr %y, i32 48
  %wide.load10.12 = load <4 x i32>, ptr %58, align 4
  %59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
  %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59)
  %61 = add i32 %60, %56
  %62 = getelementptr inbounds i32, ptr %x, i32 52
  %wide.load.13 = load <4 x i32>, ptr %62, align 4
  %63 = getelementptr inbounds i32, ptr %y, i32 52
  %wide.load10.13 = load <4 x i32>, ptr %63, align 4
  %64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
  %66 = add i32 %65, %61
  %67 = getelementptr inbounds i32, ptr %x, i32 56
  %wide.load.14 = load <4 x i32>, ptr %67, align 4
  %68 = getelementptr inbounds i32, ptr %y, i32 56
  %wide.load10.14 = load <4 x i32>, ptr %68, align 4
  %69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
  %70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69)
  %71 = add i32 %70, %66
  %72 = getelementptr inbounds i32, ptr %x, i32 60
  %wide.load.15 = load <4 x i32>, ptr %72, align 4
  %73 = getelementptr inbounds i32, ptr %y, i32 60
  %wide.load10.15 = load <4 x i32>, ptr %73, align 4
  %74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
  %75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74)
  %76 = add i32 %75, %71
  %77 = getelementptr inbounds i32, ptr %x, i32 64
  %wide.load.16 = load <4 x i32>, ptr %77, align 4
  %78 = getelementptr inbounds i32, ptr %y, i32 64
  %wide.load10.16 = load <4 x i32>, ptr %78, align 4
  %79 = mul nsw <4 x i32> %wide.load10.16, %wide.load.16
  %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %79)
  %81 = add i32 %80, %76
  %82 = getelementptr inbounds i32, ptr %x, i32 68
  %wide.load.17 = load <4 x i32>, ptr %82, align 4
  %83 = getelementptr inbounds i32, ptr %y, i32 68
  %wide.load10.17 = load <4 x i32>, ptr %83, align 4
  %84 = mul nsw <4 x i32> %wide.load10.17, %wide.load.17
  %85 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %84)
  %86 = add i32 %85, %81
  %87 = getelementptr inbounds i32, ptr %x, i32 72
  %wide.load.18 = load <4 x i32>, ptr %87, align 4
  %88 = getelementptr inbounds i32, ptr %y, i32 72
  %wide.load10.18 = load <4 x i32>, ptr %88, align 4
  %89 = mul nsw <4 x i32> %wide.load10.18, %wide.load.18
  %90 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %89)
  %91 = add i32 %90, %86
  %92 = getelementptr inbounds i32, ptr %x, i32 76
  %wide.load.19 = load <4 x i32>, ptr %92, align 4
  %93 = getelementptr inbounds i32, ptr %y, i32 76
  %wide.load10.19 = load <4 x i32>, ptr %93, align 4
  %94 = mul nsw <4 x i32> %wide.load10.19, %wide.load.19
  %95 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %94)
  %96 = add i32 %95, %91
  %97 = getelementptr inbounds i32, ptr %x, i32 80
  %wide.load.20 = load <4 x i32>, ptr %97, align 4
  %98 = getelementptr inbounds i32, ptr %y, i32 80
  %wide.load10.20 = load <4 x i32>, ptr %98, align 4
  %99 = mul nsw <4 x i32> %wide.load10.20, %wide.load.20
  %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99)
  %101 = add i32 %100, %96
  %102 = getelementptr inbounds i32, ptr %x, i32 84
  %wide.load.21 = load <4 x i32>, ptr %102, align 4
  %103 = getelementptr inbounds i32, ptr %y, i32 84
  %wide.load10.21 = load <4 x i32>, ptr %103, align 4
  %104 = mul nsw <4 x i32> %wide.load10.21, %wide.load.21
  %105 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %104)
  %106 = add i32 %105, %101
  %107 = getelementptr inbounds i32, ptr %x, i32 88
  %wide.load.22 = load <4 x i32>, ptr %107, align 4
  %108 = getelementptr inbounds i32, ptr %y, i32 88
  %wide.load10.22 = load <4 x i32>, ptr %108, align 4
  %109 = mul nsw <4 x i32> %wide.load10.22, %wide.load.22
  %110 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %109)
  %111 = add i32 %110, %106
  %112 = getelementptr inbounds i32, ptr %x, i32 92
  %wide.load.23 = load <4 x i32>, ptr %112, align 4
  %113 = getelementptr inbounds i32, ptr %y, i32 92
  %wide.load10.23 = load <4 x i32>, ptr %113, align 4
  %114 = mul nsw <4 x i32> %wide.load10.23, %wide.load.23
  %115 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %114)
  %116 = add i32 %115, %111
  %117 = getelementptr inbounds i32, ptr %x, i32 96
  %wide.load.24 = load <4 x i32>, ptr %117, align 4
  %118 = getelementptr inbounds i32, ptr %y, i32 96
  %wide.load10.24 = load <4 x i32>, ptr %118, align 4
  %119 = mul nsw <4 x i32> %wide.load10.24, %wide.load.24
  %120 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %119)
  %121 = add i32 %120, %116
  %122 = getelementptr inbounds i32, ptr %x, i32 100
  %wide.load.25 = load <4 x i32>, ptr %122, align 4
  %123 = getelementptr inbounds i32, ptr %y, i32 100
  %wide.load10.25 = load <4 x i32>, ptr %123, align 4
  %124 = mul nsw <4 x i32> %wide.load10.25, %wide.load.25
  %125 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %124)
  %126 = add i32 %125, %121
  %127 = getelementptr inbounds i32, ptr %x, i32 104
  %wide.load.26 = load <4 x i32>, ptr %127, align 4
  %128 = getelementptr inbounds i32, ptr %y, i32 104
  %wide.load10.26 = load <4 x i32>, ptr %128, align 4
  %129 = mul nsw <4 x i32> %wide.load10.26, %wide.load.26
  %130 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %129)
  %131 = add i32 %130, %126
  %132 = getelementptr inbounds i32, ptr %x, i32 108
  %wide.load.27 = load <4 x i32>, ptr %132, align 4
  %133 = getelementptr inbounds i32, ptr %y, i32 108
  %wide.load10.27 = load <4 x i32>, ptr %133, align 4
  %134 = mul nsw <4 x i32> %wide.load10.27, %wide.load.27
  %135 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %134)
  %136 = add i32 %135, %131
  %137 = getelementptr inbounds i32, ptr %x, i32 112
  %wide.load.28 = load <4 x i32>, ptr %137, align 4
  %138 = getelementptr inbounds i32, ptr %y, i32 112
  %wide.load10.28 = load <4 x i32>, ptr %138, align 4
  %139 = mul nsw <4 x i32> %wide.load10.28, %wide.load.28
  %140 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %139)
  %141 = add i32 %140, %136
  %142 = getelementptr inbounds i32, ptr %x, i32 116
  %wide.load.29 = load <4 x i32>, ptr %142, align 4
  %143 = getelementptr inbounds i32, ptr %y, i32 116
  %wide.load10.29 = load <4 x i32>, ptr %143, align 4
  %144 = mul nsw <4 x i32> %wide.load10.29, %wide.load.29
  %145 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %144)
  %146 = add i32 %145, %141
  %147 = getelementptr inbounds i32, ptr %x, i32 120
  %wide.load.30 = load <4 x i32>, ptr %147, align 4
  %148 = getelementptr inbounds i32, ptr %y, i32 120
  %wide.load10.30 = load <4 x i32>, ptr %148, align 4
  %149 = mul nsw <4 x i32> %wide.load10.30, %wide.load.30
  %150 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %149)
  %151 = add i32 %150, %146
  %152 = getelementptr inbounds i32, ptr %x, i32 124
  %wide.load.31 = load <4 x i32>, ptr %152, align 4
  %153 = getelementptr inbounds i32, ptr %y, i32 124
  %wide.load10.31 = load <4 x i32>, ptr %153, align 4
  %154 = mul nsw <4 x i32> %wide.load10.31, %wide.load.31
  %155 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %154)
  %156 = add i32 %155, %151
  ret i32 %156
}

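; Multiply-accumulate reductions with i16 elements sign-extended to i32.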
define i32 @mlav2i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrsh.w r2, [r0]
; CHECK-NEXT: ldrsh.w r3, [r1]
; CHECK-NEXT: ldrsh.w r0, [r0, #2]
; CHECK-NEXT: ldrsh.w r1, [r1, #2]
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: smlabb r0, r3, r2, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load i16, ptr %x, align 2
  %conv = sext i16 %0 to i32
  %1 = load i16, ptr %y, align 2
  %conv2 = sext i16 %1 to i32
  %mul = mul nsw i32 %conv2, %conv
  %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
  %2 = load i16, ptr %arrayidx.1, align 2
  %conv.1 = sext i16 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i16, ptr %y, i32 1
  %3 = load i16, ptr %arrayidx1.1, align 2
  %conv2.1 = sext i16 %3 to i32
  %mul.1 = mul nsw i32 %conv2.1, %conv.1
  %add.1 = add nsw i32 %mul.1, %mul
  ret i32 %add.1
}

define i32 @mlav4i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <4 x i16>, ptr %x, align 2
  %1 = sext <4 x i16> %0 to <4 x i32>
  %2 = load <4 x i16>, ptr %y, align 2
  %3 = sext <4 x i16> %2 to <4 x i32>
  %4 = mul nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  ret i32 %5
}

define i32 @mlav8i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = sext <8 x i16> %0 to <8 x i32>
  %2 = load <8 x i16>, ptr %y, align 2
  %3 = sext <8 x i16> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  ret i32 %5
}

define i32 @mlav16i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r2, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = sext <16 x i16> %0 to <16 x i32>
  %2 = load <16 x i16>, ptr %y, align 2
  %3 = sext <16 x i16> %2 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  ret i32 %5
}

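; A 24 element reduction is SLP vectorized as a v8i32 part plus a v16i32 part,
; with the two partial sums combined by a final scalar add.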
define i32 @mlav24i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #24]
; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #32]
; CHECK-NEXT: vldrh.s32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #40]
; CHECK-NEXT: vldrh.s32 q1, [r1, #40]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = sext <8 x i16> %0 to <8 x i32>
  %2 = load <8 x i16>, ptr %y, align 2
  %3 = sext <8 x i16> %2 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i16, ptr %y, i32 8
  %5 = load <16 x i16>, ptr %arrayidx.8, align 2
  %6 = sext <16 x i16> %5 to <16 x i32>
  %7 = load <16 x i16>, ptr %arrayidx1.8, align 2
  %8 = sext <16 x i16> %7 to <16 x i32>
  %9 = mul nsw <16 x i32> %8, %6
  %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9)
  %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  %op.rdx = add nsw i32 %10, %11
  ret i32 %op.rdx
}

define i32 @mlav32i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #8]
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #24]
; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #32]
; CHECK-NEXT: vldrh.s32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #40]
; CHECK-NEXT: vldrh.s32 q1, [r1, #40]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #48]
; CHECK-NEXT: vldrh.s32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #56]
; CHECK-NEXT: vldrh.s32 q1, [r1, #56]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = sext <32 x i16> %0 to <32 x i32>
  %2 = load <32 x i16>, ptr %y, align 2
  %3 = sext <32 x i16> %2 to <32 x i32>
  %4 = mul nsw <32 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
  ret i32 %5
}

define i32 @mlav64i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #48]
; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #64]
; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #80]
; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #96]
; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #112]
; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  %wide.load11 = load <8 x i16>, ptr %y, align 2
  %1 = sext <8 x i16> %wide.load11 to <8 x i32>
  %2 = mul nsw <8 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
  %4 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %4, align 2
  %5 = sext <8 x i16> %wide.load.1 to <8 x i32>
  %6 = getelementptr inbounds i16, ptr %y, i32 8
  %wide.load11.1 = load <8 x i16>, ptr %6, align 2
  %7 = sext <8 x i16> %wide.load11.1 to <8 x i32>
  %8 = mul nsw <8 x i32> %7, %5
  %9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %11, align 2
  %12 = sext <8 x i16> %wide.load.2 to <8 x i32>
  %13 = getelementptr inbounds i16, ptr %y, i32 16
  %wide.load11.2 = load <8 x i16>, ptr %13, align 2
  %14 = sext <8 x i16> %wide.load11.2 to <8 x i32>
  %15 = mul nsw <8 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %18, align 2
  %19 = sext <8 x i16> %wide.load.3 to <8 x i32>
  %20 = getelementptr inbounds i16, ptr %y, i32 24
  %wide.load11.3 = load <8 x i16>, ptr %20, align 2
  %21 = sext <8 x i16> %wide.load11.3 to <8 x i32>
  %22 = mul nsw <8 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22)
  %24 = add i32 %23, %17
  %25 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %25, align 2
  %26 = sext <8 x i16> %wide.load.4 to <8 x i32>
  %27 = getelementptr inbounds i16, ptr %y, i32 32
  %wide.load11.4 = load <8 x i16>, ptr %27, align 2
  %28 = sext <8 x i16> %wide.load11.4 to <8 x i32>
  %29 = mul nsw <8 x i32> %28, %26
  %30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29)
  %31 = add i32 %30, %24
  %32 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %32, align 2
  %33 = sext <8 x i16> %wide.load.5 to <8 x i32>
  %34 = getelementptr inbounds i16, ptr %y, i32 40
  %wide.load11.5 = load <8 x i16>, ptr %34, align 2
  %35 = sext <8 x i16> %wide.load11.5 to <8 x i32>
  %36 = mul nsw <8 x i32> %35, %33
  %37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36)
  %38 = add i32 %37, %31
  %39 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %39, align 2
  %40 = sext <8 x i16> %wide.load.6 to <8 x i32>
  %41 = getelementptr inbounds i16, ptr %y, i32 48
  %wide.load11.6 = load <8 x i16>, ptr %41, align 2
  %42 = sext <8 x i16> %wide.load11.6 to <8 x i32>
  %43 = mul nsw <8 x i32> %42, %40
  %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
  %45 = add i32 %44, %38
  %46 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %46, align 2
  %47 = sext <8 x i16> %wide.load.7 to <8 x i32>
  %48 = getelementptr inbounds i16, ptr %y, i32 56
  %wide.load11.7 = load <8 x i16>, ptr %48, align 2
  %49 = sext <8 x i16> %wide.load11.7 to <8 x i32>
  %50 = mul nsw <8 x i32> %49, %47
  %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50)
  %52 = add i32 %51, %45
  ret i32 %52
}

define i32 @mlav128i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #48]
; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #64]
; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #80]
; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #96]
; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #112]
; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #128]
; CHECK-NEXT: vldrh.u16 q1, [r1, #128]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #144]
; CHECK-NEXT: vldrh.u16 q1, [r1, #144]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #160]
; CHECK-NEXT: vldrh.u16 q1, [r1, #160]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #176]
; CHECK-NEXT: vldrh.u16 q1, [r1, #176]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #192]
; CHECK-NEXT: vldrh.u16 q1, [r1, #192]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #208]
; CHECK-NEXT: vldrh.u16 q1, [r1, #208]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #224]
; CHECK-NEXT: vldrh.u16 q1, [r1, #224]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #240]
; CHECK-NEXT: vldrh.u16 q1, [r1, #240]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <8 x i16>, ptr %x, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  %wide.load11 = load <8 x i16>, ptr %y, align 2
  %1 = sext <8 x i16> %wide.load11 to <8 x i32>
  %2 = mul nsw <8 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
  %4 = getelementptr inbounds i16, ptr %x, i32 8
  %wide.load.1 = load <8 x i16>, ptr %4, align 2
  %5 = sext <8 x i16> %wide.load.1 to <8 x i32>
  %6 = getelementptr inbounds i16, ptr %y, i32 8
  %wide.load11.1 = load <8 x i16>, ptr %6, align 2
  %7 = sext <8 x i16> %wide.load11.1 to <8 x i32>
  %8 = mul nsw <8 x i32> %7, %5
  %9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i16, ptr %x, i32 16
  %wide.load.2 = load <8 x i16>, ptr %11, align 2
  %12 = sext <8 x i16> %wide.load.2 to <8 x i32>
  %13 = getelementptr inbounds i16, ptr %y, i32 16
  %wide.load11.2 = load <8 x i16>, ptr %13, align 2
  %14 = sext <8 x i16> %wide.load11.2 to <8 x i32>
  %15 = mul nsw <8 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i16, ptr %x, i32 24
  %wide.load.3 = load <8 x i16>, ptr %18, align 2
  %19 = sext <8 x i16> %wide.load.3 to <8 x i32>
  %20 = getelementptr inbounds i16, ptr %y, i32 24
  %wide.load11.3 = load <8 x i16>, ptr %20, align 2
  %21 = sext <8 x i16> %wide.load11.3 to <8 x i32>
  %22 = mul nsw <8 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22)
  %24 = add i32 %23, %17
  %25 = getelementptr inbounds i16, ptr %x, i32 32
  %wide.load.4 = load <8 x i16>, ptr %25, align 2
  %26 = sext <8 x i16> %wide.load.4 to <8 x i32>
  %27 = getelementptr inbounds i16, ptr %y, i32 32
  %wide.load11.4 = load <8 x i16>, ptr %27, align 2
  %28 = sext <8 x i16> %wide.load11.4 to <8 x i32>
  %29 = mul nsw <8 x i32> %28, %26
  %30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29)
  %31 = add i32 %30, %24
  %32 = getelementptr inbounds i16, ptr %x, i32 40
  %wide.load.5 = load <8 x i16>, ptr %32, align 2
  %33 = sext <8 x i16> %wide.load.5 to <8 x i32>
  %34 = getelementptr inbounds i16, ptr %y, i32 40
  %wide.load11.5 = load <8 x i16>, ptr %34, align 2
  %35 = sext <8 x i16> %wide.load11.5 to <8 x i32>
  %36 = mul nsw <8 x i32> %35, %33
  %37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36)
  %38 = add i32 %37, %31
  %39 = getelementptr inbounds i16, ptr %x, i32 48
  %wide.load.6 = load <8 x i16>, ptr %39, align 2
  %40 = sext <8 x i16> %wide.load.6 to <8 x i32>
  %41 = getelementptr inbounds i16, ptr %y, i32 48
  %wide.load11.6 = load <8 x i16>, ptr %41, align 2
  %42 = sext <8 x i16> %wide.load11.6 to <8 x i32>
  %43 = mul nsw <8 x i32> %42, %40
  %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
  %45 = add i32 %44, %38
  %46 = getelementptr inbounds i16, ptr %x, i32 56
  %wide.load.7 = load <8 x i16>, ptr %46, align 2
  %47 = sext <8 x i16> %wide.load.7 to <8 x i32>
  %48 = getelementptr inbounds i16, ptr %y, i32 56
  %wide.load11.7 = load <8 x i16>, ptr %48, align 2
  %49 = sext <8 x i16> %wide.load11.7 to <8 x i32>
  %50 = mul nsw <8 x i32> %49, %47
  %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50)
  %52 = add i32 %51, %45
  %53 = getelementptr inbounds i16, ptr %x, i32 64
  %wide.load.8 = load <8 x i16>, ptr %53, align 2
  %54 = sext <8 x i16> %wide.load.8 to <8 x i32>
  %55 = getelementptr inbounds i16, ptr %y, i32 64
  %wide.load11.8 = load <8 x i16>, ptr %55, align 2
  %56 = sext <8 x i16> %wide.load11.8 to <8 x i32>
  %57 = mul nsw <8 x i32> %56, %54
  %58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57)
  %59 = add i32 %58, %52
  %60 = getelementptr inbounds i16, ptr %x, i32 72
  %wide.load.9 = load <8 x i16>, ptr %60, align 2
  %61 = sext <8 x i16> %wide.load.9 to <8 x i32>
  %62 = getelementptr inbounds i16, ptr %y, i32 72
  %wide.load11.9 = load <8 x i16>, ptr %62, align 2
  %63 = sext <8 x i16> %wide.load11.9 to <8 x i32>
  %64 = mul nsw <8 x i32> %63, %61
  %65 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %64)
  %66 = add i32 %65, %59
  %67 = getelementptr inbounds i16, ptr %x, i32 80
  %wide.load.10 = load <8 x i16>, ptr %67, align 2
  %68 = sext <8 x i16> %wide.load.10 to <8 x i32>
  %69 = getelementptr inbounds i16, ptr %y, i32 80
  %wide.load11.10 = load <8 x i16>, ptr %69, align 2
  %70 = sext <8 x i16> %wide.load11.10 to <8 x i32>
  %71 = mul nsw <8 x i32> %70, %68
  %72 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %71)
  %73 = add i32 %72, %66
  %74 = getelementptr inbounds i16, ptr %x, i32 88
  %wide.load.11 = load <8 x i16>, ptr %74, align 2
  %75 = sext <8 x i16> %wide.load.11 to <8 x i32>
  %76 = getelementptr inbounds i16, ptr %y, i32 88
  %wide.load11.11 = load <8 x i16>, ptr %76, align 2
  %77 = sext <8 x i16> %wide.load11.11 to <8 x i32>
  %78 = mul nsw <8 x i32> %77, %75
  %79 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %78)
  %80 = add i32 %79, %73
  %81 = getelementptr inbounds i16, ptr %x, i32 96
  %wide.load.12 = load <8 x i16>, ptr %81, align 2
  %82 = sext <8 x i16> %wide.load.12 to <8 x i32>
  %83 = getelementptr inbounds i16, ptr %y, i32 96
  %wide.load11.12 = load <8 x i16>, ptr %83, align 2
  %84 = sext <8 x i16> %wide.load11.12 to <8 x i32>
  %85 = mul nsw <8 x i32> %84, %82
  %86 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %85)
  %87 = add i32 %86, %80
  %88 = getelementptr inbounds i16, ptr %x, i32 104
  %wide.load.13 = load <8 x i16>, ptr %88, align 2
  %89 = sext <8 x i16> %wide.load.13 to <8 x i32>
  %90 = getelementptr inbounds i16, ptr %y, i32 104
  %wide.load11.13 = load <8 x i16>, ptr %90, align 2
  %91 = sext <8 x i16> %wide.load11.13 to <8 x i32>
  %92 = mul nsw <8 x i32> %91, %89
  %93 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %92)
  %94 = add i32 %93, %87
  %95 = getelementptr inbounds i16, ptr %x, i32 112
  %wide.load.14 = load <8 x i16>, ptr %95, align 2
  %96 = sext <8 x i16> %wide.load.14 to <8 x i32>
  %97 = getelementptr inbounds i16, ptr %y, i32 112
  %wide.load11.14 = load <8 x i16>, ptr %97, align 2
  %98 = sext <8 x i16> %wide.load11.14 to <8 x i32>
  %99 = mul nsw <8 x i32> %98, %96
  %100 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %99)
  %101 = add i32 %100, %94
  %102 = getelementptr inbounds i16, ptr %x, i32 120
  %wide.load.15 = load <8 x i16>, ptr %102, align 2
  %103 = sext <8 x i16> %wide.load.15 to <8 x i32>
  %104 = getelementptr inbounds i16, ptr %y, i32 120
  %wide.load11.15 = load <8 x i16>, ptr %104, align 2
  %105 = sext <8 x i16> %wide.load11.15 to <8 x i32>
  %106 = mul nsw <8 x i32> %105, %103
  %107 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %106)
  %108 = add i32 %107, %101
  ret i32 %108
}

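; The same multiply-accumulate reductions with i8 elements zero-extended to
; i32, using vmlav.u8/vmlava.u8 once 16 lanes are available.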
define i32 @mlav2i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r2, [r0]
; CHECK-NEXT: ldrb r3, [r1]
; CHECK-NEXT: ldrb r0, [r0, #1]
; CHECK-NEXT: ldrb r1, [r1, #1]
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: smlabb r0, r3, r2, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load i8, ptr %x, align 1
  %conv = zext i8 %0 to i32
  %1 = load i8, ptr %y, align 1
  %conv2 = zext i8 %1 to i32
  %mul = mul nuw nsw i32 %conv2, %conv
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %conv.1 = zext i8 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i8, ptr %y, i32 1
  %3 = load i8, ptr %arrayidx1.1, align 1
  %conv2.1 = zext i8 %3 to i32
  %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
  %add.1 = add nuw nsw i32 %mul.1, %mul
  ret i32 %add.1
}

define i32 @mlav4i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <4 x i8>, ptr %x, align 1
  %1 = zext <4 x i8> %0 to <4 x i32>
  %2 = load <4 x i8>, ptr %y, align 1
  %3 = zext <4 x i8> %2 to <4 x i32>
  %4 = mul nuw nsw <4 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
  ret i32 %5
}

define i32 @mlav8i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %y, align 1
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = mul nuw nsw <8 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  ret i32 %5
}

define i32 @mlav16i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <16 x i8>, ptr %x, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %y, align 1
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = mul nuw nsw <16 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
  ret i32 %5
}

define i32 @mlav24i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #8]
; CHECK-NEXT: vldrb.u8 q1, [r1, #8]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i8>, ptr %x, align 1
  %1 = zext <8 x i8> %0 to <8 x i32>
  %2 = load <8 x i8>, ptr %y, align 1
  %3 = zext <8 x i8> %2 to <8 x i32>
  %4 = mul nuw nsw <8 x i32> %3, %1
  %arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i8, ptr %y, i32 8
  %5 = load <16 x i8>, ptr %arrayidx.8, align 1
  %6 = zext <16 x i8> %5 to <16 x i32>
  %7 = load <16 x i8>, ptr %arrayidx1.8, align 1
  %8 = zext <16 x i8> %7 to <16 x i32>
  %9 = mul nuw nsw <16 x i32> %8, %6
  %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9)
  %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
  %op.rdx = add nuw nsw i32 %10, %11
  ret i32 %op.rdx
}

define i32 @mlav32i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #4]
; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #8]
; CHECK-NEXT: vldrb.u32 q1, [r1, #8]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #12]
; CHECK-NEXT: vldrb.u32 q1, [r1, #12]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #16]
; CHECK-NEXT: vldrb.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #20]
; CHECK-NEXT: vldrb.u32 q1, [r1, #20]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #24]
; CHECK-NEXT: vldrb.u32 q1, [r1, #24]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r2, #28]
; CHECK-NEXT: vldrb.u32 q1, [r1, #28]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %0 = load <32 x i8>, ptr %x, align 1
  %1 = zext <32 x i8> %0 to <32 x i32>
  %2 = load <32 x i8>, ptr %y, align 1
  %3 = zext <32 x i8> %2 to <32 x i32>
  %4 = mul nuw nsw <32 x i32> %3, %1
  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
  ret i32 %5
}

define i32 @mlav64i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  %wide.load11 = load <16 x i8>, ptr %y, align 1
  %1 = zext <16 x i8> %wide.load11 to <16 x i32>
  %2 = mul nuw nsw <16 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
  %4 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %4, align 1
  %5 = zext <16 x i8> %wide.load.1 to <16 x i32>
  %6 = getelementptr inbounds i8, ptr %y, i32 16
  %wide.load11.1 = load <16 x i8>, ptr %6, align 1
  %7 = zext <16 x i8> %wide.load11.1 to <16 x i32>
  %8 = mul nuw nsw <16 x i32> %7, %5
  %9 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %8)
  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %11, align 1
  %12 = zext <16 x i8> %wide.load.2 to <16 x i32>
  %13 = getelementptr inbounds i8, ptr %y, i32 32
  %wide.load11.2 = load <16 x i8>, ptr %13, align 1
  %14 = zext <16 x i8> %wide.load11.2 to <16 x i32>
  %15 = mul nuw nsw <16 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %18, align 1
  %19 = zext <16 x i8> %wide.load.3 to <16 x i32>
  %20 = getelementptr inbounds i8, ptr %y, i32 48
  %wide.load11.3 = load <16 x i8>, ptr %20, align 1
  %21 = zext <16 x i8> %wide.load11.3 to <16 x i32>
  %22 = mul nuw nsw <16 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %22)
  %24 = add i32 %23, %17
  ret i32 %24
}

define i32 @mlav128i32i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #16]
; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #32]
; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #48]
; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #64]
; CHECK-NEXT: vldrb.u8 q1, [r1, #64]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #80]
; CHECK-NEXT: vldrb.u8 q1, [r1, #80]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #96]
; CHECK-NEXT: vldrb.u8 q1, [r1, #96]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r2, #112]
; CHECK-NEXT: vldrb.u8 q1, [r1, #112]
; CHECK-NEXT: vmlava.u8 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
  %wide.load = load <16 x i8>, ptr %x, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  %wide.load11 = load <16 x i8>, ptr %y, align 1
  %1 = zext <16 x i8> %wide.load11 to <16 x i32>
  %2 = mul nuw nsw <16 x i32> %1, %0
  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
  %4 = getelementptr inbounds i8, ptr %x, i32 16
  %wide.load.1 = load <16 x i8>, ptr %4, align 1
  %5 = zext <16 x i8> %wide.load.1 to <16 x i32>
  %6 = getelementptr inbounds i8, ptr %y, i32 16
  %wide.load11.1 = load <16 x i8>, ptr %6, align 1
  %7 = zext <16 x i8> %wide.load11.1 to <16 x i32>
  %8 = mul nuw nsw <16 x i32> %7, %5
  %9 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %8)
  %10 = add i32 %9, %3
  %11 = getelementptr inbounds i8, ptr %x, i32 32
  %wide.load.2 = load <16 x i8>, ptr %11, align 1
  %12 = zext <16 x i8> %wide.load.2 to <16 x i32>
  %13 = getelementptr inbounds i8, ptr %y, i32 32
  %wide.load11.2 = load <16 x i8>, ptr %13, align 1
  %14 = zext <16 x i8> %wide.load11.2 to <16 x i32>
  %15 = mul nuw nsw <16 x i32> %14, %12
  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
  %17 = add i32 %16, %10
  %18 = getelementptr inbounds i8, ptr %x, i32 48
  %wide.load.3 = load <16 x i8>, ptr %18, align 1
  %19 = zext <16 x i8> %wide.load.3 to <16 x i32>
  %20 = getelementptr inbounds i8, ptr %y, i32 48
  %wide.load11.3 = load <16 x i8>, ptr %20, align 1
  %21 = zext <16 x i8> %wide.load11.3 to <16 x i32>
  %22 = mul nuw nsw <16 x i32> %21, %19
  %23 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %22)
  %24 = add i32 %23, %17
  %25 = getelementptr inbounds i8, ptr %x, i32 64
  %wide.load.4 = load <16 x i8>, ptr %25, align 1
  %26 = zext <16 x i8> %wide.load.4 to <16 x i32>
  %27 = getelementptr inbounds i8, ptr %y, i32 64
  %wide.load11.4 = load <16 x i8>, ptr %27, align 1
  %28 = zext <16 x i8> %wide.load11.4 to <16 x i32>
  %29 = mul nuw nsw <16 x i32> %28, %26
  %30 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %29)
  %31 = add i32 %30, %24
  %32 = getelementptr inbounds i8, ptr %x, i32 80
  %wide.load.5 = load <16 x i8>, ptr %32, align 1
  %33 = zext <16 x i8> %wide.load.5 to <16 x i32>
  %34 = getelementptr inbounds i8, ptr %y, i32 80
  %wide.load11.5 = load <16 x i8>, ptr %34, align 1
  %35 = zext <16 x i8> %wide.load11.5 to <16 x i32>
  %36 = mul nuw nsw <16 x i32> %35, %33
  %37 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %36)
  %38 = add i32 %37, %31
  %39 = getelementptr inbounds i8, ptr %x, i32 96
  %wide.load.6 = load <16 x i8>, ptr %39, align 1
  %40 = zext <16 x i8> %wide.load.6 to <16 x i32>
  %41 = getelementptr inbounds i8, ptr %y, i32 96
  %wide.load11.6 = load <16 x i8>, ptr %41, align 1
  %42 = zext <16 x i8> %wide.load11.6 to <16 x i32>
  %43 = mul nuw nsw <16 x i32> %42, %40
  %44 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %43)
  %45 = add i32 %44, %38
  %46 = getelementptr inbounds i8, ptr %x, i32 112
  %wide.load.7 = load <16 x i8>, ptr %46, align 1
  %47 = zext <16 x i8> %wide.load.7 to <16 x i32>
  %48 = getelementptr inbounds i8, ptr %y, i32 112
  %wide.load11.7 = load <16 x i8>, ptr %48, align 1
  %49 = zext <16 x i8> %wide.load11.7 to <16 x i32>
  %50 = mul nuw nsw <16 x i32> %49, %47
  %51 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %50)
  %52 = add i32 %51, %45
  ret i32 %52
}

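; i16 -> i16 reductions: the multiply and the accumulate both stay in 16 bits,
; and only the final result is sign-extended (sxth) for the return.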
define signext i16 @mlav2i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrh r2, [r0]
; CHECK-NEXT: ldrh r3, [r1]
; CHECK-NEXT: ldrh r0, [r0, #2]
; CHECK-NEXT: ldrh r1, [r1, #2]
; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: mla r0, r1, r0, r2
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load i16, ptr %x, align 2
  %1 = load i16, ptr %y, align 2
  %mul = mul i16 %1, %0
  %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
  %2 = load i16, ptr %arrayidx.1, align 2
  %arrayidx1.1 = getelementptr inbounds i16, ptr %y, i32 1
  %3 = load i16, ptr %arrayidx1.1, align 2
  %mul.1 = mul i16 %3, %2
  %add.1 = add i16 %mul.1, %mul
  ret i16 %add.1
}

define signext i16 @mlav4i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q0, [r0]
; CHECK-NEXT: vldrh.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load <4 x i16>, ptr %x, align 2
  %1 = load <4 x i16>, ptr %y, align 2
  %2 = mul <4 x i16> %1, %0
  %3 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %2)
  ret i16 %3
}

define signext i16 @mlav8i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r0, q1, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = load <8 x i16>, ptr %y, align 2
  %2 = mul <8 x i16> %1, %0
  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
  ret i16 %3
}

define signext i16 @mlav16i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
  %0 = load <16 x i16>, ptr %x, align 2
  %1 = load <16 x i16>, ptr %y, align 2
  %2 = mul <16 x i16> %1, %0
  %3 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %2)
  ret i16 %3
}

define signext i16 @mlav24i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
  %0 = load <8 x i16>, ptr %x, align 2
  %1 = load <8 x i16>, ptr %y, align 2
  %2 = mul <8 x i16> %1, %0
  %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8
  %arrayidx1.8 = getelementptr inbounds i16, ptr %y, i32 8
  %3 = load <16 x i16>, ptr %arrayidx.8, align 2
  %4 = load <16 x i16>, ptr %arrayidx1.8, align 2
  %5 = mul <16 x i16> %4, %3
  %6 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %5)
  %7 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
  %op.rdx = add i16 %6, %7
  ret i16 %op.rdx
}

define signext i16 @mlav32i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
  %0 = load <32 x i16>, ptr %x, align 2
  %1 = load <32 x i16>, ptr %y, align 2
  %2 = mul <32 x i16> %1, %0
  %3 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %2)
  ret i16 %3
}

define signext i16 @mlav64i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: sxth r0, r2
2883 %wide.load = load <8 x i16>, ptr %x, align 2
2884 %wide.load13 = load <8 x i16>, ptr %y, align 2
2885 %0 = mul <8 x i16> %wide.load13, %wide.load
2886 %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
2887 %2 = getelementptr inbounds i16, ptr %x, i32 8
2888 %wide.load.1 = load <8 x i16>, ptr %2, align 2
2889 %3 = getelementptr inbounds i16, ptr %y, i32 8
2890 %wide.load13.1 = load <8 x i16>, ptr %3, align 2
2891 %4 = mul <8 x i16> %wide.load13.1, %wide.load.1
2892 %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4)
2894 %7 = getelementptr inbounds i16, ptr %x, i32 16
2895 %wide.load.2 = load <8 x i16>, ptr %7, align 2
2896 %8 = getelementptr inbounds i16, ptr %y, i32 16
2897 %wide.load13.2 = load <8 x i16>, ptr %8, align 2
2898 %9 = mul <8 x i16> %wide.load13.2, %wide.load.2
2899 %10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %9)
2900 %11 = add i16 %10, %6
2901 %12 = getelementptr inbounds i16, ptr %x, i32 24
2902 %wide.load.3 = load <8 x i16>, ptr %12, align 2
2903 %13 = getelementptr inbounds i16, ptr %y, i32 24
2904 %wide.load13.3 = load <8 x i16>, ptr %13, align 2
2905 %14 = mul <8 x i16> %wide.load13.3, %wide.load.3
2906 %15 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %14)
2907 %16 = add i16 %15, %11
2908 %17 = getelementptr inbounds i16, ptr %x, i32 32
2909 %wide.load.4 = load <8 x i16>, ptr %17, align 2
2910 %18 = getelementptr inbounds i16, ptr %y, i32 32
2911 %wide.load13.4 = load <8 x i16>, ptr %18, align 2
2912 %19 = mul <8 x i16> %wide.load13.4, %wide.load.4
2913 %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19)
2914 %21 = add i16 %20, %16
2915 %22 = getelementptr inbounds i16, ptr %x, i32 40
2916 %wide.load.5 = load <8 x i16>, ptr %22, align 2
2917 %23 = getelementptr inbounds i16, ptr %y, i32 40
2918 %wide.load13.5 = load <8 x i16>, ptr %23, align 2
2919 %24 = mul <8 x i16> %wide.load13.5, %wide.load.5
2920 %25 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %24)
2921 %26 = add i16 %25, %21
2922 %27 = getelementptr inbounds i16, ptr %x, i32 48
2923 %wide.load.6 = load <8 x i16>, ptr %27, align 2
2924 %28 = getelementptr inbounds i16, ptr %y, i32 48
2925 %wide.load13.6 = load <8 x i16>, ptr %28, align 2
2926 %29 = mul <8 x i16> %wide.load13.6, %wide.load.6
2927 %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29)
2928 %31 = add i16 %30, %26
2929 %32 = getelementptr inbounds i16, ptr %x, i32 56
2930 %wide.load.7 = load <8 x i16>, ptr %32, align 2
2931 %33 = getelementptr inbounds i16, ptr %y, i32 56
2932 %wide.load13.7 = load <8 x i16>, ptr %33, align 2
2933 %34 = mul <8 x i16> %wide.load13.7, %wide.load.7
2934 %35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %34)
2935 %36 = add i16 %35, %31
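; The 128 x i16 case: sixteen <8 x i16> chunks, one vmlava.u16 per chunk after
; the initial vmlav.u16, with load offsets stepping by 16 bytes up to #240.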
define signext i16 @mlav128i16i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #128]
; CHECK-NEXT: vldrh.u16 q1, [r1, #128]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #144]
; CHECK-NEXT: vldrh.u16 q1, [r1, #144]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #160]
; CHECK-NEXT: vldrh.u16 q1, [r1, #160]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #176]
; CHECK-NEXT: vldrh.u16 q1, [r1, #176]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #192]
; CHECK-NEXT: vldrh.u16 q1, [r1, #192]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #208]
; CHECK-NEXT: vldrh.u16 q1, [r1, #208]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #224]
; CHECK-NEXT: vldrh.u16 q1, [r1, #224]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #240]
; CHECK-NEXT: vldrh.u16 q1, [r1, #240]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%wide.load = load <8 x i16>, ptr %x, align 2
%wide.load13 = load <8 x i16>, ptr %y, align 2
%0 = mul <8 x i16> %wide.load13, %wide.load
%1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
%2 = getelementptr inbounds i16, ptr %x, i32 8
%wide.load.1 = load <8 x i16>, ptr %2, align 2
%3 = getelementptr inbounds i16, ptr %y, i32 8
%wide.load13.1 = load <8 x i16>, ptr %3, align 2
%4 = mul <8 x i16> %wide.load13.1, %wide.load.1
%5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4)
%6 = add i16 %5, %1
%7 = getelementptr inbounds i16, ptr %x, i32 16
%wide.load.2 = load <8 x i16>, ptr %7, align 2
%8 = getelementptr inbounds i16, ptr %y, i32 16
%wide.load13.2 = load <8 x i16>, ptr %8, align 2
%9 = mul <8 x i16> %wide.load13.2, %wide.load.2
%10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %9)
%11 = add i16 %10, %6
%12 = getelementptr inbounds i16, ptr %x, i32 24
%wide.load.3 = load <8 x i16>, ptr %12, align 2
%13 = getelementptr inbounds i16, ptr %y, i32 24
%wide.load13.3 = load <8 x i16>, ptr %13, align 2
%14 = mul <8 x i16> %wide.load13.3, %wide.load.3
%15 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %14)
%16 = add i16 %15, %11
%17 = getelementptr inbounds i16, ptr %x, i32 32
%wide.load.4 = load <8 x i16>, ptr %17, align 2
%18 = getelementptr inbounds i16, ptr %y, i32 32
%wide.load13.4 = load <8 x i16>, ptr %18, align 2
%19 = mul <8 x i16> %wide.load13.4, %wide.load.4
%20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19)
%21 = add i16 %20, %16
%22 = getelementptr inbounds i16, ptr %x, i32 40
%wide.load.5 = load <8 x i16>, ptr %22, align 2
%23 = getelementptr inbounds i16, ptr %y, i32 40
%wide.load13.5 = load <8 x i16>, ptr %23, align 2
%24 = mul <8 x i16> %wide.load13.5, %wide.load.5
%25 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %24)
%26 = add i16 %25, %21
%27 = getelementptr inbounds i16, ptr %x, i32 48
%wide.load.6 = load <8 x i16>, ptr %27, align 2
%28 = getelementptr inbounds i16, ptr %y, i32 48
%wide.load13.6 = load <8 x i16>, ptr %28, align 2
%29 = mul <8 x i16> %wide.load13.6, %wide.load.6
%30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29)
%31 = add i16 %30, %26
%32 = getelementptr inbounds i16, ptr %x, i32 56
%wide.load.7 = load <8 x i16>, ptr %32, align 2
%33 = getelementptr inbounds i16, ptr %y, i32 56
%wide.load13.7 = load <8 x i16>, ptr %33, align 2
%34 = mul <8 x i16> %wide.load13.7, %wide.load.7
%35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %34)
%36 = add i16 %35, %31
%37 = getelementptr inbounds i16, ptr %x, i32 64
%wide.load.8 = load <8 x i16>, ptr %37, align 2
%38 = getelementptr inbounds i16, ptr %y, i32 64
%wide.load13.8 = load <8 x i16>, ptr %38, align 2
%39 = mul <8 x i16> %wide.load13.8, %wide.load.8
%40 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %39)
%41 = add i16 %40, %36
%42 = getelementptr inbounds i16, ptr %x, i32 72
%wide.load.9 = load <8 x i16>, ptr %42, align 2
%43 = getelementptr inbounds i16, ptr %y, i32 72
%wide.load13.9 = load <8 x i16>, ptr %43, align 2
%44 = mul <8 x i16> %wide.load13.9, %wide.load.9
%45 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %44)
%46 = add i16 %45, %41
%47 = getelementptr inbounds i16, ptr %x, i32 80
%wide.load.10 = load <8 x i16>, ptr %47, align 2
%48 = getelementptr inbounds i16, ptr %y, i32 80
%wide.load13.10 = load <8 x i16>, ptr %48, align 2
%49 = mul <8 x i16> %wide.load13.10, %wide.load.10
%50 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %49)
%51 = add i16 %50, %46
%52 = getelementptr inbounds i16, ptr %x, i32 88
%wide.load.11 = load <8 x i16>, ptr %52, align 2
%53 = getelementptr inbounds i16, ptr %y, i32 88
%wide.load13.11 = load <8 x i16>, ptr %53, align 2
%54 = mul <8 x i16> %wide.load13.11, %wide.load.11
%55 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %54)
%56 = add i16 %55, %51
%57 = getelementptr inbounds i16, ptr %x, i32 96
%wide.load.12 = load <8 x i16>, ptr %57, align 2
%58 = getelementptr inbounds i16, ptr %y, i32 96
%wide.load13.12 = load <8 x i16>, ptr %58, align 2
%59 = mul <8 x i16> %wide.load13.12, %wide.load.12
%60 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %59)
%61 = add i16 %60, %56
%62 = getelementptr inbounds i16, ptr %x, i32 104
%wide.load.13 = load <8 x i16>, ptr %62, align 2
%63 = getelementptr inbounds i16, ptr %y, i32 104
%wide.load13.13 = load <8 x i16>, ptr %63, align 2
%64 = mul <8 x i16> %wide.load13.13, %wide.load.13
%65 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %64)
%66 = add i16 %65, %61
%67 = getelementptr inbounds i16, ptr %x, i32 112
%wide.load.14 = load <8 x i16>, ptr %67, align 2
%68 = getelementptr inbounds i16, ptr %y, i32 112
%wide.load13.14 = load <8 x i16>, ptr %68, align 2
%69 = mul <8 x i16> %wide.load13.14, %wide.load.14
%70 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %69)
%71 = add i16 %70, %66
%72 = getelementptr inbounds i16, ptr %x, i32 120
%wide.load.15 = load <8 x i16>, ptr %72, align 2
%73 = getelementptr inbounds i16, ptr %y, i32 120
%wide.load13.15 = load <8 x i16>, ptr %73, align 2
%74 = mul <8 x i16> %wide.load13.15, %wide.load.15
%75 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %74)
%76 = add i16 %75, %71
ret i16 %76
}

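; i8 multiply-add reductions follow, zero-extending the result with uxtb. The
; 2 x i8 case is too small to vectorize and stays scalar (muls + mla).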
define zeroext i8 @mlav2i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r2, [r0]
; CHECK-NEXT: ldrb r3, [r1]
; CHECK-NEXT: ldrb r0, [r0, #1]
; CHECK-NEXT: ldrb r1, [r1, #1]
; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: mla r0, r1, r0, r2
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load i8, ptr %x, align 1
%1 = load i8, ptr %y, align 1
%mul = mul i8 %1, %0
%arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
%2 = load i8, ptr %arrayidx.1, align 1
%arrayidx1.1 = getelementptr inbounds i8, ptr %y, i32 1
%3 = load i8, ptr %arrayidx1.1, align 1
%mul.1 = mul i8 %3, %2
%add.1 = add i8 %mul.1, %mul
ret i8 %add.1
}

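; 4 x i8: the loads are widened to 32-bit lanes with vldrb.u32 so a single
; vmlav.u32 can perform the reduction.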
define zeroext i8 @mlav4i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i8>, ptr %x, align 1
%1 = load <4 x i8>, ptr %y, align 1
%2 = mul <4 x i8> %1, %0
%3 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %2)
ret i8 %3
}

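; 8 x i8: widened to 16-bit lanes with vldrb.u16 and reduced with vmlav.u16.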
define zeroext i8 @mlav8i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r0, q1, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i8>, ptr %x, align 1
%1 = load <8 x i8>, ptr %y, align 1
%2 = mul <8 x i8> %1, %0
%3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %2)
ret i8 %3
}

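; 16 x i8 fills a whole vector, so vldrb.u8 feeds vmlav.u8 with no widening.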
define zeroext i8 @mlav16i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r0, q1, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i8>, ptr %x, align 1
%1 = load <16 x i8>, ptr %y, align 1
%2 = mul <16 x i8> %1, %0
%3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
ret i8 %3
}

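; 24 x i8 splits as 8 + 16: the 8-wide half is widened to u16 lanes, the
; 16-wide half is reduced at u8, and both feed the same accumulator.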
define zeroext i8 @mlav24i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vldrb.u16 q1, [r1]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #8]
; CHECK-NEXT: vldrb.u8 q1, [r1, #8]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i8>, ptr %x, align 1
%1 = load <8 x i8>, ptr %y, align 1
%2 = mul <8 x i8> %1, %0
%arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8
%arrayidx1.8 = getelementptr inbounds i8, ptr %y, i32 8
%3 = load <16 x i8>, ptr %arrayidx.8, align 1
%4 = load <16 x i8>, ptr %arrayidx1.8, align 1
%5 = mul <16 x i8> %4, %3
%6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
%7 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %2)
%op.rdx = add i8 %6, %7
ret i8 %op.rdx
}

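; 32 x i8: two full <16 x i8> vectors, vmlav.u8 then vmlava.u8.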
define zeroext i8 @mlav32i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i8>, ptr %x, align 1
%1 = load <32 x i8>, ptr %y, align 1
%2 = mul <32 x i8> %1, %0
%3 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %2)
ret i8 %3
}

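; 64 x i8: four <16 x i8> chunks chained through vmlava.u8.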
define zeroext i8 @mlav64i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
%wide.load = load <16 x i8>, ptr %x, align 1
%wide.load12 = load <16 x i8>, ptr %y, align 1
%0 = mul <16 x i8> %wide.load12, %wide.load
%1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0)
%2 = getelementptr inbounds i8, ptr %x, i32 16
%wide.load.1 = load <16 x i8>, ptr %2, align 1
%3 = getelementptr inbounds i8, ptr %y, i32 16
%wide.load12.1 = load <16 x i8>, ptr %3, align 1
%4 = mul <16 x i8> %wide.load12.1, %wide.load.1
%5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %4)
%6 = add i8 %5, %1
%7 = getelementptr inbounds i8, ptr %x, i32 32
%wide.load.2 = load <16 x i8>, ptr %7, align 1
%8 = getelementptr inbounds i8, ptr %y, i32 32
%wide.load12.2 = load <16 x i8>, ptr %8, align 1
%9 = mul <16 x i8> %wide.load12.2, %wide.load.2
%10 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %9)
%11 = add i8 %10, %6
%12 = getelementptr inbounds i8, ptr %x, i32 48
%wide.load.3 = load <16 x i8>, ptr %12, align 1
%13 = getelementptr inbounds i8, ptr %y, i32 48
%wide.load12.3 = load <16 x i8>, ptr %13, align 1
%14 = mul <16 x i8> %wide.load12.3, %wide.load.3
%15 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %14)
%16 = add i8 %15, %11
ret i8 %16
}

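; 128 x i8: eight <16 x i8> chunks at byte offsets 0 through 112.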
define zeroext i8 @mlav128i8i8(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vldrb.u8 q1, [r1]
; CHECK-NEXT: vmlav.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #64]
; CHECK-NEXT: vldrb.u8 q1, [r1, #64]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #80]
; CHECK-NEXT: vldrb.u8 q1, [r1, #80]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #96]
; CHECK-NEXT: vldrb.u8 q1, [r1, #96]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #112]
; CHECK-NEXT: vldrb.u8 q1, [r1, #112]
; CHECK-NEXT: vmlava.u8 r2, q1, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
%wide.load = load <16 x i8>, ptr %x, align 1
%wide.load12 = load <16 x i8>, ptr %y, align 1
%0 = mul <16 x i8> %wide.load12, %wide.load
%1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0)
%2 = getelementptr inbounds i8, ptr %x, i32 16
%wide.load.1 = load <16 x i8>, ptr %2, align 1
%3 = getelementptr inbounds i8, ptr %y, i32 16
%wide.load12.1 = load <16 x i8>, ptr %3, align 1
%4 = mul <16 x i8> %wide.load12.1, %wide.load.1
%5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %4)
%6 = add i8 %5, %1
%7 = getelementptr inbounds i8, ptr %x, i32 32
%wide.load.2 = load <16 x i8>, ptr %7, align 1
%8 = getelementptr inbounds i8, ptr %y, i32 32
%wide.load12.2 = load <16 x i8>, ptr %8, align 1
%9 = mul <16 x i8> %wide.load12.2, %wide.load.2
%10 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %9)
%11 = add i8 %10, %6
%12 = getelementptr inbounds i8, ptr %x, i32 48
%wide.load.3 = load <16 x i8>, ptr %12, align 1
%13 = getelementptr inbounds i8, ptr %y, i32 48
%wide.load12.3 = load <16 x i8>, ptr %13, align 1
%14 = mul <16 x i8> %wide.load12.3, %wide.load.3
%15 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %14)
%16 = add i8 %15, %11
%17 = getelementptr inbounds i8, ptr %x, i32 64
%wide.load.4 = load <16 x i8>, ptr %17, align 1
%18 = getelementptr inbounds i8, ptr %y, i32 64
%wide.load12.4 = load <16 x i8>, ptr %18, align 1
%19 = mul <16 x i8> %wide.load12.4, %wide.load.4
%20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %19)
%21 = add i8 %20, %16
%22 = getelementptr inbounds i8, ptr %x, i32 80
%wide.load.5 = load <16 x i8>, ptr %22, align 1
%23 = getelementptr inbounds i8, ptr %y, i32 80
%wide.load12.5 = load <16 x i8>, ptr %23, align 1
%24 = mul <16 x i8> %wide.load12.5, %wide.load.5
%25 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %24)
%26 = add i8 %25, %21
%27 = getelementptr inbounds i8, ptr %x, i32 96
%wide.load.6 = load <16 x i8>, ptr %27, align 1
%28 = getelementptr inbounds i8, ptr %y, i32 96
%wide.load12.6 = load <16 x i8>, ptr %28, align 1
%29 = mul <16 x i8> %wide.load12.6, %wide.load.6
%30 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %29)
%31 = add i8 %30, %26
%32 = getelementptr inbounds i8, ptr %x, i32 112
%wide.load.7 = load <16 x i8>, ptr %32, align 1
%33 = getelementptr inbounds i8, ptr %y, i32 112
%wide.load12.7 = load <16 x i8>, ptr %33, align 1
%34 = mul <16 x i8> %wide.load12.7, %wide.load.7
%35 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %34)
%36 = add i8 %35, %31
ret i8 %36
}

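; The remaining tests check that the sum of two vector reductions plus constant
; operands becomes vaddv/vaddva with the constants folded into one immediate add
; (10, 10, and 10 + 10 = 20 respectively).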
define arm_aapcs_vfpcc i32 @add_two_const(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u32 r0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: adds r0, #10
; CHECK-NEXT: bx lr
entry:
%a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
%c = add i32 %a, %b
%d = add i32 %c, 10
ret i32 %d
}

define arm_aapcs_vfpcc i32 @add_two_const2(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u32 r0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: adds r0, #10
; CHECK-NEXT: bx lr
entry:
%a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
%c = add i32 %a, 10
%d = add i32 %c, %b
ret i32 %d
}

define arm_aapcs_vfpcc i32 @add_two_const3(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_two_const3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: vaddva.u32 r0, q1
; CHECK-NEXT: adds r0, #20
; CHECK-NEXT: bx lr
entry:
%a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
%c = add i32 %a, 10
%d = add i32 %b, 10
%e = add i32 %c, %d
ret i32 %e
}

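; Declarations of the vector reduction intrinsics used by the tests in this file.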
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)