1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
4 ; Various reductions generated from SLP vectorizing unrolled loops. Generated
5 ; from https://godbolt.org/z/ebxdPh1Kz with some less interesting cases removed.
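;
; The original C input is assumed to have been of roughly this shape (a sketch,
; not the exact godbolt source): a fixed-trip-count accumulation loop that is
; fully unrolled and then SLP-vectorized, e.g. for the addv4i32i32 case:
;
;   int addv4i32i32(int *x) {
;     int s = 0;
;     for (int i = 0; i < 4; i++)
;       s += x[i];
;     return s;
;   }
;
; The mlav* functions below follow the same pattern but multiply two arrays
; element-wise before accumulating.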
7 define i32 @addv2i32i32(i32* %x) {
8 ; CHECK-LABEL: addv2i32i32:
9 ; CHECK: @ %bb.0: @ %entry
10 ; CHECK-NEXT: ldrd r1, r0, [r0]
11 ; CHECK-NEXT: add r0, r1
14 %0 = load i32, i32* %x, align 4
15 %arrayidx.1 = getelementptr inbounds i32, i32* %x, i32 1
16 %1 = load i32, i32* %arrayidx.1, align 4
17 %add.1 = add nsw i32 %1, %0
21 define i32 @addv4i32i32(i32* %x) {
22 ; CHECK-LABEL: addv4i32i32:
23 ; CHECK: @ %bb.0: @ %entry
24 ; CHECK-NEXT: vldrw.u32 q0, [r0]
25 ; CHECK-NEXT: vaddv.u32 r0, q0
28 %0 = bitcast i32* %x to <4 x i32>*
29 %1 = load <4 x i32>, <4 x i32>* %0, align 4
30 %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
34 define i32 @addv8i32i32(i32* %x) {
35 ; CHECK-LABEL: addv8i32i32:
36 ; CHECK: @ %bb.0: @ %entry
37 ; CHECK-NEXT: vldrw.u32 q1, [r0]
38 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
39 ; CHECK-NEXT: vaddv.u32 r0, q1
40 ; CHECK-NEXT: vaddva.u32 r0, q0
43 %0 = bitcast i32* %x to <8 x i32>*
44 %1 = load <8 x i32>, <8 x i32>* %0, align 4
45 %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
49 define i32 @addv16i32i32(i32* %x) {
50 ; CHECK-LABEL: addv16i32i32:
51 ; CHECK: @ %bb.0: @ %entry
52 ; CHECK-NEXT: vldrw.u32 q1, [r0]
53 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
54 ; CHECK-NEXT: vaddv.u32 r2, q1
55 ; CHECK-NEXT: vaddva.u32 r2, q0
56 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
57 ; CHECK-NEXT: vaddva.u32 r2, q0
58 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
59 ; CHECK-NEXT: vaddva.u32 r2, q0
60 ; CHECK-NEXT: mov r0, r2
63 %0 = bitcast i32* %x to <16 x i32>*
64 %1 = load <16 x i32>, <16 x i32>* %0, align 4
65 %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
69 define i32 @addv24i32i32(i32* %x) {
70 ; CHECK-LABEL: addv24i32i32:
71 ; CHECK: @ %bb.0: @ %entry
72 ; CHECK-NEXT: vldrw.u32 q1, [r0]
73 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
74 ; CHECK-NEXT: vaddv.u32 r2, q1
75 ; CHECK-NEXT: vaddva.u32 r2, q0
76 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
77 ; CHECK-NEXT: vaddva.u32 r2, q0
78 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
79 ; CHECK-NEXT: vaddva.u32 r2, q0
80 ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
81 ; CHECK-NEXT: vaddva.u32 r2, q0
82 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
83 ; CHECK-NEXT: vaddva.u32 r2, q0
84 ; CHECK-NEXT: mov r0, r2
87 %0 = bitcast i32* %x to <8 x i32>*
88 %1 = load <8 x i32>, <8 x i32>* %0, align 4
89 %arrayidx.8 = getelementptr inbounds i32, i32* %x, i32 8
90 %2 = bitcast i32* %arrayidx.8 to <16 x i32>*
91 %3 = load <16 x i32>, <16 x i32>* %2, align 4
92 %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
93 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
94 %op.rdx = add nsw i32 %4, %5
98 define i32 @addv32i32i32(i32* %x) {
99 ; CHECK-LABEL: addv32i32i32:
100 ; CHECK: @ %bb.0: @ %entry
101 ; CHECK-NEXT: vldrw.u32 q1, [r0]
102 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
103 ; CHECK-NEXT: mov r1, r0
104 ; CHECK-NEXT: vaddv.u32 r0, q1
105 ; CHECK-NEXT: vaddva.u32 r0, q0
106 ; CHECK-NEXT: vldrw.u32 q0, [r1, #32]
107 ; CHECK-NEXT: vaddva.u32 r0, q0
108 ; CHECK-NEXT: vldrw.u32 q0, [r1, #48]
109 ; CHECK-NEXT: vaddva.u32 r0, q0
110 ; CHECK-NEXT: vldrw.u32 q0, [r1, #64]
111 ; CHECK-NEXT: vaddva.u32 r0, q0
112 ; CHECK-NEXT: vldrw.u32 q0, [r1, #80]
113 ; CHECK-NEXT: vaddva.u32 r0, q0
114 ; CHECK-NEXT: vldrw.u32 q0, [r1, #96]
115 ; CHECK-NEXT: vaddva.u32 r0, q0
116 ; CHECK-NEXT: vldrw.u32 q0, [r1, #112]
117 ; CHECK-NEXT: vaddva.u32 r0, q0
120 %0 = bitcast i32* %x to <32 x i32>*
121 %1 = load <32 x i32>, <32 x i32>* %0, align 4
122 %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
126 define i32 @addv64i32i32(i32* %x) {
127 ; CHECK-LABEL: addv64i32i32:
128 ; CHECK: @ %bb.0: @ %entry
129 ; CHECK-NEXT: vldrw.u32 q1, [r0]
130 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
131 ; CHECK-NEXT: vaddv.u32 r2, q1
132 ; CHECK-NEXT: vaddva.u32 r2, q0
133 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
134 ; CHECK-NEXT: vaddva.u32 r2, q0
135 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
136 ; CHECK-NEXT: vaddva.u32 r2, q0
137 ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
138 ; CHECK-NEXT: vaddva.u32 r2, q0
139 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
140 ; CHECK-NEXT: vaddva.u32 r2, q0
141 ; CHECK-NEXT: vldrw.u32 q0, [r0, #96]
142 ; CHECK-NEXT: vaddva.u32 r2, q0
143 ; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
144 ; CHECK-NEXT: vaddva.u32 r2, q0
145 ; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
146 ; CHECK-NEXT: vaddva.u32 r2, q0
147 ; CHECK-NEXT: vldrw.u32 q0, [r0, #144]
148 ; CHECK-NEXT: vaddva.u32 r2, q0
149 ; CHECK-NEXT: vldrw.u32 q0, [r0, #160]
150 ; CHECK-NEXT: vaddva.u32 r2, q0
151 ; CHECK-NEXT: vldrw.u32 q0, [r0, #176]
152 ; CHECK-NEXT: vaddva.u32 r2, q0
153 ; CHECK-NEXT: vldrw.u32 q0, [r0, #192]
154 ; CHECK-NEXT: vaddva.u32 r2, q0
155 ; CHECK-NEXT: vldrw.u32 q0, [r0, #208]
156 ; CHECK-NEXT: vaddva.u32 r2, q0
157 ; CHECK-NEXT: vldrw.u32 q0, [r0, #224]
158 ; CHECK-NEXT: vaddva.u32 r2, q0
159 ; CHECK-NEXT: vldrw.u32 q0, [r0, #240]
160 ; CHECK-NEXT: vaddva.u32 r2, q0
161 ; CHECK-NEXT: mov r0, r2
164 %0 = bitcast i32* %x to <64 x i32>*
165 %1 = load <64 x i32>, <64 x i32>* %0, align 4
166 %2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %1)
170 define i32 @addv128i32i32(i32* %x) {
171 ; CHECK-LABEL: addv128i32i32:
172 ; CHECK: @ %bb.0: @ %entry
173 ; CHECK-NEXT: vldrw.u32 q1, [r0]
174 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
175 ; CHECK-NEXT: vaddv.u32 r2, q1
176 ; CHECK-NEXT: vaddva.u32 r2, q0
177 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
178 ; CHECK-NEXT: vaddva.u32 r2, q0
179 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
180 ; CHECK-NEXT: vaddva.u32 r2, q0
181 ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
182 ; CHECK-NEXT: vaddva.u32 r2, q0
183 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
184 ; CHECK-NEXT: vaddva.u32 r2, q0
185 ; CHECK-NEXT: vldrw.u32 q0, [r0, #96]
186 ; CHECK-NEXT: vaddva.u32 r2, q0
187 ; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
188 ; CHECK-NEXT: vaddva.u32 r2, q0
189 ; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
190 ; CHECK-NEXT: vaddva.u32 r2, q0
191 ; CHECK-NEXT: vldrw.u32 q0, [r0, #144]
192 ; CHECK-NEXT: vaddva.u32 r2, q0
193 ; CHECK-NEXT: vldrw.u32 q0, [r0, #160]
194 ; CHECK-NEXT: vaddva.u32 r2, q0
195 ; CHECK-NEXT: vldrw.u32 q0, [r0, #176]
196 ; CHECK-NEXT: vaddva.u32 r2, q0
197 ; CHECK-NEXT: vldrw.u32 q0, [r0, #192]
198 ; CHECK-NEXT: vaddva.u32 r2, q0
199 ; CHECK-NEXT: vldrw.u32 q0, [r0, #208]
200 ; CHECK-NEXT: vaddva.u32 r2, q0
201 ; CHECK-NEXT: vldrw.u32 q0, [r0, #224]
202 ; CHECK-NEXT: vaddva.u32 r2, q0
203 ; CHECK-NEXT: vldrw.u32 q0, [r0, #240]
204 ; CHECK-NEXT: vaddva.u32 r2, q0
205 ; CHECK-NEXT: vldrw.u32 q0, [r0, #256]
206 ; CHECK-NEXT: vaddva.u32 r2, q0
207 ; CHECK-NEXT: vldrw.u32 q0, [r0, #272]
208 ; CHECK-NEXT: vaddva.u32 r2, q0
209 ; CHECK-NEXT: vldrw.u32 q0, [r0, #288]
210 ; CHECK-NEXT: vaddva.u32 r2, q0
211 ; CHECK-NEXT: vldrw.u32 q0, [r0, #304]
212 ; CHECK-NEXT: vaddva.u32 r2, q0
213 ; CHECK-NEXT: vldrw.u32 q0, [r0, #320]
214 ; CHECK-NEXT: vaddva.u32 r2, q0
215 ; CHECK-NEXT: vldrw.u32 q0, [r0, #336]
216 ; CHECK-NEXT: vaddva.u32 r2, q0
217 ; CHECK-NEXT: vldrw.u32 q0, [r0, #352]
218 ; CHECK-NEXT: vaddva.u32 r2, q0
219 ; CHECK-NEXT: vldrw.u32 q0, [r0, #368]
220 ; CHECK-NEXT: vaddva.u32 r2, q0
221 ; CHECK-NEXT: vldrw.u32 q0, [r0, #384]
222 ; CHECK-NEXT: vaddva.u32 r2, q0
223 ; CHECK-NEXT: vldrw.u32 q0, [r0, #400]
224 ; CHECK-NEXT: vaddva.u32 r2, q0
225 ; CHECK-NEXT: vldrw.u32 q0, [r0, #416]
226 ; CHECK-NEXT: vaddva.u32 r2, q0
227 ; CHECK-NEXT: vldrw.u32 q0, [r0, #432]
228 ; CHECK-NEXT: vaddva.u32 r2, q0
229 ; CHECK-NEXT: vldrw.u32 q0, [r0, #448]
230 ; CHECK-NEXT: vaddva.u32 r2, q0
231 ; CHECK-NEXT: vldrw.u32 q0, [r0, #464]
232 ; CHECK-NEXT: vaddva.u32 r2, q0
233 ; CHECK-NEXT: vldrw.u32 q0, [r0, #480]
234 ; CHECK-NEXT: vaddva.u32 r2, q0
235 ; CHECK-NEXT: vldrw.u32 q0, [r0, #496]
236 ; CHECK-NEXT: vaddva.u32 r2, q0
237 ; CHECK-NEXT: mov r0, r2
240 %0 = bitcast i32* %x to <4 x i32>*
241 %wide.load = load <4 x i32>, <4 x i32>* %0, align 4
242 %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
243 %2 = getelementptr inbounds i32, i32* %x, i32 4
244 %3 = bitcast i32* %2 to <4 x i32>*
245 %wide.load.1 = load <4 x i32>, <4 x i32>* %3, align 4
246 %4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.1)
248 %6 = getelementptr inbounds i32, i32* %x, i32 8
249 %7 = bitcast i32* %6 to <4 x i32>*
250 %wide.load.2 = load <4 x i32>, <4 x i32>* %7, align 4
251 %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.2)
253 %10 = getelementptr inbounds i32, i32* %x, i32 12
254 %11 = bitcast i32* %10 to <4 x i32>*
255 %wide.load.3 = load <4 x i32>, <4 x i32>* %11, align 4
256 %12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.3)
257 %13 = add i32 %12, %9
258 %14 = getelementptr inbounds i32, i32* %x, i32 16
259 %15 = bitcast i32* %14 to <4 x i32>*
260 %wide.load.4 = load <4 x i32>, <4 x i32>* %15, align 4
261 %16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.4)
262 %17 = add i32 %16, %13
263 %18 = getelementptr inbounds i32, i32* %x, i32 20
264 %19 = bitcast i32* %18 to <4 x i32>*
265 %wide.load.5 = load <4 x i32>, <4 x i32>* %19, align 4
266 %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.5)
267 %21 = add i32 %20, %17
268 %22 = getelementptr inbounds i32, i32* %x, i32 24
269 %23 = bitcast i32* %22 to <4 x i32>*
270 %wide.load.6 = load <4 x i32>, <4 x i32>* %23, align 4
271 %24 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.6)
272 %25 = add i32 %24, %21
273 %26 = getelementptr inbounds i32, i32* %x, i32 28
274 %27 = bitcast i32* %26 to <4 x i32>*
275 %wide.load.7 = load <4 x i32>, <4 x i32>* %27, align 4
276 %28 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.7)
277 %29 = add i32 %28, %25
278 %30 = getelementptr inbounds i32, i32* %x, i32 32
279 %31 = bitcast i32* %30 to <4 x i32>*
280 %wide.load.8 = load <4 x i32>, <4 x i32>* %31, align 4
281 %32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.8)
282 %33 = add i32 %32, %29
283 %34 = getelementptr inbounds i32, i32* %x, i32 36
284 %35 = bitcast i32* %34 to <4 x i32>*
285 %wide.load.9 = load <4 x i32>, <4 x i32>* %35, align 4
286 %36 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.9)
287 %37 = add i32 %36, %33
288 %38 = getelementptr inbounds i32, i32* %x, i32 40
289 %39 = bitcast i32* %38 to <4 x i32>*
290 %wide.load.10 = load <4 x i32>, <4 x i32>* %39, align 4
291 %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.10)
292 %41 = add i32 %40, %37
293 %42 = getelementptr inbounds i32, i32* %x, i32 44
294 %43 = bitcast i32* %42 to <4 x i32>*
295 %wide.load.11 = load <4 x i32>, <4 x i32>* %43, align 4
296 %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.11)
297 %45 = add i32 %44, %41
298 %46 = getelementptr inbounds i32, i32* %x, i32 48
299 %47 = bitcast i32* %46 to <4 x i32>*
300 %wide.load.12 = load <4 x i32>, <4 x i32>* %47, align 4
301 %48 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.12)
302 %49 = add i32 %48, %45
303 %50 = getelementptr inbounds i32, i32* %x, i32 52
304 %51 = bitcast i32* %50 to <4 x i32>*
305 %wide.load.13 = load <4 x i32>, <4 x i32>* %51, align 4
306 %52 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.13)
307 %53 = add i32 %52, %49
308 %54 = getelementptr inbounds i32, i32* %x, i32 56
309 %55 = bitcast i32* %54 to <4 x i32>*
310 %wide.load.14 = load <4 x i32>, <4 x i32>* %55, align 4
311 %56 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.14)
312 %57 = add i32 %56, %53
313 %58 = getelementptr inbounds i32, i32* %x, i32 60
314 %59 = bitcast i32* %58 to <4 x i32>*
315 %wide.load.15 = load <4 x i32>, <4 x i32>* %59, align 4
316 %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.15)
317 %61 = add i32 %60, %57
318 %62 = getelementptr inbounds i32, i32* %x, i32 64
319 %63 = bitcast i32* %62 to <4 x i32>*
320 %wide.load.16 = load <4 x i32>, <4 x i32>* %63, align 4
321 %64 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.16)
322 %65 = add i32 %64, %61
323 %66 = getelementptr inbounds i32, i32* %x, i32 68
324 %67 = bitcast i32* %66 to <4 x i32>*
325 %wide.load.17 = load <4 x i32>, <4 x i32>* %67, align 4
326 %68 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.17)
327 %69 = add i32 %68, %65
328 %70 = getelementptr inbounds i32, i32* %x, i32 72
329 %71 = bitcast i32* %70 to <4 x i32>*
330 %wide.load.18 = load <4 x i32>, <4 x i32>* %71, align 4
331 %72 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.18)
332 %73 = add i32 %72, %69
333 %74 = getelementptr inbounds i32, i32* %x, i32 76
334 %75 = bitcast i32* %74 to <4 x i32>*
335 %wide.load.19 = load <4 x i32>, <4 x i32>* %75, align 4
336 %76 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.19)
337 %77 = add i32 %76, %73
338 %78 = getelementptr inbounds i32, i32* %x, i32 80
339 %79 = bitcast i32* %78 to <4 x i32>*
340 %wide.load.20 = load <4 x i32>, <4 x i32>* %79, align 4
341 %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.20)
342 %81 = add i32 %80, %77
343 %82 = getelementptr inbounds i32, i32* %x, i32 84
344 %83 = bitcast i32* %82 to <4 x i32>*
345 %wide.load.21 = load <4 x i32>, <4 x i32>* %83, align 4
346 %84 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.21)
347 %85 = add i32 %84, %81
348 %86 = getelementptr inbounds i32, i32* %x, i32 88
349 %87 = bitcast i32* %86 to <4 x i32>*
350 %wide.load.22 = load <4 x i32>, <4 x i32>* %87, align 4
351 %88 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.22)
352 %89 = add i32 %88, %85
353 %90 = getelementptr inbounds i32, i32* %x, i32 92
354 %91 = bitcast i32* %90 to <4 x i32>*
355 %wide.load.23 = load <4 x i32>, <4 x i32>* %91, align 4
356 %92 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.23)
357 %93 = add i32 %92, %89
358 %94 = getelementptr inbounds i32, i32* %x, i32 96
359 %95 = bitcast i32* %94 to <4 x i32>*
360 %wide.load.24 = load <4 x i32>, <4 x i32>* %95, align 4
361 %96 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.24)
362 %97 = add i32 %96, %93
363 %98 = getelementptr inbounds i32, i32* %x, i32 100
364 %99 = bitcast i32* %98 to <4 x i32>*
365 %wide.load.25 = load <4 x i32>, <4 x i32>* %99, align 4
366 %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.25)
367 %101 = add i32 %100, %97
368 %102 = getelementptr inbounds i32, i32* %x, i32 104
369 %103 = bitcast i32* %102 to <4 x i32>*
370 %wide.load.26 = load <4 x i32>, <4 x i32>* %103, align 4
371 %104 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.26)
372 %105 = add i32 %104, %101
373 %106 = getelementptr inbounds i32, i32* %x, i32 108
374 %107 = bitcast i32* %106 to <4 x i32>*
375 %wide.load.27 = load <4 x i32>, <4 x i32>* %107, align 4
376 %108 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.27)
377 %109 = add i32 %108, %105
378 %110 = getelementptr inbounds i32, i32* %x, i32 112
379 %111 = bitcast i32* %110 to <4 x i32>*
380 %wide.load.28 = load <4 x i32>, <4 x i32>* %111, align 4
381 %112 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.28)
382 %113 = add i32 %112, %109
383 %114 = getelementptr inbounds i32, i32* %x, i32 116
384 %115 = bitcast i32* %114 to <4 x i32>*
385 %wide.load.29 = load <4 x i32>, <4 x i32>* %115, align 4
386 %116 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.29)
387 %117 = add i32 %116, %113
388 %118 = getelementptr inbounds i32, i32* %x, i32 120
389 %119 = bitcast i32* %118 to <4 x i32>*
390 %wide.load.30 = load <4 x i32>, <4 x i32>* %119, align 4
391 %120 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.30)
392 %121 = add i32 %120, %117
393 %122 = getelementptr inbounds i32, i32* %x, i32 124
394 %123 = bitcast i32* %122 to <4 x i32>*
395 %wide.load.31 = load <4 x i32>, <4 x i32>* %123, align 4
396 %124 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.31)
397 %125 = add i32 %124, %121
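; Reductions of i16 data sign-extended to an i32 accumulator (vldrh.s32 loads
; or vaddv.s16 feeding the i32 result).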
401 define i32 @addv2i32i16(i16* %x) {
402 ; CHECK-LABEL: addv2i32i16:
403 ; CHECK: @ %bb.0: @ %entry
404 ; CHECK-NEXT: ldrsh.w r1, [r0]
405 ; CHECK-NEXT: ldrsh.w r0, [r0, #2]
406 ; CHECK-NEXT: add r0, r1
409 %0 = load i16, i16* %x, align 2
410 %conv = sext i16 %0 to i32
411 %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1
412 %1 = load i16, i16* %arrayidx.1, align 2
413 %conv.1 = sext i16 %1 to i32
414 %add.1 = add nsw i32 %conv, %conv.1
418 define i32 @addv4i32i16(i16* %x) {
419 ; CHECK-LABEL: addv4i32i16:
420 ; CHECK: @ %bb.0: @ %entry
421 ; CHECK-NEXT: vldrh.s32 q0, [r0]
422 ; CHECK-NEXT: vaddv.u32 r0, q0
425 %0 = bitcast i16* %x to <4 x i16>*
426 %1 = load <4 x i16>, <4 x i16>* %0, align 2
427 %2 = sext <4 x i16> %1 to <4 x i32>
428 %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
432 define i32 @addv8i32i16(i16* %x) {
433 ; CHECK-LABEL: addv8i32i16:
434 ; CHECK: @ %bb.0: @ %entry
435 ; CHECK-NEXT: vldrh.u16 q0, [r0]
436 ; CHECK-NEXT: vaddv.s16 r0, q0
439 %0 = bitcast i16* %x to <8 x i16>*
440 %1 = load <8 x i16>, <8 x i16>* %0, align 2
441 %2 = sext <8 x i16> %1 to <8 x i32>
442 %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
446 define i32 @addv16i32i16(i16* %x) {
447 ; CHECK-LABEL: addv16i32i16:
448 ; CHECK: @ %bb.0: @ %entry
449 ; CHECK-NEXT: vldrh.s32 q1, [r0]
450 ; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
451 ; CHECK-NEXT: vaddv.u32 r2, q1
452 ; CHECK-NEXT: vaddva.u32 r2, q0
453 ; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
454 ; CHECK-NEXT: vaddva.u32 r2, q0
455 ; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
456 ; CHECK-NEXT: vaddva.u32 r2, q0
457 ; CHECK-NEXT: mov r0, r2
460 %0 = bitcast i16* %x to <16 x i16>*
461 %1 = load <16 x i16>, <16 x i16>* %0, align 2
462 %2 = sext <16 x i16> %1 to <16 x i32>
463 %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
467 define i32 @addv24i32i16(i16* %x) {
468 ; CHECK-LABEL: addv24i32i16:
469 ; CHECK: @ %bb.0: @ %entry
470 ; CHECK-NEXT: vldrh.s32 q1, [r0]
471 ; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
472 ; CHECK-NEXT: vaddv.u32 r2, q1
473 ; CHECK-NEXT: vaddva.u32 r2, q0
474 ; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
475 ; CHECK-NEXT: vaddva.u32 r2, q0
476 ; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
477 ; CHECK-NEXT: vaddva.u32 r2, q0
478 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
479 ; CHECK-NEXT: vaddva.s16 r2, q0
480 ; CHECK-NEXT: mov r0, r2
483 %0 = bitcast i16* %x to <16 x i16>*
484 %1 = load <16 x i16>, <16 x i16>* %0, align 2
485 %2 = sext <16 x i16> %1 to <16 x i32>
486 %arrayidx.16 = getelementptr inbounds i16, i16* %x, i32 16
487 %3 = bitcast i16* %arrayidx.16 to <8 x i16>*
488 %4 = load <8 x i16>, <8 x i16>* %3, align 2
489 %5 = sext <8 x i16> %4 to <8 x i32>
490 %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
491 %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
492 %op.rdx = add nsw i32 %6, %7
496 define i32 @addv32i32i16(i16* %x) {
497 ; CHECK-LABEL: addv32i32i16:
498 ; CHECK: @ %bb.0: @ %entry
499 ; CHECK-NEXT: vldrh.s32 q1, [r0]
500 ; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
501 ; CHECK-NEXT: vaddv.u32 r2, q1
502 ; CHECK-NEXT: vaddva.u32 r2, q0
503 ; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
504 ; CHECK-NEXT: vaddva.u32 r2, q0
505 ; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
506 ; CHECK-NEXT: vaddva.u32 r2, q0
507 ; CHECK-NEXT: vldrh.s32 q0, [r0, #32]
508 ; CHECK-NEXT: vaddva.u32 r2, q0
509 ; CHECK-NEXT: vldrh.s32 q0, [r0, #40]
510 ; CHECK-NEXT: vaddva.u32 r2, q0
511 ; CHECK-NEXT: vldrh.s32 q0, [r0, #48]
512 ; CHECK-NEXT: vaddva.u32 r2, q0
513 ; CHECK-NEXT: vldrh.s32 q0, [r0, #56]
514 ; CHECK-NEXT: vaddva.u32 r2, q0
515 ; CHECK-NEXT: mov r0, r2
518 %0 = bitcast i16* %x to <32 x i16>*
519 %1 = load <32 x i16>, <32 x i16>* %0, align 2
520 %2 = sext <32 x i16> %1 to <32 x i32>
521 %3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
525 define i32 @addv64i32i16(i16* %x) {
526 ; CHECK-LABEL: addv64i32i16:
527 ; CHECK: @ %bb.0: @ %entry
528 ; CHECK-NEXT: vldrh.s32 q1, [r0]
529 ; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
530 ; CHECK-NEXT: ldrsh.w r1, [r0, #120]
531 ; CHECK-NEXT: vaddv.u32 r2, q1
532 ; CHECK-NEXT: ldrsh.w r3, [r0, #122]
533 ; CHECK-NEXT: vaddva.u32 r2, q0
534 ; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
535 ; CHECK-NEXT: ldrsh.w r12, [r0, #124]
536 ; CHECK-NEXT: vaddva.u32 r2, q0
537 ; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
538 ; CHECK-NEXT: vaddva.u32 r2, q0
539 ; CHECK-NEXT: vldrh.s32 q0, [r0, #32]
540 ; CHECK-NEXT: vaddva.u32 r2, q0
541 ; CHECK-NEXT: vldrh.s32 q0, [r0, #40]
542 ; CHECK-NEXT: vaddva.u32 r2, q0
543 ; CHECK-NEXT: vldrh.s32 q0, [r0, #48]
544 ; CHECK-NEXT: vaddva.u32 r2, q0
545 ; CHECK-NEXT: vldrh.s32 q0, [r0, #56]
546 ; CHECK-NEXT: vaddva.u32 r2, q0
547 ; CHECK-NEXT: vldrh.s32 q0, [r0, #64]
548 ; CHECK-NEXT: vaddva.u32 r2, q0
549 ; CHECK-NEXT: vldrh.s32 q0, [r0, #72]
550 ; CHECK-NEXT: vaddva.u32 r2, q0
551 ; CHECK-NEXT: vldrh.s32 q0, [r0, #80]
552 ; CHECK-NEXT: vaddva.u32 r2, q0
553 ; CHECK-NEXT: vldrh.s32 q0, [r0, #88]
554 ; CHECK-NEXT: vaddva.u32 r2, q0
555 ; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
556 ; CHECK-NEXT: vaddva.s16 r2, q0
557 ; CHECK-NEXT: vldrh.s32 q0, [r0, #112]
558 ; CHECK-NEXT: ldrsh.w r0, [r0, #126]
559 ; CHECK-NEXT: vaddva.u32 r2, q0
560 ; CHECK-NEXT: add r1, r2
561 ; CHECK-NEXT: add r1, r3
562 ; CHECK-NEXT: add r1, r12
563 ; CHECK-NEXT: add r0, r1
566 %0 = bitcast i16* %x to <32 x i16>*
567 %1 = load <32 x i16>, <32 x i16>* %0, align 2
568 %2 = sext <32 x i16> %1 to <32 x i32>
569 %arrayidx.32 = getelementptr inbounds i16, i16* %x, i32 32
570 %3 = bitcast i16* %arrayidx.32 to <16 x i16>*
571 %4 = load <16 x i16>, <16 x i16>* %3, align 2
572 %5 = sext <16 x i16> %4 to <16 x i32>
573 %arrayidx.48 = getelementptr inbounds i16, i16* %x, i32 48
574 %6 = bitcast i16* %arrayidx.48 to <8 x i16>*
575 %7 = load <8 x i16>, <8 x i16>* %6, align 2
576 %8 = sext <8 x i16> %7 to <8 x i32>
577 %arrayidx.56 = getelementptr inbounds i16, i16* %x, i32 56
578 %9 = bitcast i16* %arrayidx.56 to <4 x i16>*
579 %10 = load <4 x i16>, <4 x i16>* %9, align 2
580 %11 = sext <4 x i16> %10 to <4 x i32>
581 %arrayidx.60 = getelementptr inbounds i16, i16* %x, i32 60
582 %12 = load i16, i16* %arrayidx.60, align 2
583 %conv.60 = sext i16 %12 to i32
584 %arrayidx.61 = getelementptr inbounds i16, i16* %x, i32 61
585 %13 = load i16, i16* %arrayidx.61, align 2
586 %conv.61 = sext i16 %13 to i32
587 %arrayidx.62 = getelementptr inbounds i16, i16* %x, i32 62
588 %14 = load i16, i16* %arrayidx.62, align 2
589 %conv.62 = sext i16 %14 to i32
590 %15 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
591 %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
592 %op.rdx = add nsw i32 %15, %16
593 %17 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
594 %op.rdx8 = add nsw i32 %op.rdx, %17
595 %18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %11)
596 %op.rdx9 = add nsw i32 %op.rdx8, %18
597 %19 = add nsw i32 %op.rdx9, %conv.60
598 %20 = add nsw i32 %19, %conv.61
599 %21 = add nsw i32 %20, %conv.62
600 %arrayidx.63 = getelementptr inbounds i16, i16* %x, i32 63
601 %22 = load i16, i16* %arrayidx.63, align 2
602 %conv.63 = sext i16 %22 to i32
603 %add.63 = add nsw i32 %21, %conv.63
607 define i32 @addv128i32i16(i16* %x) {
608 ; CHECK-LABEL: addv128i32i16:
609 ; CHECK: @ %bb.0: @ %entry
610 ; CHECK-NEXT: vldrh.u16 q1, [r0]
611 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
612 ; CHECK-NEXT: vaddv.s16 r2, q1
613 ; CHECK-NEXT: vaddva.s16 r2, q0
614 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
615 ; CHECK-NEXT: vaddva.s16 r2, q0
616 ; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
617 ; CHECK-NEXT: vaddva.s16 r2, q0
618 ; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
619 ; CHECK-NEXT: vaddva.s16 r2, q0
620 ; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
621 ; CHECK-NEXT: vaddva.s16 r2, q0
622 ; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
623 ; CHECK-NEXT: vaddva.s16 r2, q0
624 ; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
625 ; CHECK-NEXT: vaddva.s16 r2, q0
626 ; CHECK-NEXT: vldrh.u16 q0, [r0, #128]
627 ; CHECK-NEXT: vaddva.s16 r2, q0
628 ; CHECK-NEXT: vldrh.u16 q0, [r0, #144]
629 ; CHECK-NEXT: vaddva.s16 r2, q0
630 ; CHECK-NEXT: vldrh.u16 q0, [r0, #160]
631 ; CHECK-NEXT: vaddva.s16 r2, q0
632 ; CHECK-NEXT: vldrh.u16 q0, [r0, #176]
633 ; CHECK-NEXT: vaddva.s16 r2, q0
634 ; CHECK-NEXT: vldrh.u16 q0, [r0, #192]
635 ; CHECK-NEXT: vaddva.s16 r2, q0
636 ; CHECK-NEXT: vldrh.u16 q0, [r0, #208]
637 ; CHECK-NEXT: vaddva.s16 r2, q0
638 ; CHECK-NEXT: vldrh.u16 q0, [r0, #224]
639 ; CHECK-NEXT: vaddva.s16 r2, q0
640 ; CHECK-NEXT: vldrh.u16 q0, [r0, #240]
641 ; CHECK-NEXT: vaddva.s16 r2, q0
642 ; CHECK-NEXT: mov r0, r2
645 %0 = bitcast i16* %x to <8 x i16>*
646 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
647 %1 = sext <8 x i16> %wide.load to <8 x i32>
648 %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
649 %3 = getelementptr inbounds i16, i16* %x, i32 8
650 %4 = bitcast i16* %3 to <8 x i16>*
651 %wide.load.1 = load <8 x i16>, <8 x i16>* %4, align 2
652 %5 = sext <8 x i16> %wide.load.1 to <8 x i32>
653 %6 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
655 %8 = getelementptr inbounds i16, i16* %x, i32 16
656 %9 = bitcast i16* %8 to <8 x i16>*
657 %wide.load.2 = load <8 x i16>, <8 x i16>* %9, align 2
658 %10 = sext <8 x i16> %wide.load.2 to <8 x i32>
659 %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %10)
660 %12 = add i32 %11, %7
661 %13 = getelementptr inbounds i16, i16* %x, i32 24
662 %14 = bitcast i16* %13 to <8 x i16>*
663 %wide.load.3 = load <8 x i16>, <8 x i16>* %14, align 2
664 %15 = sext <8 x i16> %wide.load.3 to <8 x i32>
665 %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
666 %17 = add i32 %16, %12
667 %18 = getelementptr inbounds i16, i16* %x, i32 32
668 %19 = bitcast i16* %18 to <8 x i16>*
669 %wide.load.4 = load <8 x i16>, <8 x i16>* %19, align 2
670 %20 = sext <8 x i16> %wide.load.4 to <8 x i32>
671 %21 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %20)
672 %22 = add i32 %21, %17
673 %23 = getelementptr inbounds i16, i16* %x, i32 40
674 %24 = bitcast i16* %23 to <8 x i16>*
675 %wide.load.5 = load <8 x i16>, <8 x i16>* %24, align 2
676 %25 = sext <8 x i16> %wide.load.5 to <8 x i32>
677 %26 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %25)
678 %27 = add i32 %26, %22
679 %28 = getelementptr inbounds i16, i16* %x, i32 48
680 %29 = bitcast i16* %28 to <8 x i16>*
681 %wide.load.6 = load <8 x i16>, <8 x i16>* %29, align 2
682 %30 = sext <8 x i16> %wide.load.6 to <8 x i32>
683 %31 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %30)
684 %32 = add i32 %31, %27
685 %33 = getelementptr inbounds i16, i16* %x, i32 56
686 %34 = bitcast i16* %33 to <8 x i16>*
687 %wide.load.7 = load <8 x i16>, <8 x i16>* %34, align 2
688 %35 = sext <8 x i16> %wide.load.7 to <8 x i32>
689 %36 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %35)
690 %37 = add i32 %36, %32
691 %38 = getelementptr inbounds i16, i16* %x, i32 64
692 %39 = bitcast i16* %38 to <8 x i16>*
693 %wide.load.8 = load <8 x i16>, <8 x i16>* %39, align 2
694 %40 = sext <8 x i16> %wide.load.8 to <8 x i32>
695 %41 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40)
696 %42 = add i32 %41, %37
697 %43 = getelementptr inbounds i16, i16* %x, i32 72
698 %44 = bitcast i16* %43 to <8 x i16>*
699 %wide.load.9 = load <8 x i16>, <8 x i16>* %44, align 2
700 %45 = sext <8 x i16> %wide.load.9 to <8 x i32>
701 %46 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %45)
702 %47 = add i32 %46, %42
703 %48 = getelementptr inbounds i16, i16* %x, i32 80
704 %49 = bitcast i16* %48 to <8 x i16>*
705 %wide.load.10 = load <8 x i16>, <8 x i16>* %49, align 2
706 %50 = sext <8 x i16> %wide.load.10 to <8 x i32>
707 %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50)
708 %52 = add i32 %51, %47
709 %53 = getelementptr inbounds i16, i16* %x, i32 88
710 %54 = bitcast i16* %53 to <8 x i16>*
711 %wide.load.11 = load <8 x i16>, <8 x i16>* %54, align 2
712 %55 = sext <8 x i16> %wide.load.11 to <8 x i32>
713 %56 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %55)
714 %57 = add i32 %56, %52
715 %58 = getelementptr inbounds i16, i16* %x, i32 96
716 %59 = bitcast i16* %58 to <8 x i16>*
717 %wide.load.12 = load <8 x i16>, <8 x i16>* %59, align 2
718 %60 = sext <8 x i16> %wide.load.12 to <8 x i32>
719 %61 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %60)
720 %62 = add i32 %61, %57
721 %63 = getelementptr inbounds i16, i16* %x, i32 104
722 %64 = bitcast i16* %63 to <8 x i16>*
723 %wide.load.13 = load <8 x i16>, <8 x i16>* %64, align 2
724 %65 = sext <8 x i16> %wide.load.13 to <8 x i32>
725 %66 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %65)
726 %67 = add i32 %66, %62
727 %68 = getelementptr inbounds i16, i16* %x, i32 112
728 %69 = bitcast i16* %68 to <8 x i16>*
729 %wide.load.14 = load <8 x i16>, <8 x i16>* %69, align 2
730 %70 = sext <8 x i16> %wide.load.14 to <8 x i32>
731 %71 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %70)
732 %72 = add i32 %71, %67
733 %73 = getelementptr inbounds i16, i16* %x, i32 120
734 %74 = bitcast i16* %73 to <8 x i16>*
735 %wide.load.15 = load <8 x i16>, <8 x i16>* %74, align 2
736 %75 = sext <8 x i16> %wide.load.15 to <8 x i32>
737 %76 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %75)
738 %77 = add i32 %76, %72
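; Reductions of i8 data zero-extended to an i32 accumulator (vldrb.u32/u16/u8
; loads with vaddv/vaddva).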
742 define i32 @addv2i32i8(i8* %x) {
743 ; CHECK-LABEL: addv2i32i8:
744 ; CHECK: @ %bb.0: @ %entry
745 ; CHECK-NEXT: ldrb r1, [r0]
746 ; CHECK-NEXT: ldrb r0, [r0, #1]
747 ; CHECK-NEXT: add r0, r1
750 %0 = load i8, i8* %x, align 1
751 %conv = zext i8 %0 to i32
752 %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1
753 %1 = load i8, i8* %arrayidx.1, align 1
754 %conv.1 = zext i8 %1 to i32
755 %add.1 = add nuw nsw i32 %conv, %conv.1
759 define i32 @addv4i32i8(i8* %x) {
760 ; CHECK-LABEL: addv4i32i8:
761 ; CHECK: @ %bb.0: @ %entry
762 ; CHECK-NEXT: vldrb.u32 q0, [r0]
763 ; CHECK-NEXT: vaddv.u32 r0, q0
766 %0 = bitcast i8* %x to <4 x i8>*
767 %1 = load <4 x i8>, <4 x i8>* %0, align 1
768 %2 = zext <4 x i8> %1 to <4 x i32>
769 %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
773 define i32 @addv8i32i8(i8* %x) {
774 ; CHECK-LABEL: addv8i32i8:
775 ; CHECK: @ %bb.0: @ %entry
776 ; CHECK-NEXT: vldrb.u16 q0, [r0]
777 ; CHECK-NEXT: vaddv.u16 r0, q0
780 %0 = bitcast i8* %x to <8 x i8>*
781 %1 = load <8 x i8>, <8 x i8>* %0, align 1
782 %2 = zext <8 x i8> %1 to <8 x i32>
783 %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
787 define i32 @addv16i32i8(i8* %x) {
788 ; CHECK-LABEL: addv16i32i8:
789 ; CHECK: @ %bb.0: @ %entry
790 ; CHECK-NEXT: vldrb.u8 q0, [r0]
791 ; CHECK-NEXT: vaddv.u8 r0, q0
794 %0 = bitcast i8* %x to <16 x i8>*
795 %1 = load <16 x i8>, <16 x i8>* %0, align 1
796 %2 = zext <16 x i8> %1 to <16 x i32>
797 %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
801 define i32 @addv24i32i8(i8* %x) {
802 ; CHECK-LABEL: addv24i32i8:
803 ; CHECK: @ %bb.0: @ %entry
804 ; CHECK-NEXT: vldrb.u8 q1, [r0]
805 ; CHECK-NEXT: vldrb.u16 q0, [r0, #16]
806 ; CHECK-NEXT: vaddv.u8 r0, q1
807 ; CHECK-NEXT: vaddva.u16 r0, q0
810 %0 = bitcast i8* %x to <16 x i8>*
811 %1 = load <16 x i8>, <16 x i8>* %0, align 1
812 %2 = zext <16 x i8> %1 to <16 x i32>
813 %arrayidx.16 = getelementptr inbounds i8, i8* %x, i32 16
814 %3 = bitcast i8* %arrayidx.16 to <8 x i8>*
815 %4 = load <8 x i8>, <8 x i8>* %3, align 1
816 %5 = zext <8 x i8> %4 to <8 x i32>
817 %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
818 %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
819 %op.rdx = add nuw nsw i32 %6, %7
823 define i32 @addv32i32i8(i8* %x) {
824 ; CHECK-LABEL: addv32i32i8:
825 ; CHECK: @ %bb.0: @ %entry
826 ; CHECK-NEXT: vldrb.u32 q1, [r0]
827 ; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
828 ; CHECK-NEXT: vaddv.u32 r2, q1
829 ; CHECK-NEXT: vaddva.u32 r2, q0
830 ; CHECK-NEXT: vldrb.u32 q0, [r0, #8]
831 ; CHECK-NEXT: vaddva.u32 r2, q0
832 ; CHECK-NEXT: vldrb.u32 q0, [r0, #12]
833 ; CHECK-NEXT: vaddva.u32 r2, q0
834 ; CHECK-NEXT: vldrb.u32 q0, [r0, #16]
835 ; CHECK-NEXT: vaddva.u32 r2, q0
836 ; CHECK-NEXT: vldrb.u32 q0, [r0, #20]
837 ; CHECK-NEXT: vaddva.u32 r2, q0
838 ; CHECK-NEXT: vldrb.u32 q0, [r0, #24]
839 ; CHECK-NEXT: vaddva.u32 r2, q0
840 ; CHECK-NEXT: vldrb.u32 q0, [r0, #28]
841 ; CHECK-NEXT: vaddva.u32 r2, q0
842 ; CHECK-NEXT: mov r0, r2
845 %0 = bitcast i8* %x to <32 x i8>*
846 %1 = load <32 x i8>, <32 x i8>* %0, align 1
847 %2 = zext <32 x i8> %1 to <32 x i32>
848 %3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
852 define i32 @addv64i32i8(i8* %x) {
853 ; CHECK-LABEL: addv64i32i8:
854 ; CHECK: @ %bb.0: @ %entry
855 ; CHECK-NEXT: vldrb.u32 q1, [r0]
856 ; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
857 ; CHECK-NEXT: ldrb.w r1, [r0, #60]
858 ; CHECK-NEXT: vaddv.u32 r2, q1
859 ; CHECK-NEXT: ldrb.w r3, [r0, #61]
860 ; CHECK-NEXT: vaddva.u32 r2, q0
861 ; CHECK-NEXT: vldrb.u32 q0, [r0, #8]
862 ; CHECK-NEXT: ldrb.w r12, [r0, #62]
863 ; CHECK-NEXT: vaddva.u32 r2, q0
864 ; CHECK-NEXT: vldrb.u32 q0, [r0, #12]
865 ; CHECK-NEXT: vaddva.u32 r2, q0
866 ; CHECK-NEXT: vldrb.u32 q0, [r0, #16]
867 ; CHECK-NEXT: vaddva.u32 r2, q0
868 ; CHECK-NEXT: vldrb.u32 q0, [r0, #20]
869 ; CHECK-NEXT: vaddva.u32 r2, q0
870 ; CHECK-NEXT: vldrb.u32 q0, [r0, #24]
871 ; CHECK-NEXT: vaddva.u32 r2, q0
872 ; CHECK-NEXT: vldrb.u32 q0, [r0, #28]
873 ; CHECK-NEXT: vaddva.u32 r2, q0
874 ; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
875 ; CHECK-NEXT: vaddva.u8 r2, q0
876 ; CHECK-NEXT: vldrb.u16 q0, [r0, #48]
877 ; CHECK-NEXT: vaddva.u16 r2, q0
878 ; CHECK-NEXT: vldrb.u32 q0, [r0, #56]
879 ; CHECK-NEXT: ldrb.w r0, [r0, #63]
880 ; CHECK-NEXT: vaddva.u32 r2, q0
881 ; CHECK-NEXT: add r1, r2
882 ; CHECK-NEXT: add r1, r3
883 ; CHECK-NEXT: add r1, r12
884 ; CHECK-NEXT: add r0, r1
887 %0 = bitcast i8* %x to <32 x i8>*
888 %1 = load <32 x i8>, <32 x i8>* %0, align 1
889 %2 = zext <32 x i8> %1 to <32 x i32>
890 %arrayidx.32 = getelementptr inbounds i8, i8* %x, i32 32
891 %3 = bitcast i8* %arrayidx.32 to <16 x i8>*
892 %4 = load <16 x i8>, <16 x i8>* %3, align 1
893 %5 = zext <16 x i8> %4 to <16 x i32>
894 %arrayidx.48 = getelementptr inbounds i8, i8* %x, i32 48
895 %6 = bitcast i8* %arrayidx.48 to <8 x i8>*
896 %7 = load <8 x i8>, <8 x i8>* %6, align 1
897 %8 = zext <8 x i8> %7 to <8 x i32>
898 %arrayidx.56 = getelementptr inbounds i8, i8* %x, i32 56
899 %9 = bitcast i8* %arrayidx.56 to <4 x i8>*
900 %10 = load <4 x i8>, <4 x i8>* %9, align 1
901 %11 = zext <4 x i8> %10 to <4 x i32>
902 %arrayidx.60 = getelementptr inbounds i8, i8* %x, i32 60
903 %12 = load i8, i8* %arrayidx.60, align 1
904 %conv.60 = zext i8 %12 to i32
905 %arrayidx.61 = getelementptr inbounds i8, i8* %x, i32 61
906 %13 = load i8, i8* %arrayidx.61, align 1
907 %conv.61 = zext i8 %13 to i32
908 %arrayidx.62 = getelementptr inbounds i8, i8* %x, i32 62
909 %14 = load i8, i8* %arrayidx.62, align 1
910 %conv.62 = zext i8 %14 to i32
911 %15 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
912 %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
913 %op.rdx = add nuw nsw i32 %15, %16
914 %17 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
915 %op.rdx8 = add nuw nsw i32 %op.rdx, %17
916 %18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %11)
917 %op.rdx9 = add nuw nsw i32 %op.rdx8, %18
918 %19 = add nuw nsw i32 %op.rdx9, %conv.60
919 %20 = add nuw nsw i32 %19, %conv.61
920 %21 = add nuw nsw i32 %20, %conv.62
921 %arrayidx.63 = getelementptr inbounds i8, i8* %x, i32 63
922 %22 = load i8, i8* %arrayidx.63, align 1
923 %conv.63 = zext i8 %22 to i32
924 %add.63 = add nuw nsw i32 %21, %conv.63
928 define i32 @addv128i32i8(i8* %x) {
929 ; CHECK-LABEL: addv128i32i8:
930 ; CHECK: @ %bb.0: @ %entry
931 ; CHECK-NEXT: vldrb.u8 q1, [r0]
932 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
933 ; CHECK-NEXT: mov r1, r0
934 ; CHECK-NEXT: vaddv.u8 r0, q1
935 ; CHECK-NEXT: vaddva.u8 r0, q0
936 ; CHECK-NEXT: vldrb.u8 q0, [r1, #32]
937 ; CHECK-NEXT: vaddva.u8 r0, q0
938 ; CHECK-NEXT: vldrb.u8 q0, [r1, #48]
939 ; CHECK-NEXT: vaddva.u8 r0, q0
940 ; CHECK-NEXT: vldrb.u8 q0, [r1, #64]
941 ; CHECK-NEXT: vaddva.u8 r0, q0
942 ; CHECK-NEXT: vldrb.u8 q0, [r1, #80]
943 ; CHECK-NEXT: vaddva.u8 r0, q0
944 ; CHECK-NEXT: vldrb.u8 q0, [r1, #96]
945 ; CHECK-NEXT: vaddva.u8 r0, q0
946 ; CHECK-NEXT: vldrb.u8 q0, [r1, #112]
947 ; CHECK-NEXT: vaddva.u8 r0, q0
950 %0 = bitcast i8* %x to <16 x i8>*
951 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
952 %1 = zext <16 x i8> %wide.load to <16 x i32>
953 %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
954 %3 = getelementptr inbounds i8, i8* %x, i32 16
955 %4 = bitcast i8* %3 to <16 x i8>*
956 %wide.load.1 = load <16 x i8>, <16 x i8>* %4, align 1
957 %5 = zext <16 x i8> %wide.load.1 to <16 x i32>
958 %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
960 %8 = getelementptr inbounds i8, i8* %x, i32 32
961 %9 = bitcast i8* %8 to <16 x i8>*
962 %wide.load.2 = load <16 x i8>, <16 x i8>* %9, align 1
963 %10 = zext <16 x i8> %wide.load.2 to <16 x i32>
964 %11 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %10)
965 %12 = add i32 %11, %7
966 %13 = getelementptr inbounds i8, i8* %x, i32 48
967 %14 = bitcast i8* %13 to <16 x i8>*
968 %wide.load.3 = load <16 x i8>, <16 x i8>* %14, align 1
969 %15 = zext <16 x i8> %wide.load.3 to <16 x i32>
970 %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
971 %17 = add i32 %16, %12
972 %18 = getelementptr inbounds i8, i8* %x, i32 64
973 %19 = bitcast i8* %18 to <16 x i8>*
974 %wide.load.4 = load <16 x i8>, <16 x i8>* %19, align 1
975 %20 = zext <16 x i8> %wide.load.4 to <16 x i32>
976 %21 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %20)
977 %22 = add i32 %21, %17
978 %23 = getelementptr inbounds i8, i8* %x, i32 80
979 %24 = bitcast i8* %23 to <16 x i8>*
980 %wide.load.5 = load <16 x i8>, <16 x i8>* %24, align 1
981 %25 = zext <16 x i8> %wide.load.5 to <16 x i32>
982 %26 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %25)
983 %27 = add i32 %26, %22
984 %28 = getelementptr inbounds i8, i8* %x, i32 96
985 %29 = bitcast i8* %28 to <16 x i8>*
986 %wide.load.6 = load <16 x i8>, <16 x i8>* %29, align 1
987 %30 = zext <16 x i8> %wide.load.6 to <16 x i32>
988 %31 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %30)
989 %32 = add i32 %31, %27
990 %33 = getelementptr inbounds i8, i8* %x, i32 112
991 %34 = bitcast i8* %33 to <16 x i8>*
992 %wide.load.7 = load <16 x i8>, <16 x i8>* %34, align 1
993 %35 = zext <16 x i8> %wide.load.7 to <16 x i32>
994 %36 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %35)
995 %37 = add i32 %36, %32
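; Pure i16 reductions: the accumulator and result are i16, sign-extended into
; the return register with sxth.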
999 define signext i16 @addv2i16i16(i16* %x) {
1000 ; CHECK-LABEL: addv2i16i16:
1001 ; CHECK: @ %bb.0: @ %entry
1002 ; CHECK-NEXT: ldrh r1, [r0]
1003 ; CHECK-NEXT: ldrh r0, [r0, #2]
1004 ; CHECK-NEXT: add r0, r1
1005 ; CHECK-NEXT: sxth r0, r0
1008 %0 = load i16, i16* %x, align 2
1009 %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1
1010 %1 = load i16, i16* %arrayidx.1, align 2
1011 %add.1 = add i16 %1, %0
1015 define signext i16 @addv4i16i16(i16* %x) {
1016 ; CHECK-LABEL: addv4i16i16:
1017 ; CHECK: @ %bb.0: @ %entry
1018 ; CHECK-NEXT: vldrh.u32 q0, [r0]
1019 ; CHECK-NEXT: vaddv.u32 r0, q0
1020 ; CHECK-NEXT: sxth r0, r0
1023 %0 = bitcast i16* %x to <4 x i16>*
1024 %1 = load <4 x i16>, <4 x i16>* %0, align 2
1025 %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1)
1029 define signext i16 @addv8i16i16(i16* %x) {
1030 ; CHECK-LABEL: addv8i16i16:
1031 ; CHECK: @ %bb.0: @ %entry
1032 ; CHECK-NEXT: vldrh.u16 q0, [r0]
1033 ; CHECK-NEXT: vaddv.u16 r0, q0
1034 ; CHECK-NEXT: sxth r0, r0
1037 %0 = bitcast i16* %x to <8 x i16>*
1038 %1 = load <8 x i16>, <8 x i16>* %0, align 2
1039 %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1)
1043 define signext i16 @addv16i16i16(i16* %x) {
1044 ; CHECK-LABEL: addv16i16i16:
1045 ; CHECK: @ %bb.0: @ %entry
1046 ; CHECK-NEXT: vldrh.u16 q1, [r0]
1047 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
1048 ; CHECK-NEXT: vaddv.u16 r0, q1
1049 ; CHECK-NEXT: vaddva.u16 r0, q0
1050 ; CHECK-NEXT: sxth r0, r0
1053 %0 = bitcast i16* %x to <16 x i16>*
1054 %1 = load <16 x i16>, <16 x i16>* %0, align 2
1055 %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
1059 define signext i16 @addv24i16i16(i16* %x) {
1060 ; CHECK-LABEL: addv24i16i16:
1061 ; CHECK: @ %bb.0: @ %entry
1062 ; CHECK-NEXT: vldrh.u16 q1, [r0]
1063 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
1064 ; CHECK-NEXT: vaddv.u16 r2, q1
1065 ; CHECK-NEXT: vaddva.u16 r2, q0
1066 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
1067 ; CHECK-NEXT: vaddva.u16 r2, q0
1068 ; CHECK-NEXT: sxth r0, r2
1071 %0 = bitcast i16* %x to <8 x i16>*
1072 %1 = load <8 x i16>, <8 x i16>* %0, align 2
1073 %arrayidx.8 = getelementptr inbounds i16, i16* %x, i32 8
1074 %2 = bitcast i16* %arrayidx.8 to <16 x i16>*
1075 %3 = load <16 x i16>, <16 x i16>* %2, align 2
1076 %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3)
1077 %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1)
1078 %op.rdx = add i16 %4, %5
1082 define signext i16 @addv32i16i16(i16* %x) {
1083 ; CHECK-LABEL: addv32i16i16:
1084 ; CHECK: @ %bb.0: @ %entry
1085 ; CHECK-NEXT: vldrh.u16 q1, [r0]
1086 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
1087 ; CHECK-NEXT: vaddv.u16 r2, q1
1088 ; CHECK-NEXT: vaddva.u16 r2, q0
1089 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
1090 ; CHECK-NEXT: vaddva.u16 r2, q0
1091 ; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
1092 ; CHECK-NEXT: vaddva.u16 r2, q0
1093 ; CHECK-NEXT: sxth r0, r2
1096 %0 = bitcast i16* %x to <32 x i16>*
1097 %1 = load <32 x i16>, <32 x i16>* %0, align 2
1098 %2 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %1)
1102 define signext i16 @addv64i16i16(i16* %x) {
1103 ; CHECK-LABEL: addv64i16i16:
1104 ; CHECK: @ %bb.0: @ %entry
1105 ; CHECK-NEXT: vldrh.u16 q1, [r0]
1106 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
1107 ; CHECK-NEXT: vaddv.u16 r2, q1
1108 ; CHECK-NEXT: vaddva.u16 r2, q0
1109 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
1110 ; CHECK-NEXT: vaddva.u16 r2, q0
1111 ; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
1112 ; CHECK-NEXT: vaddva.u16 r2, q0
1113 ; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
1114 ; CHECK-NEXT: vaddva.u16 r2, q0
1115 ; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
1116 ; CHECK-NEXT: vaddva.u16 r2, q0
1117 ; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
1118 ; CHECK-NEXT: vaddva.u16 r2, q0
1119 ; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
1120 ; CHECK-NEXT: vaddva.u16 r2, q0
1121 ; CHECK-NEXT: sxth r0, r2
1124 %0 = bitcast i16* %x to <64 x i16>*
1125 %1 = load <64 x i16>, <64 x i16>* %0, align 2
1126 %2 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %1)
1130 define signext i16 @addv128i16i16(i16* %x) {
1131 ; CHECK-LABEL: addv128i16i16:
1132 ; CHECK: @ %bb.0: @ %entry
1133 ; CHECK-NEXT: vldrh.u16 q1, [r0]
1134 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
1135 ; CHECK-NEXT: vaddv.u16 r2, q1
1136 ; CHECK-NEXT: vaddva.u16 r2, q0
1137 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
1138 ; CHECK-NEXT: vaddva.u16 r2, q0
1139 ; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
1140 ; CHECK-NEXT: vaddva.u16 r2, q0
1141 ; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
1142 ; CHECK-NEXT: vaddva.u16 r2, q0
1143 ; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
1144 ; CHECK-NEXT: vaddva.u16 r2, q0
1145 ; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
1146 ; CHECK-NEXT: vaddva.u16 r2, q0
1147 ; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
1148 ; CHECK-NEXT: vaddva.u16 r2, q0
1149 ; CHECK-NEXT: vldrh.u16 q0, [r0, #128]
1150 ; CHECK-NEXT: vaddva.u16 r2, q0
1151 ; CHECK-NEXT: vldrh.u16 q0, [r0, #144]
1152 ; CHECK-NEXT: vaddva.u16 r2, q0
1153 ; CHECK-NEXT: vldrh.u16 q0, [r0, #160]
1154 ; CHECK-NEXT: vaddva.u16 r2, q0
1155 ; CHECK-NEXT: vldrh.u16 q0, [r0, #176]
1156 ; CHECK-NEXT: vaddva.u16 r2, q0
1157 ; CHECK-NEXT: vldrh.u16 q0, [r0, #192]
1158 ; CHECK-NEXT: vaddva.u16 r2, q0
1159 ; CHECK-NEXT: vldrh.u16 q0, [r0, #208]
1160 ; CHECK-NEXT: vaddva.u16 r2, q0
1161 ; CHECK-NEXT: vldrh.u16 q0, [r0, #224]
1162 ; CHECK-NEXT: vaddva.u16 r2, q0
1163 ; CHECK-NEXT: vldrh.u16 q0, [r0, #240]
1164 ; CHECK-NEXT: vaddva.u16 r2, q0
1165 ; CHECK-NEXT: sxth r0, r2
1168 %0 = bitcast i16* %x to <8 x i16>*
1169 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
1170 %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load)
1171 %2 = getelementptr inbounds i16, i16* %x, i32 8
1172 %3 = bitcast i16* %2 to <8 x i16>*
1173 %wide.load.1 = load <8 x i16>, <8 x i16>* %3, align 2
1174 %4 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.1)
1176 %6 = getelementptr inbounds i16, i16* %x, i32 16
1177 %7 = bitcast i16* %6 to <8 x i16>*
1178 %wide.load.2 = load <8 x i16>, <8 x i16>* %7, align 2
1179 %8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.2)
1181 %10 = getelementptr inbounds i16, i16* %x, i32 24
1182 %11 = bitcast i16* %10 to <8 x i16>*
1183 %wide.load.3 = load <8 x i16>, <8 x i16>* %11, align 2
1184 %12 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.3)
1185 %13 = add i16 %12, %9
1186 %14 = getelementptr inbounds i16, i16* %x, i32 32
1187 %15 = bitcast i16* %14 to <8 x i16>*
1188 %wide.load.4 = load <8 x i16>, <8 x i16>* %15, align 2
1189 %16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.4)
1190 %17 = add i16 %16, %13
1191 %18 = getelementptr inbounds i16, i16* %x, i32 40
1192 %19 = bitcast i16* %18 to <8 x i16>*
1193 %wide.load.5 = load <8 x i16>, <8 x i16>* %19, align 2
1194 %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.5)
1195 %21 = add i16 %20, %17
1196 %22 = getelementptr inbounds i16, i16* %x, i32 48
1197 %23 = bitcast i16* %22 to <8 x i16>*
1198 %wide.load.6 = load <8 x i16>, <8 x i16>* %23, align 2
1199 %24 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.6)
1200 %25 = add i16 %24, %21
1201 %26 = getelementptr inbounds i16, i16* %x, i32 56
1202 %27 = bitcast i16* %26 to <8 x i16>*
1203 %wide.load.7 = load <8 x i16>, <8 x i16>* %27, align 2
1204 %28 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.7)
1205 %29 = add i16 %28, %25
1206 %30 = getelementptr inbounds i16, i16* %x, i32 64
1207 %31 = bitcast i16* %30 to <8 x i16>*
1208 %wide.load.8 = load <8 x i16>, <8 x i16>* %31, align 2
1209 %32 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.8)
1210 %33 = add i16 %32, %29
1211 %34 = getelementptr inbounds i16, i16* %x, i32 72
1212 %35 = bitcast i16* %34 to <8 x i16>*
1213 %wide.load.9 = load <8 x i16>, <8 x i16>* %35, align 2
1214 %36 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.9)
1215 %37 = add i16 %36, %33
1216 %38 = getelementptr inbounds i16, i16* %x, i32 80
1217 %39 = bitcast i16* %38 to <8 x i16>*
1218 %wide.load.10 = load <8 x i16>, <8 x i16>* %39, align 2
1219 %40 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.10)
1220 %41 = add i16 %40, %37
1221 %42 = getelementptr inbounds i16, i16* %x, i32 88
1222 %43 = bitcast i16* %42 to <8 x i16>*
1223 %wide.load.11 = load <8 x i16>, <8 x i16>* %43, align 2
1224 %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.11)
1225 %45 = add i16 %44, %41
1226 %46 = getelementptr inbounds i16, i16* %x, i32 96
1227 %47 = bitcast i16* %46 to <8 x i16>*
1228 %wide.load.12 = load <8 x i16>, <8 x i16>* %47, align 2
1229 %48 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.12)
1230 %49 = add i16 %48, %45
1231 %50 = getelementptr inbounds i16, i16* %x, i32 104
1232 %51 = bitcast i16* %50 to <8 x i16>*
1233 %wide.load.13 = load <8 x i16>, <8 x i16>* %51, align 2
1234 %52 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.13)
1235 %53 = add i16 %52, %49
1236 %54 = getelementptr inbounds i16, i16* %x, i32 112
1237 %55 = bitcast i16* %54 to <8 x i16>*
1238 %wide.load.14 = load <8 x i16>, <8 x i16>* %55, align 2
1239 %56 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.14)
1240 %57 = add i16 %56, %53
1241 %58 = getelementptr inbounds i16, i16* %x, i32 120
1242 %59 = bitcast i16* %58 to <8 x i16>*
1243 %wide.load.15 = load <8 x i16>, <8 x i16>* %59, align 2
1244 %60 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.15)
1245 %61 = add i16 %60, %57
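; Pure i8 reductions: the accumulator and result are i8, zero-extended into
; the return register with uxtb.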
1249 define zeroext i8 @addv2i8i8(i8* %x) {
1250 ; CHECK-LABEL: addv2i8i8:
1251 ; CHECK: @ %bb.0: @ %entry
1252 ; CHECK-NEXT: ldrb r1, [r0]
1253 ; CHECK-NEXT: ldrb r0, [r0, #1]
1254 ; CHECK-NEXT: add r0, r1
1255 ; CHECK-NEXT: uxtb r0, r0
1258 %0 = load i8, i8* %x, align 1
1259 %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1
1260 %1 = load i8, i8* %arrayidx.1, align 1
1261 %add.1 = add i8 %1, %0
1265 define zeroext i8 @addv4i8i8(i8* %x) {
1266 ; CHECK-LABEL: addv4i8i8:
1267 ; CHECK: @ %bb.0: @ %entry
1268 ; CHECK-NEXT: vldrb.u32 q0, [r0]
1269 ; CHECK-NEXT: vaddv.u32 r0, q0
1270 ; CHECK-NEXT: uxtb r0, r0
1273 %0 = bitcast i8* %x to <4 x i8>*
1274 %1 = load <4 x i8>, <4 x i8>* %0, align 1
1275 %2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %1)
1279 define zeroext i8 @addv8i8i8(i8* %x) {
1280 ; CHECK-LABEL: addv8i8i8:
1281 ; CHECK: @ %bb.0: @ %entry
1282 ; CHECK-NEXT: vldrb.u16 q0, [r0]
1283 ; CHECK-NEXT: vaddv.u16 r0, q0
1284 ; CHECK-NEXT: uxtb r0, r0
1287 %0 = bitcast i8* %x to <8 x i8>*
1288 %1 = load <8 x i8>, <8 x i8>* %0, align 1
1289 %2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %1)
1293 define zeroext i8 @addv16i8i8(i8* %x) {
1294 ; CHECK-LABEL: addv16i8i8:
1295 ; CHECK: @ %bb.0: @ %entry
1296 ; CHECK-NEXT: vldrb.u8 q0, [r0]
1297 ; CHECK-NEXT: vaddv.u8 r0, q0
1298 ; CHECK-NEXT: uxtb r0, r0
1301 %0 = bitcast i8* %x to <16 x i8>*
1302 %1 = load <16 x i8>, <16 x i8>* %0, align 1
1303 %2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %1)
1307 define zeroext i8 @addv24i8i8(i8* %x) {
1308 ; CHECK-LABEL: addv24i8i8:
1309 ; CHECK: @ %bb.0: @ %entry
1310 ; CHECK-NEXT: vldrb.u16 q1, [r0]
1311 ; CHECK-NEXT: vldrb.u8 q0, [r0, #8]
1312 ; CHECK-NEXT: vaddv.u16 r0, q1
1313 ; CHECK-NEXT: vaddva.u8 r0, q0
1314 ; CHECK-NEXT: uxtb r0, r0
1317 %0 = bitcast i8* %x to <8 x i8>*
1318 %1 = load <8 x i8>, <8 x i8>* %0, align 1
1319 %arrayidx.8 = getelementptr inbounds i8, i8* %x, i32 8
1320 %2 = bitcast i8* %arrayidx.8 to <16 x i8>*
1321 %3 = load <16 x i8>, <16 x i8>* %2, align 1
1322 %4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %3)
1323 %5 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %1)
1324 %op.rdx = add i8 %4, %5
1328 define zeroext i8 @addv32i8i8(i8* %x) {
1329 ; CHECK-LABEL: addv32i8i8:
1330 ; CHECK: @ %bb.0: @ %entry
1331 ; CHECK-NEXT: vldrb.u8 q1, [r0]
1332 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
1333 ; CHECK-NEXT: vaddv.u8 r0, q1
1334 ; CHECK-NEXT: vaddva.u8 r0, q0
1335 ; CHECK-NEXT: uxtb r0, r0
1338 %0 = bitcast i8* %x to <32 x i8>*
1339 %1 = load <32 x i8>, <32 x i8>* %0, align 1
1340 %2 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %1)
1344 define zeroext i8 @addv64i8i8(i8* %x) {
1345 ; CHECK-LABEL: addv64i8i8:
1346 ; CHECK: @ %bb.0: @ %entry
1347 ; CHECK-NEXT: vldrb.u8 q1, [r0]
1348 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
1349 ; CHECK-NEXT: vaddv.u8 r2, q1
1350 ; CHECK-NEXT: vaddva.u8 r2, q0
1351 ; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
1352 ; CHECK-NEXT: vaddva.u8 r2, q0
1353 ; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
1354 ; CHECK-NEXT: vaddva.u8 r2, q0
1355 ; CHECK-NEXT: uxtb r0, r2
1358 %0 = bitcast i8* %x to <64 x i8>*
1359 %1 = load <64 x i8>, <64 x i8>* %0, align 1
1360 %2 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %1)
1364 define zeroext i8 @addv128i8i8(i8* %x) {
1365 ; CHECK-LABEL: addv128i8i8:
1366 ; CHECK: @ %bb.0: @ %entry
1367 ; CHECK-NEXT: vldrb.u8 q1, [r0]
1368 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
1369 ; CHECK-NEXT: vaddv.u8 r2, q1
1370 ; CHECK-NEXT: vaddva.u8 r2, q0
1371 ; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
1372 ; CHECK-NEXT: vaddva.u8 r2, q0
1373 ; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
1374 ; CHECK-NEXT: vaddva.u8 r2, q0
1375 ; CHECK-NEXT: vldrb.u8 q0, [r0, #64]
1376 ; CHECK-NEXT: vaddva.u8 r2, q0
1377 ; CHECK-NEXT: vldrb.u8 q0, [r0, #80]
1378 ; CHECK-NEXT: vaddva.u8 r2, q0
1379 ; CHECK-NEXT: vldrb.u8 q0, [r0, #96]
1380 ; CHECK-NEXT: vaddva.u8 r2, q0
1381 ; CHECK-NEXT: vldrb.u8 q0, [r0, #112]
1382 ; CHECK-NEXT: vaddva.u8 r2, q0
1383 ; CHECK-NEXT: uxtb r0, r2
1386 %0 = bitcast i8* %x to <16 x i8>*
1387 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
1388 %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load)
1389 %2 = getelementptr inbounds i8, i8* %x, i32 16
1390 %3 = bitcast i8* %2 to <16 x i8>*
1391 %wide.load.1 = load <16 x i8>, <16 x i8>* %3, align 1
1392 %4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.1)
1394 %6 = getelementptr inbounds i8, i8* %x, i32 32
1395 %7 = bitcast i8* %6 to <16 x i8>*
1396 %wide.load.2 = load <16 x i8>, <16 x i8>* %7, align 1
1397 %8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.2)
1399 %10 = getelementptr inbounds i8, i8* %x, i32 48
1400 %11 = bitcast i8* %10 to <16 x i8>*
1401 %wide.load.3 = load <16 x i8>, <16 x i8>* %11, align 1
1402 %12 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.3)
1403 %13 = add i8 %12, %9
1404 %14 = getelementptr inbounds i8, i8* %x, i32 64
1405 %15 = bitcast i8* %14 to <16 x i8>*
1406 %wide.load.4 = load <16 x i8>, <16 x i8>* %15, align 1
1407 %16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.4)
1408 %17 = add i8 %16, %13
1409 %18 = getelementptr inbounds i8, i8* %x, i32 80
1410 %19 = bitcast i8* %18 to <16 x i8>*
1411 %wide.load.5 = load <16 x i8>, <16 x i8>* %19, align 1
1412 %20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.5)
1413 %21 = add i8 %20, %17
1414 %22 = getelementptr inbounds i8, i8* %x, i32 96
1415 %23 = bitcast i8* %22 to <16 x i8>*
1416 %wide.load.6 = load <16 x i8>, <16 x i8>* %23, align 1
1417 %24 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.6)
1418 %25 = add i8 %24, %21
1419 %26 = getelementptr inbounds i8, i8* %x, i32 112
1420 %27 = bitcast i8* %26 to <16 x i8>*
1421 %wide.load.7 = load <16 x i8>, <16 x i8>* %27, align 1
1422 %28 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.7)
1423 %29 = add i8 %28, %25
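; Multiply-accumulate reductions (mul followed by vector.reduce.add), which
; select to vmlav/vmlava.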
1429 define i32 @mlav2i32i32(i32* %x, i32* %y) {
1430 ; CHECK-LABEL: mlav2i32i32:
1431 ; CHECK: @ %bb.0: @ %entry
1432 ; CHECK-NEXT: ldrd r2, r0, [r0]
1433 ; CHECK-NEXT: ldrd r3, r1, [r1]
1434 ; CHECK-NEXT: muls r2, r3, r2
1435 ; CHECK-NEXT: mla r0, r1, r0, r2
1438 %0 = load i32, i32* %x, align 4
1439 %1 = load i32, i32* %y, align 4
1440 %mul = mul nsw i32 %1, %0
1441 %arrayidx.1 = getelementptr inbounds i32, i32* %x, i32 1
1442 %2 = load i32, i32* %arrayidx.1, align 4
1443 %arrayidx1.1 = getelementptr inbounds i32, i32* %y, i32 1
1444 %3 = load i32, i32* %arrayidx1.1, align 4
1445 %mul.1 = mul nsw i32 %3, %2
1446 %add.1 = add nsw i32 %mul.1, %mul
1450 define i32 @mlav4i32i32(i32* %x, i32* %y) {
1451 ; CHECK-LABEL: mlav4i32i32:
1452 ; CHECK: @ %bb.0: @ %entry
1453 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1454 ; CHECK-NEXT: vldrw.u32 q1, [r1]
1455 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
1458 %0 = bitcast i32* %x to <4 x i32>*
1459 %1 = load <4 x i32>, <4 x i32>* %0, align 4
1460 %2 = bitcast i32* %y to <4 x i32>*
1461 %3 = load <4 x i32>, <4 x i32>* %2, align 4
1462 %4 = mul nsw <4 x i32> %3, %1
1463 %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
1467 define i32 @mlav8i32i32(i32* %x, i32* %y) {
1468 ; CHECK-LABEL: mlav8i32i32:
1469 ; CHECK: @ %bb.0: @ %entry
1470 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1471 ; CHECK-NEXT: vldrw.u32 q1, [r1]
1472 ; CHECK-NEXT: vmlav.u32 r2, q1, q0
1473 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
1474 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
1475 ; CHECK-NEXT: vmlava.u32 r2, q1, q0
1476 ; CHECK-NEXT: mov r0, r2
1479 %0 = bitcast i32* %x to <8 x i32>*
1480 %1 = load <8 x i32>, <8 x i32>* %0, align 4
1481 %2 = bitcast i32* %y to <8 x i32>*
1482 %3 = load <8 x i32>, <8 x i32>* %2, align 4
1483 %4 = mul nsw <8 x i32> %3, %1
1484 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
1488 define i32 @mlav16i32i32(i32* %x, i32* %y) {
1489 ; CHECK-LABEL: mlav16i32i32:
1490 ; CHECK: @ %bb.0: @ %entry
1491 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1492 ; CHECK-NEXT: vldrw.u32 q1, [r1]
1493 ; CHECK-NEXT: vmlav.u32 r2, q1, q0
1494 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
1495 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
1496 ; CHECK-NEXT: vmlava.u32 r2, q1, q0
1497 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
1498 ; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
1499 ; CHECK-NEXT: vmlava.u32 r2, q1, q0
1500 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
1501 ; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
1502 ; CHECK-NEXT: vmlava.u32 r2, q1, q0
1503 ; CHECK-NEXT: mov r0, r2
1506 %0 = bitcast i32* %x to <16 x i32>*
1507 %1 = load <16 x i32>, <16 x i32>* %0, align 4
1508 %2 = bitcast i32* %y to <16 x i32>*
1509 %3 = load <16 x i32>, <16 x i32>* %2, align 4
1510 %4 = mul nsw <16 x i32> %3, %1
1511 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
1515 define i32 @mlav24i32i32(i32* %x, i32* %y) {
1516 ; CHECK-LABEL: mlav24i32i32:
1517 ; CHECK: @ %bb.0: @ %entry
1518 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1519 ; CHECK-NEXT: vldrw.u32 q1, [r1]
1520 ; CHECK-NEXT: mov r2, r0
1521 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
1522 ; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
1523 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
1524 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1525 ; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
1526 ; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
1527 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1528 ; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
1529 ; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
1530 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1531 ; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
1532 ; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
1533 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1534 ; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
1535 ; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
1536 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1539 %0 = bitcast i32* %x to <8 x i32>*
1540 %1 = load <8 x i32>, <8 x i32>* %0, align 4
1541 %2 = bitcast i32* %y to <8 x i32>*
1542 %3 = load <8 x i32>, <8 x i32>* %2, align 4
1543 %4 = mul nsw <8 x i32> %3, %1
1544 %arrayidx.8 = getelementptr inbounds i32, i32* %x, i32 8
1545 %arrayidx1.8 = getelementptr inbounds i32, i32* %y, i32 8
1546 %5 = bitcast i32* %arrayidx.8 to <16 x i32>*
1547 %6 = load <16 x i32>, <16 x i32>* %5, align 4
1548 %7 = bitcast i32* %arrayidx1.8 to <16 x i32>*
1549 %8 = load <16 x i32>, <16 x i32>* %7, align 4
1550 %9 = mul nsw <16 x i32> %8, %6
1551 %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9)
1552 %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
1553 %op.rdx = add nsw i32 %10, %11
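; 32 elements arrive as a single <32 x i32> multiply-reduce and are split back
; into eight 128-bit vmlav/vmlava steps.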
1557 define i32 @mlav32i32i32(i32* %x, i32* %y) {
1558 ; CHECK-LABEL: mlav32i32i32:
1559 ; CHECK: @ %bb.0: @ %entry
1560 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1561 ; CHECK-NEXT: vldrw.u32 q1, [r1]
1562 ; CHECK-NEXT: mov r2, r0
1563 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
1564 ; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
1565 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
1566 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1567 ; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
1568 ; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
1569 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1570 ; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
1571 ; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
1572 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1573 ; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
1574 ; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
1575 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1576 ; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
1577 ; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
1578 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1579 ; CHECK-NEXT: vldrw.u32 q0, [r2, #96]
1580 ; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
1581 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1582 ; CHECK-NEXT: vldrw.u32 q0, [r2, #112]
1583 ; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
1584 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1587 %0 = bitcast i32* %x to <32 x i32>*
1588 %1 = load <32 x i32>, <32 x i32>* %0, align 4
1589 %2 = bitcast i32* %y to <32 x i32>*
1590 %3 = load <32 x i32>, <32 x i32>* %2, align 4
1591 %4 = mul nsw <32 x i32> %3, %1
1592 %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
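; The 64- and 128-element versions keep the unrolled-loop shape from the
; vectorizer: a v4i32 multiply-reduce per iteration added into a running scalar,
; which folds into a vmlava chain.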
1596 define i32 @mlav64i32i32(i32* %x, i32* %y) {
1597 ; CHECK-LABEL: mlav64i32i32:
1598 ; CHECK: @ %bb.0: @ %entry
1599 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1600 ; CHECK-NEXT: vldrw.u32 q1, [r1]
1601 ; CHECK-NEXT: mov r2, r0
1602 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
1603 ; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
1604 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
1605 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1606 ; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
1607 ; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
1608 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1609 ; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
1610 ; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
1611 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1612 ; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
1613 ; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
1614 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1615 ; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
1616 ; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
1617 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1618 ; CHECK-NEXT: vldrw.u32 q0, [r2, #96]
1619 ; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
1620 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1621 ; CHECK-NEXT: vldrw.u32 q0, [r2, #112]
1622 ; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
1623 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1624 ; CHECK-NEXT: vldrw.u32 q0, [r2, #128]
1625 ; CHECK-NEXT: vldrw.u32 q1, [r1, #128]
1626 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1627 ; CHECK-NEXT: vldrw.u32 q0, [r2, #144]
1628 ; CHECK-NEXT: vldrw.u32 q1, [r1, #144]
1629 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1630 ; CHECK-NEXT: vldrw.u32 q0, [r2, #160]
1631 ; CHECK-NEXT: vldrw.u32 q1, [r1, #160]
1632 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1633 ; CHECK-NEXT: vldrw.u32 q0, [r2, #176]
1634 ; CHECK-NEXT: vldrw.u32 q1, [r1, #176]
1635 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1636 ; CHECK-NEXT: vldrw.u32 q0, [r2, #192]
1637 ; CHECK-NEXT: vldrw.u32 q1, [r1, #192]
1638 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1639 ; CHECK-NEXT: vldrw.u32 q0, [r2, #208]
1640 ; CHECK-NEXT: vldrw.u32 q1, [r1, #208]
1641 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1642 ; CHECK-NEXT: vldrw.u32 q0, [r2, #224]
1643 ; CHECK-NEXT: vldrw.u32 q1, [r1, #224]
1644 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1645 ; CHECK-NEXT: vldrw.u32 q0, [r2, #240]
1646 ; CHECK-NEXT: vldrw.u32 q1, [r1, #240]
1647 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1650 %0 = bitcast i32* %x to <4 x i32>*
1651 %wide.load = load <4 x i32>, <4 x i32>* %0, align 4
1652 %1 = bitcast i32* %y to <4 x i32>*
1653 %wide.load10 = load <4 x i32>, <4 x i32>* %1, align 4
1654 %2 = mul nsw <4 x i32> %wide.load10, %wide.load
1655 %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
1656 %4 = getelementptr inbounds i32, i32* %x, i32 4
1657 %5 = bitcast i32* %4 to <4 x i32>*
1658 %wide.load.1 = load <4 x i32>, <4 x i32>* %5, align 4
1659 %6 = getelementptr inbounds i32, i32* %y, i32 4
1660 %7 = bitcast i32* %6 to <4 x i32>*
1661 %wide.load10.1 = load <4 x i32>, <4 x i32>* %7, align 4
1662 %8 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
1663 %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
1664 %10 = add i32 %9, %3
1665 %11 = getelementptr inbounds i32, i32* %x, i32 8
1666 %12 = bitcast i32* %11 to <4 x i32>*
1667 %wide.load.2 = load <4 x i32>, <4 x i32>* %12, align 4
1668 %13 = getelementptr inbounds i32, i32* %y, i32 8
1669 %14 = bitcast i32* %13 to <4 x i32>*
1670 %wide.load10.2 = load <4 x i32>, <4 x i32>* %14, align 4
1671 %15 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
1672 %16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %15)
1673 %17 = add i32 %16, %10
1674 %18 = getelementptr inbounds i32, i32* %x, i32 12
1675 %19 = bitcast i32* %18 to <4 x i32>*
1676 %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 4
1677 %20 = getelementptr inbounds i32, i32* %y, i32 12
1678 %21 = bitcast i32* %20 to <4 x i32>*
1679 %wide.load10.3 = load <4 x i32>, <4 x i32>* %21, align 4
1680 %22 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
1681 %23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %22)
1682 %24 = add i32 %23, %17
1683 %25 = getelementptr inbounds i32, i32* %x, i32 16
1684 %26 = bitcast i32* %25 to <4 x i32>*
1685 %wide.load.4 = load <4 x i32>, <4 x i32>* %26, align 4
1686 %27 = getelementptr inbounds i32, i32* %y, i32 16
1687 %28 = bitcast i32* %27 to <4 x i32>*
1688 %wide.load10.4 = load <4 x i32>, <4 x i32>* %28, align 4
1689 %29 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
1690 %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
1691 %31 = add i32 %30, %24
1692 %32 = getelementptr inbounds i32, i32* %x, i32 20
1693 %33 = bitcast i32* %32 to <4 x i32>*
1694 %wide.load.5 = load <4 x i32>, <4 x i32>* %33, align 4
1695 %34 = getelementptr inbounds i32, i32* %y, i32 20
1696 %35 = bitcast i32* %34 to <4 x i32>*
1697 %wide.load10.5 = load <4 x i32>, <4 x i32>* %35, align 4
1698 %36 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
1699 %37 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %36)
1700 %38 = add i32 %37, %31
1701 %39 = getelementptr inbounds i32, i32* %x, i32 24
1702 %40 = bitcast i32* %39 to <4 x i32>*
1703 %wide.load.6 = load <4 x i32>, <4 x i32>* %40, align 4
1704 %41 = getelementptr inbounds i32, i32* %y, i32 24
1705 %42 = bitcast i32* %41 to <4 x i32>*
1706 %wide.load10.6 = load <4 x i32>, <4 x i32>* %42, align 4
1707 %43 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
1708 %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %43)
1709 %45 = add i32 %44, %38
1710 %46 = getelementptr inbounds i32, i32* %x, i32 28
1711 %47 = bitcast i32* %46 to <4 x i32>*
1712 %wide.load.7 = load <4 x i32>, <4 x i32>* %47, align 4
1713 %48 = getelementptr inbounds i32, i32* %y, i32 28
1714 %49 = bitcast i32* %48 to <4 x i32>*
1715 %wide.load10.7 = load <4 x i32>, <4 x i32>* %49, align 4
1716 %50 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
1717 %51 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %50)
1718 %52 = add i32 %51, %45
1719 %53 = getelementptr inbounds i32, i32* %x, i32 32
1720 %54 = bitcast i32* %53 to <4 x i32>*
1721 %wide.load.8 = load <4 x i32>, <4 x i32>* %54, align 4
1722 %55 = getelementptr inbounds i32, i32* %y, i32 32
1723 %56 = bitcast i32* %55 to <4 x i32>*
1724 %wide.load10.8 = load <4 x i32>, <4 x i32>* %56, align 4
1725 %57 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
1726 %58 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %57)
1727 %59 = add i32 %58, %52
1728 %60 = getelementptr inbounds i32, i32* %x, i32 36
1729 %61 = bitcast i32* %60 to <4 x i32>*
1730 %wide.load.9 = load <4 x i32>, <4 x i32>* %61, align 4
1731 %62 = getelementptr inbounds i32, i32* %y, i32 36
1732 %63 = bitcast i32* %62 to <4 x i32>*
1733 %wide.load10.9 = load <4 x i32>, <4 x i32>* %63, align 4
1734 %64 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
1735 %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
1736 %66 = add i32 %65, %59
1737 %67 = getelementptr inbounds i32, i32* %x, i32 40
1738 %68 = bitcast i32* %67 to <4 x i32>*
1739 %wide.load.10 = load <4 x i32>, <4 x i32>* %68, align 4
1740 %69 = getelementptr inbounds i32, i32* %y, i32 40
1741 %70 = bitcast i32* %69 to <4 x i32>*
1742 %wide.load10.10 = load <4 x i32>, <4 x i32>* %70, align 4
1743 %71 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
1744 %72 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %71)
1745 %73 = add i32 %72, %66
1746 %74 = getelementptr inbounds i32, i32* %x, i32 44
1747 %75 = bitcast i32* %74 to <4 x i32>*
1748 %wide.load.11 = load <4 x i32>, <4 x i32>* %75, align 4
1749 %76 = getelementptr inbounds i32, i32* %y, i32 44
1750 %77 = bitcast i32* %76 to <4 x i32>*
1751 %wide.load10.11 = load <4 x i32>, <4 x i32>* %77, align 4
1752 %78 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
1753 %79 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %78)
1754 %80 = add i32 %79, %73
1755 %81 = getelementptr inbounds i32, i32* %x, i32 48
1756 %82 = bitcast i32* %81 to <4 x i32>*
1757 %wide.load.12 = load <4 x i32>, <4 x i32>* %82, align 4
1758 %83 = getelementptr inbounds i32, i32* %y, i32 48
1759 %84 = bitcast i32* %83 to <4 x i32>*
1760 %wide.load10.12 = load <4 x i32>, <4 x i32>* %84, align 4
1761 %85 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
1762 %86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %85)
1763 %87 = add i32 %86, %80
1764 %88 = getelementptr inbounds i32, i32* %x, i32 52
1765 %89 = bitcast i32* %88 to <4 x i32>*
1766 %wide.load.13 = load <4 x i32>, <4 x i32>* %89, align 4
1767 %90 = getelementptr inbounds i32, i32* %y, i32 52
1768 %91 = bitcast i32* %90 to <4 x i32>*
1769 %wide.load10.13 = load <4 x i32>, <4 x i32>* %91, align 4
1770 %92 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
1771 %93 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %92)
1772 %94 = add i32 %93, %87
1773 %95 = getelementptr inbounds i32, i32* %x, i32 56
1774 %96 = bitcast i32* %95 to <4 x i32>*
1775 %wide.load.14 = load <4 x i32>, <4 x i32>* %96, align 4
1776 %97 = getelementptr inbounds i32, i32* %y, i32 56
1777 %98 = bitcast i32* %97 to <4 x i32>*
1778 %wide.load10.14 = load <4 x i32>, <4 x i32>* %98, align 4
1779 %99 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
1780 %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99)
1781 %101 = add i32 %100, %94
1782 %102 = getelementptr inbounds i32, i32* %x, i32 60
1783 %103 = bitcast i32* %102 to <4 x i32>*
1784 %wide.load.15 = load <4 x i32>, <4 x i32>* %103, align 4
1785 %104 = getelementptr inbounds i32, i32* %y, i32 60
1786 %105 = bitcast i32* %104 to <4 x i32>*
1787 %wide.load10.15 = load <4 x i32>, <4 x i32>* %105, align 4
1788 %106 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
1789 %107 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %106)
1790 %108 = add i32 %107, %101
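; Same pattern at 128 elements: 32 unrolled v4i32 iterations.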
1794 define i32 @mlav128i32i32(i32* %x, i32* %y) {
1795 ; CHECK-LABEL: mlav128i32i32:
1796 ; CHECK: @ %bb.0: @ %entry
1797 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1798 ; CHECK-NEXT: vldrw.u32 q1, [r1]
1799 ; CHECK-NEXT: mov r2, r0
1800 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
1801 ; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
1802 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
1803 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1804 ; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
1805 ; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
1806 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1807 ; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
1808 ; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
1809 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1810 ; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
1811 ; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
1812 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1813 ; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
1814 ; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
1815 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1816 ; CHECK-NEXT: vldrw.u32 q0, [r2, #96]
1817 ; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
1818 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1819 ; CHECK-NEXT: vldrw.u32 q0, [r2, #112]
1820 ; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
1821 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1822 ; CHECK-NEXT: vldrw.u32 q0, [r2, #128]
1823 ; CHECK-NEXT: vldrw.u32 q1, [r1, #128]
1824 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1825 ; CHECK-NEXT: vldrw.u32 q0, [r2, #144]
1826 ; CHECK-NEXT: vldrw.u32 q1, [r1, #144]
1827 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1828 ; CHECK-NEXT: vldrw.u32 q0, [r2, #160]
1829 ; CHECK-NEXT: vldrw.u32 q1, [r1, #160]
1830 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1831 ; CHECK-NEXT: vldrw.u32 q0, [r2, #176]
1832 ; CHECK-NEXT: vldrw.u32 q1, [r1, #176]
1833 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1834 ; CHECK-NEXT: vldrw.u32 q0, [r2, #192]
1835 ; CHECK-NEXT: vldrw.u32 q1, [r1, #192]
1836 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1837 ; CHECK-NEXT: vldrw.u32 q0, [r2, #208]
1838 ; CHECK-NEXT: vldrw.u32 q1, [r1, #208]
1839 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1840 ; CHECK-NEXT: vldrw.u32 q0, [r2, #224]
1841 ; CHECK-NEXT: vldrw.u32 q1, [r1, #224]
1842 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1843 ; CHECK-NEXT: vldrw.u32 q0, [r2, #240]
1844 ; CHECK-NEXT: vldrw.u32 q1, [r1, #240]
1845 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1846 ; CHECK-NEXT: vldrw.u32 q0, [r2, #256]
1847 ; CHECK-NEXT: vldrw.u32 q1, [r1, #256]
1848 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1849 ; CHECK-NEXT: vldrw.u32 q0, [r2, #272]
1850 ; CHECK-NEXT: vldrw.u32 q1, [r1, #272]
1851 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1852 ; CHECK-NEXT: vldrw.u32 q0, [r2, #288]
1853 ; CHECK-NEXT: vldrw.u32 q1, [r1, #288]
1854 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1855 ; CHECK-NEXT: vldrw.u32 q0, [r2, #304]
1856 ; CHECK-NEXT: vldrw.u32 q1, [r1, #304]
1857 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1858 ; CHECK-NEXT: vldrw.u32 q0, [r2, #320]
1859 ; CHECK-NEXT: vldrw.u32 q1, [r1, #320]
1860 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1861 ; CHECK-NEXT: vldrw.u32 q0, [r2, #336]
1862 ; CHECK-NEXT: vldrw.u32 q1, [r1, #336]
1863 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1864 ; CHECK-NEXT: vldrw.u32 q0, [r2, #352]
1865 ; CHECK-NEXT: vldrw.u32 q1, [r1, #352]
1866 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1867 ; CHECK-NEXT: vldrw.u32 q0, [r2, #368]
1868 ; CHECK-NEXT: vldrw.u32 q1, [r1, #368]
1869 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1870 ; CHECK-NEXT: vldrw.u32 q0, [r2, #384]
1871 ; CHECK-NEXT: vldrw.u32 q1, [r1, #384]
1872 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1873 ; CHECK-NEXT: vldrw.u32 q0, [r2, #400]
1874 ; CHECK-NEXT: vldrw.u32 q1, [r1, #400]
1875 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1876 ; CHECK-NEXT: vldrw.u32 q0, [r2, #416]
1877 ; CHECK-NEXT: vldrw.u32 q1, [r1, #416]
1878 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1879 ; CHECK-NEXT: vldrw.u32 q0, [r2, #432]
1880 ; CHECK-NEXT: vldrw.u32 q1, [r1, #432]
1881 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1882 ; CHECK-NEXT: vldrw.u32 q0, [r2, #448]
1883 ; CHECK-NEXT: vldrw.u32 q1, [r1, #448]
1884 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1885 ; CHECK-NEXT: vldrw.u32 q0, [r2, #464]
1886 ; CHECK-NEXT: vldrw.u32 q1, [r1, #464]
1887 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1888 ; CHECK-NEXT: vldrw.u32 q0, [r2, #480]
1889 ; CHECK-NEXT: vldrw.u32 q1, [r1, #480]
1890 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1891 ; CHECK-NEXT: vldrw.u32 q0, [r2, #496]
1892 ; CHECK-NEXT: vldrw.u32 q1, [r1, #496]
1893 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
1896 %0 = bitcast i32* %x to <4 x i32>*
1897 %wide.load = load <4 x i32>, <4 x i32>* %0, align 4
1898 %1 = bitcast i32* %y to <4 x i32>*
1899 %wide.load10 = load <4 x i32>, <4 x i32>* %1, align 4
1900 %2 = mul nsw <4 x i32> %wide.load10, %wide.load
1901 %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
1902 %4 = getelementptr inbounds i32, i32* %x, i32 4
1903 %5 = bitcast i32* %4 to <4 x i32>*
1904 %wide.load.1 = load <4 x i32>, <4 x i32>* %5, align 4
1905 %6 = getelementptr inbounds i32, i32* %y, i32 4
1906 %7 = bitcast i32* %6 to <4 x i32>*
1907 %wide.load10.1 = load <4 x i32>, <4 x i32>* %7, align 4
1908 %8 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
1909 %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
1910 %10 = add i32 %9, %3
1911 %11 = getelementptr inbounds i32, i32* %x, i32 8
1912 %12 = bitcast i32* %11 to <4 x i32>*
1913 %wide.load.2 = load <4 x i32>, <4 x i32>* %12, align 4
1914 %13 = getelementptr inbounds i32, i32* %y, i32 8
1915 %14 = bitcast i32* %13 to <4 x i32>*
1916 %wide.load10.2 = load <4 x i32>, <4 x i32>* %14, align 4
1917 %15 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
1918 %16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %15)
1919 %17 = add i32 %16, %10
1920 %18 = getelementptr inbounds i32, i32* %x, i32 12
1921 %19 = bitcast i32* %18 to <4 x i32>*
1922 %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 4
1923 %20 = getelementptr inbounds i32, i32* %y, i32 12
1924 %21 = bitcast i32* %20 to <4 x i32>*
1925 %wide.load10.3 = load <4 x i32>, <4 x i32>* %21, align 4
1926 %22 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
1927 %23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %22)
1928 %24 = add i32 %23, %17
1929 %25 = getelementptr inbounds i32, i32* %x, i32 16
1930 %26 = bitcast i32* %25 to <4 x i32>*
1931 %wide.load.4 = load <4 x i32>, <4 x i32>* %26, align 4
1932 %27 = getelementptr inbounds i32, i32* %y, i32 16
1933 %28 = bitcast i32* %27 to <4 x i32>*
1934 %wide.load10.4 = load <4 x i32>, <4 x i32>* %28, align 4
1935 %29 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
1936 %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
1937 %31 = add i32 %30, %24
1938 %32 = getelementptr inbounds i32, i32* %x, i32 20
1939 %33 = bitcast i32* %32 to <4 x i32>*
1940 %wide.load.5 = load <4 x i32>, <4 x i32>* %33, align 4
1941 %34 = getelementptr inbounds i32, i32* %y, i32 20
1942 %35 = bitcast i32* %34 to <4 x i32>*
1943 %wide.load10.5 = load <4 x i32>, <4 x i32>* %35, align 4
1944 %36 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
1945 %37 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %36)
1946 %38 = add i32 %37, %31
1947 %39 = getelementptr inbounds i32, i32* %x, i32 24
1948 %40 = bitcast i32* %39 to <4 x i32>*
1949 %wide.load.6 = load <4 x i32>, <4 x i32>* %40, align 4
1950 %41 = getelementptr inbounds i32, i32* %y, i32 24
1951 %42 = bitcast i32* %41 to <4 x i32>*
1952 %wide.load10.6 = load <4 x i32>, <4 x i32>* %42, align 4
1953 %43 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
1954 %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %43)
1955 %45 = add i32 %44, %38
1956 %46 = getelementptr inbounds i32, i32* %x, i32 28
1957 %47 = bitcast i32* %46 to <4 x i32>*
1958 %wide.load.7 = load <4 x i32>, <4 x i32>* %47, align 4
1959 %48 = getelementptr inbounds i32, i32* %y, i32 28
1960 %49 = bitcast i32* %48 to <4 x i32>*
1961 %wide.load10.7 = load <4 x i32>, <4 x i32>* %49, align 4
1962 %50 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
1963 %51 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %50)
1964 %52 = add i32 %51, %45
1965 %53 = getelementptr inbounds i32, i32* %x, i32 32
1966 %54 = bitcast i32* %53 to <4 x i32>*
1967 %wide.load.8 = load <4 x i32>, <4 x i32>* %54, align 4
1968 %55 = getelementptr inbounds i32, i32* %y, i32 32
1969 %56 = bitcast i32* %55 to <4 x i32>*
1970 %wide.load10.8 = load <4 x i32>, <4 x i32>* %56, align 4
1971 %57 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
1972 %58 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %57)
1973 %59 = add i32 %58, %52
1974 %60 = getelementptr inbounds i32, i32* %x, i32 36
1975 %61 = bitcast i32* %60 to <4 x i32>*
1976 %wide.load.9 = load <4 x i32>, <4 x i32>* %61, align 4
1977 %62 = getelementptr inbounds i32, i32* %y, i32 36
1978 %63 = bitcast i32* %62 to <4 x i32>*
1979 %wide.load10.9 = load <4 x i32>, <4 x i32>* %63, align 4
1980 %64 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
1981 %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
1982 %66 = add i32 %65, %59
1983 %67 = getelementptr inbounds i32, i32* %x, i32 40
1984 %68 = bitcast i32* %67 to <4 x i32>*
1985 %wide.load.10 = load <4 x i32>, <4 x i32>* %68, align 4
1986 %69 = getelementptr inbounds i32, i32* %y, i32 40
1987 %70 = bitcast i32* %69 to <4 x i32>*
1988 %wide.load10.10 = load <4 x i32>, <4 x i32>* %70, align 4
1989 %71 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
1990 %72 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %71)
1991 %73 = add i32 %72, %66
1992 %74 = getelementptr inbounds i32, i32* %x, i32 44
1993 %75 = bitcast i32* %74 to <4 x i32>*
1994 %wide.load.11 = load <4 x i32>, <4 x i32>* %75, align 4
1995 %76 = getelementptr inbounds i32, i32* %y, i32 44
1996 %77 = bitcast i32* %76 to <4 x i32>*
1997 %wide.load10.11 = load <4 x i32>, <4 x i32>* %77, align 4
1998 %78 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
1999 %79 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %78)
2000 %80 = add i32 %79, %73
2001 %81 = getelementptr inbounds i32, i32* %x, i32 48
2002 %82 = bitcast i32* %81 to <4 x i32>*
2003 %wide.load.12 = load <4 x i32>, <4 x i32>* %82, align 4
2004 %83 = getelementptr inbounds i32, i32* %y, i32 48
2005 %84 = bitcast i32* %83 to <4 x i32>*
2006 %wide.load10.12 = load <4 x i32>, <4 x i32>* %84, align 4
2007 %85 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
2008 %86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %85)
2009 %87 = add i32 %86, %80
2010 %88 = getelementptr inbounds i32, i32* %x, i32 52
2011 %89 = bitcast i32* %88 to <4 x i32>*
2012 %wide.load.13 = load <4 x i32>, <4 x i32>* %89, align 4
2013 %90 = getelementptr inbounds i32, i32* %y, i32 52
2014 %91 = bitcast i32* %90 to <4 x i32>*
2015 %wide.load10.13 = load <4 x i32>, <4 x i32>* %91, align 4
2016 %92 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
2017 %93 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %92)
2018 %94 = add i32 %93, %87
2019 %95 = getelementptr inbounds i32, i32* %x, i32 56
2020 %96 = bitcast i32* %95 to <4 x i32>*
2021 %wide.load.14 = load <4 x i32>, <4 x i32>* %96, align 4
2022 %97 = getelementptr inbounds i32, i32* %y, i32 56
2023 %98 = bitcast i32* %97 to <4 x i32>*
2024 %wide.load10.14 = load <4 x i32>, <4 x i32>* %98, align 4
2025 %99 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
2026 %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99)
2027 %101 = add i32 %100, %94
2028 %102 = getelementptr inbounds i32, i32* %x, i32 60
2029 %103 = bitcast i32* %102 to <4 x i32>*
2030 %wide.load.15 = load <4 x i32>, <4 x i32>* %103, align 4
2031 %104 = getelementptr inbounds i32, i32* %y, i32 60
2032 %105 = bitcast i32* %104 to <4 x i32>*
2033 %wide.load10.15 = load <4 x i32>, <4 x i32>* %105, align 4
2034 %106 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
2035 %107 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %106)
2036 %108 = add i32 %107, %101
2037 %109 = getelementptr inbounds i32, i32* %x, i32 64
2038 %110 = bitcast i32* %109 to <4 x i32>*
2039 %wide.load.16 = load <4 x i32>, <4 x i32>* %110, align 4
2040 %111 = getelementptr inbounds i32, i32* %y, i32 64
2041 %112 = bitcast i32* %111 to <4 x i32>*
2042 %wide.load10.16 = load <4 x i32>, <4 x i32>* %112, align 4
2043 %113 = mul nsw <4 x i32> %wide.load10.16, %wide.load.16
2044 %114 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %113)
2045 %115 = add i32 %114, %108
2046 %116 = getelementptr inbounds i32, i32* %x, i32 68
2047 %117 = bitcast i32* %116 to <4 x i32>*
2048 %wide.load.17 = load <4 x i32>, <4 x i32>* %117, align 4
2049 %118 = getelementptr inbounds i32, i32* %y, i32 68
2050 %119 = bitcast i32* %118 to <4 x i32>*
2051 %wide.load10.17 = load <4 x i32>, <4 x i32>* %119, align 4
2052 %120 = mul nsw <4 x i32> %wide.load10.17, %wide.load.17
2053 %121 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %120)
2054 %122 = add i32 %121, %115
2055 %123 = getelementptr inbounds i32, i32* %x, i32 72
2056 %124 = bitcast i32* %123 to <4 x i32>*
2057 %wide.load.18 = load <4 x i32>, <4 x i32>* %124, align 4
2058 %125 = getelementptr inbounds i32, i32* %y, i32 72
2059 %126 = bitcast i32* %125 to <4 x i32>*
2060 %wide.load10.18 = load <4 x i32>, <4 x i32>* %126, align 4
2061 %127 = mul nsw <4 x i32> %wide.load10.18, %wide.load.18
2062 %128 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %127)
2063 %129 = add i32 %128, %122
2064 %130 = getelementptr inbounds i32, i32* %x, i32 76
2065 %131 = bitcast i32* %130 to <4 x i32>*
2066 %wide.load.19 = load <4 x i32>, <4 x i32>* %131, align 4
2067 %132 = getelementptr inbounds i32, i32* %y, i32 76
2068 %133 = bitcast i32* %132 to <4 x i32>*
2069 %wide.load10.19 = load <4 x i32>, <4 x i32>* %133, align 4
2070 %134 = mul nsw <4 x i32> %wide.load10.19, %wide.load.19
2071 %135 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %134)
2072 %136 = add i32 %135, %129
2073 %137 = getelementptr inbounds i32, i32* %x, i32 80
2074 %138 = bitcast i32* %137 to <4 x i32>*
2075 %wide.load.20 = load <4 x i32>, <4 x i32>* %138, align 4
2076 %139 = getelementptr inbounds i32, i32* %y, i32 80
2077 %140 = bitcast i32* %139 to <4 x i32>*
2078 %wide.load10.20 = load <4 x i32>, <4 x i32>* %140, align 4
2079 %141 = mul nsw <4 x i32> %wide.load10.20, %wide.load.20
2080 %142 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %141)
2081 %143 = add i32 %142, %136
2082 %144 = getelementptr inbounds i32, i32* %x, i32 84
2083 %145 = bitcast i32* %144 to <4 x i32>*
2084 %wide.load.21 = load <4 x i32>, <4 x i32>* %145, align 4
2085 %146 = getelementptr inbounds i32, i32* %y, i32 84
2086 %147 = bitcast i32* %146 to <4 x i32>*
2087 %wide.load10.21 = load <4 x i32>, <4 x i32>* %147, align 4
2088 %148 = mul nsw <4 x i32> %wide.load10.21, %wide.load.21
2089 %149 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %148)
2090 %150 = add i32 %149, %143
2091 %151 = getelementptr inbounds i32, i32* %x, i32 88
2092 %152 = bitcast i32* %151 to <4 x i32>*
2093 %wide.load.22 = load <4 x i32>, <4 x i32>* %152, align 4
2094 %153 = getelementptr inbounds i32, i32* %y, i32 88
2095 %154 = bitcast i32* %153 to <4 x i32>*
2096 %wide.load10.22 = load <4 x i32>, <4 x i32>* %154, align 4
2097 %155 = mul nsw <4 x i32> %wide.load10.22, %wide.load.22
2098 %156 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %155)
2099 %157 = add i32 %156, %150
2100 %158 = getelementptr inbounds i32, i32* %x, i32 92
2101 %159 = bitcast i32* %158 to <4 x i32>*
2102 %wide.load.23 = load <4 x i32>, <4 x i32>* %159, align 4
2103 %160 = getelementptr inbounds i32, i32* %y, i32 92
2104 %161 = bitcast i32* %160 to <4 x i32>*
2105 %wide.load10.23 = load <4 x i32>, <4 x i32>* %161, align 4
2106 %162 = mul nsw <4 x i32> %wide.load10.23, %wide.load.23
2107 %163 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %162)
2108 %164 = add i32 %163, %157
2109 %165 = getelementptr inbounds i32, i32* %x, i32 96
2110 %166 = bitcast i32* %165 to <4 x i32>*
2111 %wide.load.24 = load <4 x i32>, <4 x i32>* %166, align 4
2112 %167 = getelementptr inbounds i32, i32* %y, i32 96
2113 %168 = bitcast i32* %167 to <4 x i32>*
2114 %wide.load10.24 = load <4 x i32>, <4 x i32>* %168, align 4
2115 %169 = mul nsw <4 x i32> %wide.load10.24, %wide.load.24
2116 %170 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %169)
2117 %171 = add i32 %170, %164
2118 %172 = getelementptr inbounds i32, i32* %x, i32 100
2119 %173 = bitcast i32* %172 to <4 x i32>*
2120 %wide.load.25 = load <4 x i32>, <4 x i32>* %173, align 4
2121 %174 = getelementptr inbounds i32, i32* %y, i32 100
2122 %175 = bitcast i32* %174 to <4 x i32>*
2123 %wide.load10.25 = load <4 x i32>, <4 x i32>* %175, align 4
2124 %176 = mul nsw <4 x i32> %wide.load10.25, %wide.load.25
2125 %177 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %176)
2126 %178 = add i32 %177, %171
2127 %179 = getelementptr inbounds i32, i32* %x, i32 104
2128 %180 = bitcast i32* %179 to <4 x i32>*
2129 %wide.load.26 = load <4 x i32>, <4 x i32>* %180, align 4
2130 %181 = getelementptr inbounds i32, i32* %y, i32 104
2131 %182 = bitcast i32* %181 to <4 x i32>*
2132 %wide.load10.26 = load <4 x i32>, <4 x i32>* %182, align 4
2133 %183 = mul nsw <4 x i32> %wide.load10.26, %wide.load.26
2134 %184 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %183)
2135 %185 = add i32 %184, %178
2136 %186 = getelementptr inbounds i32, i32* %x, i32 108
2137 %187 = bitcast i32* %186 to <4 x i32>*
2138 %wide.load.27 = load <4 x i32>, <4 x i32>* %187, align 4
2139 %188 = getelementptr inbounds i32, i32* %y, i32 108
2140 %189 = bitcast i32* %188 to <4 x i32>*
2141 %wide.load10.27 = load <4 x i32>, <4 x i32>* %189, align 4
2142 %190 = mul nsw <4 x i32> %wide.load10.27, %wide.load.27
2143 %191 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %190)
2144 %192 = add i32 %191, %185
2145 %193 = getelementptr inbounds i32, i32* %x, i32 112
2146 %194 = bitcast i32* %193 to <4 x i32>*
2147 %wide.load.28 = load <4 x i32>, <4 x i32>* %194, align 4
2148 %195 = getelementptr inbounds i32, i32* %y, i32 112
2149 %196 = bitcast i32* %195 to <4 x i32>*
2150 %wide.load10.28 = load <4 x i32>, <4 x i32>* %196, align 4
2151 %197 = mul nsw <4 x i32> %wide.load10.28, %wide.load.28
2152 %198 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %197)
2153 %199 = add i32 %198, %192
2154 %200 = getelementptr inbounds i32, i32* %x, i32 116
2155 %201 = bitcast i32* %200 to <4 x i32>*
2156 %wide.load.29 = load <4 x i32>, <4 x i32>* %201, align 4
2157 %202 = getelementptr inbounds i32, i32* %y, i32 116
2158 %203 = bitcast i32* %202 to <4 x i32>*
2159 %wide.load10.29 = load <4 x i32>, <4 x i32>* %203, align 4
2160 %204 = mul nsw <4 x i32> %wide.load10.29, %wide.load.29
2161 %205 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %204)
2162 %206 = add i32 %205, %199
2163 %207 = getelementptr inbounds i32, i32* %x, i32 120
2164 %208 = bitcast i32* %207 to <4 x i32>*
2165 %wide.load.30 = load <4 x i32>, <4 x i32>* %208, align 4
2166 %209 = getelementptr inbounds i32, i32* %y, i32 120
2167 %210 = bitcast i32* %209 to <4 x i32>*
2168 %wide.load10.30 = load <4 x i32>, <4 x i32>* %210, align 4
2169 %211 = mul nsw <4 x i32> %wide.load10.30, %wide.load.30
2170 %212 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %211)
2171 %213 = add i32 %212, %206
2172 %214 = getelementptr inbounds i32, i32* %x, i32 124
2173 %215 = bitcast i32* %214 to <4 x i32>*
2174 %wide.load.31 = load <4 x i32>, <4 x i32>* %215, align 4
2175 %216 = getelementptr inbounds i32, i32* %y, i32 124
2176 %217 = bitcast i32* %216 to <4 x i32>*
2177 %wide.load10.31 = load <4 x i32>, <4 x i32>* %217, align 4
2178 %218 = mul nsw <4 x i32> %wide.load10.31, %wide.load.31
2179 %219 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %218)
2180 %220 = add i32 %219, %213
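; i16 -> i32 sign-extending multiply-accumulate reductions. The 2-element case
; uses scalar ldrsh loads, a mul and an smlabb.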
2184 define i32 @mlav2i32i16(i16* %x, i16* %y) {
2185 ; CHECK-LABEL: mlav2i32i16:
2186 ; CHECK: @ %bb.0: @ %entry
2187 ; CHECK-NEXT: ldrsh.w r2, [r0]
2188 ; CHECK-NEXT: ldrsh.w r3, [r1]
2189 ; CHECK-NEXT: ldrsh.w r0, [r0, #2]
2190 ; CHECK-NEXT: ldrsh.w r1, [r1, #2]
2191 ; CHECK-NEXT: muls r0, r1, r0
2192 ; CHECK-NEXT: smlabb r0, r3, r2, r0
2195 %0 = load i16, i16* %x, align 2
2196 %conv = sext i16 %0 to i32
2197 %1 = load i16, i16* %y, align 2
2198 %conv2 = sext i16 %1 to i32
2199 %mul = mul nsw i32 %conv2, %conv
2200 %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1
2201 %2 = load i16, i16* %arrayidx.1, align 2
2202 %conv.1 = sext i16 %2 to i32
2203 %arrayidx1.1 = getelementptr inbounds i16, i16* %y, i32 1
2204 %3 = load i16, i16* %arrayidx1.1, align 2
2205 %conv2.1 = sext i16 %3 to i32
2206 %mul.1 = mul nsw i32 %conv2.1, %conv.1
2207 %add.1 = add nsw i32 %mul.1, %mul
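; Four i16 values are widened on load with vldrh.s32, then reduced with vmlav.u32.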
2211 define i32 @mlav4i32i16(i16* %x, i16* %y) {
2212 ; CHECK-LABEL: mlav4i32i16:
2213 ; CHECK: @ %bb.0: @ %entry
2214 ; CHECK-NEXT: vldrh.s32 q0, [r0]
2215 ; CHECK-NEXT: vldrh.s32 q1, [r1]
2216 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
2219 %0 = bitcast i16* %x to <4 x i16>*
2220 %1 = load <4 x i16>, <4 x i16>* %0, align 2
2221 %2 = sext <4 x i16> %1 to <4 x i32>
2222 %3 = bitcast i16* %y to <4 x i16>*
2223 %4 = load <4 x i16>, <4 x i16>* %3, align 2
2224 %5 = sext <4 x i16> %4 to <4 x i32>
2225 %6 = mul nsw <4 x i32> %5, %2
2226 %7 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %6)
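; A full vector of eight i16 keeps the data 16-bit and folds the sign extension
; into vmlav.s16.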
2230 define i32 @mlav8i32i16(i16* %x, i16* %y) {
2231 ; CHECK-LABEL: mlav8i32i16:
2232 ; CHECK: @ %bb.0: @ %entry
2233 ; CHECK-NEXT: vldrh.u16 q0, [r0]
2234 ; CHECK-NEXT: vldrh.u16 q1, [r1]
2235 ; CHECK-NEXT: vmlav.s16 r0, q1, q0
2238 %0 = bitcast i16* %x to <8 x i16>*
2239 %1 = load <8 x i16>, <8 x i16>* %0, align 2
2240 %2 = sext <8 x i16> %1 to <8 x i32>
2241 %3 = bitcast i16* %y to <8 x i16>*
2242 %4 = load <8 x i16>, <8 x i16>* %3, align 2
2243 %5 = sext <8 x i16> %4 to <8 x i32>
2244 %6 = mul nsw <8 x i32> %5, %2
2245 %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
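; Sixteen i16 sign-extended to i32 no longer fit in one vector, so codegen falls
; back to four 4-element vldrh.s32 / vmlava.u32 chunks.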
2249 define i32 @mlav16i32i16(i16* %x, i16* %y) {
2250 ; CHECK-LABEL: mlav16i32i16:
2251 ; CHECK: @ %bb.0: @ %entry
2252 ; CHECK-NEXT: vldrh.s32 q0, [r0]
2253 ; CHECK-NEXT: vldrh.s32 q1, [r1]
2254 ; CHECK-NEXT: vmlav.u32 r2, q1, q0
2255 ; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
2256 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
2257 ; CHECK-NEXT: vmlava.u32 r2, q1, q0
2258 ; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
2259 ; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
2260 ; CHECK-NEXT: vmlava.u32 r2, q1, q0
2261 ; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
2262 ; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
2263 ; CHECK-NEXT: vmlava.u32 r2, q1, q0
2264 ; CHECK-NEXT: mov r0, r2
2267 %0 = bitcast i16* %x to <16 x i16>*
2268 %1 = load <16 x i16>, <16 x i16>* %0, align 2
2269 %2 = sext <16 x i16> %1 to <16 x i32>
2270 %3 = bitcast i16* %y to <16 x i16>*
2271 %4 = load <16 x i16>, <16 x i16>* %3, align 2
2272 %5 = sext <16 x i16> %4 to <16 x i32>
2273 %6 = mul nsw <16 x i32> %5, %2
2274 %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
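; 24 elements split as v8 + v16: the v8 part uses vmlav.s16 directly, the v16
; part goes through widening vldrh.s32 chunks.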
2278 define i32 @mlav24i32i16(i16* %x, i16* %y) {
2279 ; CHECK-LABEL: mlav24i32i16:
2280 ; CHECK: @ %bb.0: @ %entry
2281 ; CHECK-NEXT: vldrh.u16 q0, [r0]
2282 ; CHECK-NEXT: vldrh.u16 q1, [r1]
2283 ; CHECK-NEXT: mov r2, r0
2284 ; CHECK-NEXT: vmlav.s16 r0, q1, q0
2285 ; CHECK-NEXT: vldrh.s32 q0, [r2, #16]
2286 ; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
2287 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2288 ; CHECK-NEXT: vldrh.s32 q0, [r2, #24]
2289 ; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
2290 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2291 ; CHECK-NEXT: vldrh.s32 q0, [r2, #32]
2292 ; CHECK-NEXT: vldrh.s32 q1, [r1, #32]
2293 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2294 ; CHECK-NEXT: vldrh.s32 q0, [r2, #40]
2295 ; CHECK-NEXT: vldrh.s32 q1, [r1, #40]
2296 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2299 %0 = bitcast i16* %x to <8 x i16>*
2300 %1 = load <8 x i16>, <8 x i16>* %0, align 2
2301 %2 = sext <8 x i16> %1 to <8 x i32>
2302 %3 = bitcast i16* %y to <8 x i16>*
2303 %4 = load <8 x i16>, <8 x i16>* %3, align 2
2304 %5 = sext <8 x i16> %4 to <8 x i32>
2305 %6 = mul nsw <8 x i32> %5, %2
2306 %arrayidx.8 = getelementptr inbounds i16, i16* %x, i32 8
2307 %arrayidx1.8 = getelementptr inbounds i16, i16* %y, i32 8
2308 %7 = bitcast i16* %arrayidx.8 to <16 x i16>*
2309 %8 = load <16 x i16>, <16 x i16>* %7, align 2
2310 %9 = sext <16 x i16> %8 to <16 x i32>
2311 %10 = bitcast i16* %arrayidx1.8 to <16 x i16>*
2312 %11 = load <16 x i16>, <16 x i16>* %10, align 2
2313 %12 = sext <16 x i16> %11 to <16 x i32>
2314 %13 = mul nsw <16 x i32> %12, %9
2315 %14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %13)
2316 %15 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
2317 %op.rdx = add nsw i32 %14, %15
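; A single <32 x i16> sext-multiply-reduce is lowered as eight 4-element
; vldrh.s32 / vmlava.u32 steps.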
2321 define i32 @mlav32i32i16(i16* %x, i16* %y) {
2322 ; CHECK-LABEL: mlav32i32i16:
2323 ; CHECK: @ %bb.0: @ %entry
2324 ; CHECK-NEXT: vldrh.s32 q0, [r0]
2325 ; CHECK-NEXT: vldrh.s32 q1, [r1]
2326 ; CHECK-NEXT: mov r2, r0
2327 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
2328 ; CHECK-NEXT: vldrh.s32 q0, [r2, #8]
2329 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
2330 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2331 ; CHECK-NEXT: vldrh.s32 q0, [r2, #16]
2332 ; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
2333 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2334 ; CHECK-NEXT: vldrh.s32 q0, [r2, #24]
2335 ; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
2336 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2337 ; CHECK-NEXT: vldrh.s32 q0, [r2, #32]
2338 ; CHECK-NEXT: vldrh.s32 q1, [r1, #32]
2339 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2340 ; CHECK-NEXT: vldrh.s32 q0, [r2, #40]
2341 ; CHECK-NEXT: vldrh.s32 q1, [r1, #40]
2342 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2343 ; CHECK-NEXT: vldrh.s32 q0, [r2, #48]
2344 ; CHECK-NEXT: vldrh.s32 q1, [r1, #48]
2345 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2346 ; CHECK-NEXT: vldrh.s32 q0, [r2, #56]
2347 ; CHECK-NEXT: vldrh.s32 q1, [r1, #56]
2348 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2351 %0 = bitcast i16* %x to <32 x i16>*
2352 %1 = load <32 x i16>, <32 x i16>* %0, align 2
2353 %2 = sext <32 x i16> %1 to <32 x i32>
2354 %3 = bitcast i16* %y to <32 x i16>*
2355 %4 = load <32 x i16>, <32 x i16>* %3, align 2
2356 %5 = sext <32 x i16> %4 to <32 x i32>
2357 %6 = mul nsw <32 x i32> %5, %2
2358 %7 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %6)
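; 64 and 128 elements keep the unrolled v8i16-per-iteration shape from the
; vectorizer; each iteration becomes a vldrh.u16 pair feeding vmlava.s16.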
2362 define i32 @mlav64i32i16(i16* %x, i16* %y) {
2363 ; CHECK-LABEL: mlav64i32i16:
2364 ; CHECK: @ %bb.0: @ %entry
2365 ; CHECK-NEXT: vldrh.u16 q0, [r0]
2366 ; CHECK-NEXT: vldrh.u16 q1, [r1]
2367 ; CHECK-NEXT: mov r2, r0
2368 ; CHECK-NEXT: vmlav.s16 r0, q1, q0
2369 ; CHECK-NEXT: vldrh.u16 q0, [r2, #16]
2370 ; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
2371 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2372 ; CHECK-NEXT: vldrh.u16 q0, [r2, #32]
2373 ; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
2374 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2375 ; CHECK-NEXT: vldrh.u16 q0, [r2, #48]
2376 ; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
2377 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2378 ; CHECK-NEXT: vldrh.u16 q0, [r2, #64]
2379 ; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
2380 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2381 ; CHECK-NEXT: vldrh.u16 q0, [r2, #80]
2382 ; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
2383 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2384 ; CHECK-NEXT: vldrh.u16 q0, [r2, #96]
2385 ; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
2386 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2387 ; CHECK-NEXT: vldrh.u16 q0, [r2, #112]
2388 ; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
2389 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2392 %0 = bitcast i16* %x to <8 x i16>*
2393 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
2394 %1 = sext <8 x i16> %wide.load to <8 x i32>
2395 %2 = bitcast i16* %y to <8 x i16>*
2396 %wide.load11 = load <8 x i16>, <8 x i16>* %2, align 2
2397 %3 = sext <8 x i16> %wide.load11 to <8 x i32>
2398 %4 = mul nsw <8 x i32> %3, %1
2399 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
2400 %6 = getelementptr inbounds i16, i16* %x, i32 8
2401 %7 = bitcast i16* %6 to <8 x i16>*
2402 %wide.load.1 = load <8 x i16>, <8 x i16>* %7, align 2
2403 %8 = sext <8 x i16> %wide.load.1 to <8 x i32>
2404 %9 = getelementptr inbounds i16, i16* %y, i32 8
2405 %10 = bitcast i16* %9 to <8 x i16>*
2406 %wide.load11.1 = load <8 x i16>, <8 x i16>* %10, align 2
2407 %11 = sext <8 x i16> %wide.load11.1 to <8 x i32>
2408 %12 = mul nsw <8 x i32> %11, %8
2409 %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
2410 %14 = add i32 %13, %5
2411 %15 = getelementptr inbounds i16, i16* %x, i32 16
2412 %16 = bitcast i16* %15 to <8 x i16>*
2413 %wide.load.2 = load <8 x i16>, <8 x i16>* %16, align 2
2414 %17 = sext <8 x i16> %wide.load.2 to <8 x i32>
2415 %18 = getelementptr inbounds i16, i16* %y, i32 16
2416 %19 = bitcast i16* %18 to <8 x i16>*
2417 %wide.load11.2 = load <8 x i16>, <8 x i16>* %19, align 2
2418 %20 = sext <8 x i16> %wide.load11.2 to <8 x i32>
2419 %21 = mul nsw <8 x i32> %20, %17
2420 %22 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %21)
2421 %23 = add i32 %22, %14
2422 %24 = getelementptr inbounds i16, i16* %x, i32 24
2423 %25 = bitcast i16* %24 to <8 x i16>*
2424 %wide.load.3 = load <8 x i16>, <8 x i16>* %25, align 2
2425 %26 = sext <8 x i16> %wide.load.3 to <8 x i32>
2426 %27 = getelementptr inbounds i16, i16* %y, i32 24
2427 %28 = bitcast i16* %27 to <8 x i16>*
2428 %wide.load11.3 = load <8 x i16>, <8 x i16>* %28, align 2
2429 %29 = sext <8 x i16> %wide.load11.3 to <8 x i32>
2430 %30 = mul nsw <8 x i32> %29, %26
2431 %31 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %30)
2432 %32 = add i32 %31, %23
2433 %33 = getelementptr inbounds i16, i16* %x, i32 32
2434 %34 = bitcast i16* %33 to <8 x i16>*
2435 %wide.load.4 = load <8 x i16>, <8 x i16>* %34, align 2
2436 %35 = sext <8 x i16> %wide.load.4 to <8 x i32>
2437 %36 = getelementptr inbounds i16, i16* %y, i32 32
2438 %37 = bitcast i16* %36 to <8 x i16>*
2439 %wide.load11.4 = load <8 x i16>, <8 x i16>* %37, align 2
2440 %38 = sext <8 x i16> %wide.load11.4 to <8 x i32>
2441 %39 = mul nsw <8 x i32> %38, %35
2442 %40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39)
2443 %41 = add i32 %40, %32
2444 %42 = getelementptr inbounds i16, i16* %x, i32 40
2445 %43 = bitcast i16* %42 to <8 x i16>*
2446 %wide.load.5 = load <8 x i16>, <8 x i16>* %43, align 2
2447 %44 = sext <8 x i16> %wide.load.5 to <8 x i32>
2448 %45 = getelementptr inbounds i16, i16* %y, i32 40
2449 %46 = bitcast i16* %45 to <8 x i16>*
2450 %wide.load11.5 = load <8 x i16>, <8 x i16>* %46, align 2
2451 %47 = sext <8 x i16> %wide.load11.5 to <8 x i32>
2452 %48 = mul nsw <8 x i32> %47, %44
2453 %49 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %48)
2454 %50 = add i32 %49, %41
2455 %51 = getelementptr inbounds i16, i16* %x, i32 48
2456 %52 = bitcast i16* %51 to <8 x i16>*
2457 %wide.load.6 = load <8 x i16>, <8 x i16>* %52, align 2
2458 %53 = sext <8 x i16> %wide.load.6 to <8 x i32>
2459 %54 = getelementptr inbounds i16, i16* %y, i32 48
2460 %55 = bitcast i16* %54 to <8 x i16>*
2461 %wide.load11.6 = load <8 x i16>, <8 x i16>* %55, align 2
2462 %56 = sext <8 x i16> %wide.load11.6 to <8 x i32>
2463 %57 = mul nsw <8 x i32> %56, %53
2464 %58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57)
2465 %59 = add i32 %58, %50
2466 %60 = getelementptr inbounds i16, i16* %x, i32 56
2467 %61 = bitcast i16* %60 to <8 x i16>*
2468 %wide.load.7 = load <8 x i16>, <8 x i16>* %61, align 2
2469 %62 = sext <8 x i16> %wide.load.7 to <8 x i32>
2470 %63 = getelementptr inbounds i16, i16* %y, i32 56
2471 %64 = bitcast i16* %63 to <8 x i16>*
2472 %wide.load11.7 = load <8 x i16>, <8 x i16>* %64, align 2
2473 %65 = sext <8 x i16> %wide.load11.7 to <8 x i32>
2474 %66 = mul nsw <8 x i32> %65, %62
2475 %67 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %66)
2476 %68 = add i32 %67, %59
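; Same pattern at 128 elements: sixteen unrolled v8i16 iterations.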
2480 define i32 @mlav128i32i16(i16* %x, i16* %y) {
2481 ; CHECK-LABEL: mlav128i32i16:
2482 ; CHECK: @ %bb.0: @ %entry
2483 ; CHECK-NEXT: vldrh.u16 q0, [r0]
2484 ; CHECK-NEXT: vldrh.u16 q1, [r1]
2485 ; CHECK-NEXT: mov r2, r0
2486 ; CHECK-NEXT: vmlav.s16 r0, q1, q0
2487 ; CHECK-NEXT: vldrh.u16 q0, [r2, #16]
2488 ; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
2489 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2490 ; CHECK-NEXT: vldrh.u16 q0, [r2, #32]
2491 ; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
2492 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2493 ; CHECK-NEXT: vldrh.u16 q0, [r2, #48]
2494 ; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
2495 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2496 ; CHECK-NEXT: vldrh.u16 q0, [r2, #64]
2497 ; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
2498 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2499 ; CHECK-NEXT: vldrh.u16 q0, [r2, #80]
2500 ; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
2501 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2502 ; CHECK-NEXT: vldrh.u16 q0, [r2, #96]
2503 ; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
2504 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2505 ; CHECK-NEXT: vldrh.u16 q0, [r2, #112]
2506 ; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
2507 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2508 ; CHECK-NEXT: vldrh.u16 q0, [r2, #128]
2509 ; CHECK-NEXT: vldrh.u16 q1, [r1, #128]
2510 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2511 ; CHECK-NEXT: vldrh.u16 q0, [r2, #144]
2512 ; CHECK-NEXT: vldrh.u16 q1, [r1, #144]
2513 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2514 ; CHECK-NEXT: vldrh.u16 q0, [r2, #160]
2515 ; CHECK-NEXT: vldrh.u16 q1, [r1, #160]
2516 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2517 ; CHECK-NEXT: vldrh.u16 q0, [r2, #176]
2518 ; CHECK-NEXT: vldrh.u16 q1, [r1, #176]
2519 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2520 ; CHECK-NEXT: vldrh.u16 q0, [r2, #192]
2521 ; CHECK-NEXT: vldrh.u16 q1, [r1, #192]
2522 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2523 ; CHECK-NEXT: vldrh.u16 q0, [r2, #208]
2524 ; CHECK-NEXT: vldrh.u16 q1, [r1, #208]
2525 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2526 ; CHECK-NEXT: vldrh.u16 q0, [r2, #224]
2527 ; CHECK-NEXT: vldrh.u16 q1, [r1, #224]
2528 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2529 ; CHECK-NEXT: vldrh.u16 q0, [r2, #240]
2530 ; CHECK-NEXT: vldrh.u16 q1, [r1, #240]
2531 ; CHECK-NEXT: vmlava.s16 r0, q1, q0
2534 %0 = bitcast i16* %x to <8 x i16>*
2535 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
2536 %1 = sext <8 x i16> %wide.load to <8 x i32>
2537 %2 = bitcast i16* %y to <8 x i16>*
2538 %wide.load11 = load <8 x i16>, <8 x i16>* %2, align 2
2539 %3 = sext <8 x i16> %wide.load11 to <8 x i32>
2540 %4 = mul nsw <8 x i32> %3, %1
2541 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
2542 %6 = getelementptr inbounds i16, i16* %x, i32 8
2543 %7 = bitcast i16* %6 to <8 x i16>*
2544 %wide.load.1 = load <8 x i16>, <8 x i16>* %7, align 2
2545 %8 = sext <8 x i16> %wide.load.1 to <8 x i32>
2546 %9 = getelementptr inbounds i16, i16* %y, i32 8
2547 %10 = bitcast i16* %9 to <8 x i16>*
2548 %wide.load11.1 = load <8 x i16>, <8 x i16>* %10, align 2
2549 %11 = sext <8 x i16> %wide.load11.1 to <8 x i32>
2550 %12 = mul nsw <8 x i32> %11, %8
2551 %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
2552 %14 = add i32 %13, %5
2553 %15 = getelementptr inbounds i16, i16* %x, i32 16
2554 %16 = bitcast i16* %15 to <8 x i16>*
2555 %wide.load.2 = load <8 x i16>, <8 x i16>* %16, align 2
2556 %17 = sext <8 x i16> %wide.load.2 to <8 x i32>
2557 %18 = getelementptr inbounds i16, i16* %y, i32 16
2558 %19 = bitcast i16* %18 to <8 x i16>*
2559 %wide.load11.2 = load <8 x i16>, <8 x i16>* %19, align 2
2560 %20 = sext <8 x i16> %wide.load11.2 to <8 x i32>
2561 %21 = mul nsw <8 x i32> %20, %17
2562 %22 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %21)
2563 %23 = add i32 %22, %14
2564 %24 = getelementptr inbounds i16, i16* %x, i32 24
2565 %25 = bitcast i16* %24 to <8 x i16>*
2566 %wide.load.3 = load <8 x i16>, <8 x i16>* %25, align 2
2567 %26 = sext <8 x i16> %wide.load.3 to <8 x i32>
2568 %27 = getelementptr inbounds i16, i16* %y, i32 24
2569 %28 = bitcast i16* %27 to <8 x i16>*
2570 %wide.load11.3 = load <8 x i16>, <8 x i16>* %28, align 2
2571 %29 = sext <8 x i16> %wide.load11.3 to <8 x i32>
2572 %30 = mul nsw <8 x i32> %29, %26
2573 %31 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %30)
2574 %32 = add i32 %31, %23
2575 %33 = getelementptr inbounds i16, i16* %x, i32 32
2576 %34 = bitcast i16* %33 to <8 x i16>*
2577 %wide.load.4 = load <8 x i16>, <8 x i16>* %34, align 2
2578 %35 = sext <8 x i16> %wide.load.4 to <8 x i32>
2579 %36 = getelementptr inbounds i16, i16* %y, i32 32
2580 %37 = bitcast i16* %36 to <8 x i16>*
2581 %wide.load11.4 = load <8 x i16>, <8 x i16>* %37, align 2
2582 %38 = sext <8 x i16> %wide.load11.4 to <8 x i32>
2583 %39 = mul nsw <8 x i32> %38, %35
2584 %40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39)
2585 %41 = add i32 %40, %32
2586 %42 = getelementptr inbounds i16, i16* %x, i32 40
2587 %43 = bitcast i16* %42 to <8 x i16>*
2588 %wide.load.5 = load <8 x i16>, <8 x i16>* %43, align 2
2589 %44 = sext <8 x i16> %wide.load.5 to <8 x i32>
2590 %45 = getelementptr inbounds i16, i16* %y, i32 40
2591 %46 = bitcast i16* %45 to <8 x i16>*
2592 %wide.load11.5 = load <8 x i16>, <8 x i16>* %46, align 2
2593 %47 = sext <8 x i16> %wide.load11.5 to <8 x i32>
2594 %48 = mul nsw <8 x i32> %47, %44
2595 %49 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %48)
2596 %50 = add i32 %49, %41
2597 %51 = getelementptr inbounds i16, i16* %x, i32 48
2598 %52 = bitcast i16* %51 to <8 x i16>*
2599 %wide.load.6 = load <8 x i16>, <8 x i16>* %52, align 2
2600 %53 = sext <8 x i16> %wide.load.6 to <8 x i32>
2601 %54 = getelementptr inbounds i16, i16* %y, i32 48
2602 %55 = bitcast i16* %54 to <8 x i16>*
2603 %wide.load11.6 = load <8 x i16>, <8 x i16>* %55, align 2
2604 %56 = sext <8 x i16> %wide.load11.6 to <8 x i32>
2605 %57 = mul nsw <8 x i32> %56, %53
2606 %58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57)
2607 %59 = add i32 %58, %50
2608 %60 = getelementptr inbounds i16, i16* %x, i32 56
2609 %61 = bitcast i16* %60 to <8 x i16>*
2610 %wide.load.7 = load <8 x i16>, <8 x i16>* %61, align 2
2611 %62 = sext <8 x i16> %wide.load.7 to <8 x i32>
2612 %63 = getelementptr inbounds i16, i16* %y, i32 56
2613 %64 = bitcast i16* %63 to <8 x i16>*
2614 %wide.load11.7 = load <8 x i16>, <8 x i16>* %64, align 2
2615 %65 = sext <8 x i16> %wide.load11.7 to <8 x i32>
2616 %66 = mul nsw <8 x i32> %65, %62
2617 %67 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %66)
2618 %68 = add i32 %67, %59
2619 %69 = getelementptr inbounds i16, i16* %x, i32 64
2620 %70 = bitcast i16* %69 to <8 x i16>*
2621 %wide.load.8 = load <8 x i16>, <8 x i16>* %70, align 2
2622 %71 = sext <8 x i16> %wide.load.8 to <8 x i32>
2623 %72 = getelementptr inbounds i16, i16* %y, i32 64
2624 %73 = bitcast i16* %72 to <8 x i16>*
2625 %wide.load11.8 = load <8 x i16>, <8 x i16>* %73, align 2
2626 %74 = sext <8 x i16> %wide.load11.8 to <8 x i32>
2627 %75 = mul nsw <8 x i32> %74, %71
2628 %76 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %75)
2629 %77 = add i32 %76, %68
2630 %78 = getelementptr inbounds i16, i16* %x, i32 72
2631 %79 = bitcast i16* %78 to <8 x i16>*
2632 %wide.load.9 = load <8 x i16>, <8 x i16>* %79, align 2
2633 %80 = sext <8 x i16> %wide.load.9 to <8 x i32>
2634 %81 = getelementptr inbounds i16, i16* %y, i32 72
2635 %82 = bitcast i16* %81 to <8 x i16>*
2636 %wide.load11.9 = load <8 x i16>, <8 x i16>* %82, align 2
2637 %83 = sext <8 x i16> %wide.load11.9 to <8 x i32>
2638 %84 = mul nsw <8 x i32> %83, %80
2639 %85 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %84)
2640 %86 = add i32 %85, %77
2641 %87 = getelementptr inbounds i16, i16* %x, i32 80
2642 %88 = bitcast i16* %87 to <8 x i16>*
2643 %wide.load.10 = load <8 x i16>, <8 x i16>* %88, align 2
2644 %89 = sext <8 x i16> %wide.load.10 to <8 x i32>
2645 %90 = getelementptr inbounds i16, i16* %y, i32 80
2646 %91 = bitcast i16* %90 to <8 x i16>*
2647 %wide.load11.10 = load <8 x i16>, <8 x i16>* %91, align 2
2648 %92 = sext <8 x i16> %wide.load11.10 to <8 x i32>
2649 %93 = mul nsw <8 x i32> %92, %89
2650 %94 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %93)
2651 %95 = add i32 %94, %86
2652 %96 = getelementptr inbounds i16, i16* %x, i32 88
2653 %97 = bitcast i16* %96 to <8 x i16>*
2654 %wide.load.11 = load <8 x i16>, <8 x i16>* %97, align 2
2655 %98 = sext <8 x i16> %wide.load.11 to <8 x i32>
2656 %99 = getelementptr inbounds i16, i16* %y, i32 88
2657 %100 = bitcast i16* %99 to <8 x i16>*
2658 %wide.load11.11 = load <8 x i16>, <8 x i16>* %100, align 2
2659 %101 = sext <8 x i16> %wide.load11.11 to <8 x i32>
2660 %102 = mul nsw <8 x i32> %101, %98
2661 %103 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %102)
2662 %104 = add i32 %103, %95
2663 %105 = getelementptr inbounds i16, i16* %x, i32 96
2664 %106 = bitcast i16* %105 to <8 x i16>*
2665 %wide.load.12 = load <8 x i16>, <8 x i16>* %106, align 2
2666 %107 = sext <8 x i16> %wide.load.12 to <8 x i32>
2667 %108 = getelementptr inbounds i16, i16* %y, i32 96
2668 %109 = bitcast i16* %108 to <8 x i16>*
2669 %wide.load11.12 = load <8 x i16>, <8 x i16>* %109, align 2
2670 %110 = sext <8 x i16> %wide.load11.12 to <8 x i32>
2671 %111 = mul nsw <8 x i32> %110, %107
2672 %112 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %111)
2673 %113 = add i32 %112, %104
2674 %114 = getelementptr inbounds i16, i16* %x, i32 104
2675 %115 = bitcast i16* %114 to <8 x i16>*
2676 %wide.load.13 = load <8 x i16>, <8 x i16>* %115, align 2
2677 %116 = sext <8 x i16> %wide.load.13 to <8 x i32>
2678 %117 = getelementptr inbounds i16, i16* %y, i32 104
2679 %118 = bitcast i16* %117 to <8 x i16>*
2680 %wide.load11.13 = load <8 x i16>, <8 x i16>* %118, align 2
2681 %119 = sext <8 x i16> %wide.load11.13 to <8 x i32>
2682 %120 = mul nsw <8 x i32> %119, %116
2683 %121 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %120)
2684 %122 = add i32 %121, %113
2685 %123 = getelementptr inbounds i16, i16* %x, i32 112
2686 %124 = bitcast i16* %123 to <8 x i16>*
2687 %wide.load.14 = load <8 x i16>, <8 x i16>* %124, align 2
2688 %125 = sext <8 x i16> %wide.load.14 to <8 x i32>
2689 %126 = getelementptr inbounds i16, i16* %y, i32 112
2690 %127 = bitcast i16* %126 to <8 x i16>*
2691 %wide.load11.14 = load <8 x i16>, <8 x i16>* %127, align 2
2692 %128 = sext <8 x i16> %wide.load11.14 to <8 x i32>
2693 %129 = mul nsw <8 x i32> %128, %125
2694 %130 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %129)
2695 %131 = add i32 %130, %122
2696 %132 = getelementptr inbounds i16, i16* %x, i32 120
2697 %133 = bitcast i16* %132 to <8 x i16>*
2698 %wide.load.15 = load <8 x i16>, <8 x i16>* %133, align 2
2699 %134 = sext <8 x i16> %wide.load.15 to <8 x i32>
2700 %135 = getelementptr inbounds i16, i16* %y, i32 120
2701 %136 = bitcast i16* %135 to <8 x i16>*
2702 %wide.load11.15 = load <8 x i16>, <8 x i16>* %136, align 2
2703 %137 = sext <8 x i16> %wide.load11.15 to <8 x i32>
2704 %138 = mul nsw <8 x i32> %137, %134
2705 %139 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %138)
2706 %140 = add i32 %139, %131
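; i8 -> i32 zero-extending multiply-accumulate reductions. Two elements stay
; scalar: ldrb loads, a mul and an smlabb.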
2710 define i32 @mlav2i32i8(i8* %x, i8* %y) {
2711 ; CHECK-LABEL: mlav2i32i8:
2712 ; CHECK: @ %bb.0: @ %entry
2713 ; CHECK-NEXT: ldrb r2, [r0]
2714 ; CHECK-NEXT: ldrb r3, [r1]
2715 ; CHECK-NEXT: ldrb r0, [r0, #1]
2716 ; CHECK-NEXT: ldrb r1, [r1, #1]
2717 ; CHECK-NEXT: muls r0, r1, r0
2718 ; CHECK-NEXT: smlabb r0, r3, r2, r0
2721 %0 = load i8, i8* %x, align 1
2722 %conv = zext i8 %0 to i32
2723 %1 = load i8, i8* %y, align 1
2724 %conv2 = zext i8 %1 to i32
2725 %mul = mul nuw nsw i32 %conv2, %conv
2726 %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1
2727 %2 = load i8, i8* %arrayidx.1, align 1
2728 %conv.1 = zext i8 %2 to i32
2729 %arrayidx1.1 = getelementptr inbounds i8, i8* %y, i32 1
2730 %3 = load i8, i8* %arrayidx1.1, align 1
2731 %conv2.1 = zext i8 %3 to i32
2732 %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
2733 %add.1 = add nuw nsw i32 %mul.1, %mul
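; Four i8 values are widened on load with vldrb.u32 and reduced with vmlav.u32.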
2737 define i32 @mlav4i32i8(i8* %x, i8* %y) {
2738 ; CHECK-LABEL: mlav4i32i8:
2739 ; CHECK: @ %bb.0: @ %entry
2740 ; CHECK-NEXT: vldrb.u32 q0, [r0]
2741 ; CHECK-NEXT: vldrb.u32 q1, [r1]
2742 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
2745 %0 = bitcast i8* %x to <4 x i8>*
2746 %1 = load <4 x i8>, <4 x i8>* %0, align 1
2747 %2 = zext <4 x i8> %1 to <4 x i32>
2748 %3 = bitcast i8* %y to <4 x i8>*
2749 %4 = load <4 x i8>, <4 x i8>* %3, align 1
2750 %5 = zext <4 x i8> %4 to <4 x i32>
2751 %6 = mul nuw nsw <4 x i32> %5, %2
2752 %7 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %6)
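; Eight i8 widen to 16-bit lanes (vldrb.u16) and use vmlav.u16.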
2756 define i32 @mlav8i32i8(i8* %x, i8* %y) {
2757 ; CHECK-LABEL: mlav8i32i8:
2758 ; CHECK: @ %bb.0: @ %entry
2759 ; CHECK-NEXT: vldrb.u16 q0, [r0]
2760 ; CHECK-NEXT: vldrb.u16 q1, [r1]
2761 ; CHECK-NEXT: vmlav.u16 r0, q1, q0
2764 %0 = bitcast i8* %x to <8 x i8>*
2765 %1 = load <8 x i8>, <8 x i8>* %0, align 1
2766 %2 = zext <8 x i8> %1 to <8 x i32>
2767 %3 = bitcast i8* %y to <8 x i8>*
2768 %4 = load <8 x i8>, <8 x i8>* %3, align 1
2769 %5 = zext <8 x i8> %4 to <8 x i32>
2770 %6 = mul nuw nsw <8 x i32> %5, %2
2771 %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
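; Sixteen i8 fill a whole vector, so a single vldrb.u8 pair feeds vmlav.u8.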
2775 define i32 @mlav16i32i8(i8* %x, i8* %y) {
2776 ; CHECK-LABEL: mlav16i32i8:
2777 ; CHECK: @ %bb.0: @ %entry
2778 ; CHECK-NEXT: vldrb.u8 q0, [r0]
2779 ; CHECK-NEXT: vldrb.u8 q1, [r1]
2780 ; CHECK-NEXT: vmlav.u8 r0, q1, q0
2783 %0 = bitcast i8* %x to <16 x i8>*
2784 %1 = load <16 x i8>, <16 x i8>* %0, align 1
2785 %2 = zext <16 x i8> %1 to <16 x i32>
2786 %3 = bitcast i8* %y to <16 x i8>*
2787 %4 = load <16 x i8>, <16 x i8>* %3, align 1
2788 %5 = zext <16 x i8> %4 to <16 x i32>
2789 %6 = mul nuw nsw <16 x i32> %5, %2
2790 %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
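; 24 elements split as v8 + v16, reduced with vmlav.u16 and then an accumulating
; vmlava.u8.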
2794 define i32 @mlav24i32i8(i8* %x, i8* %y) {
2795 ; CHECK-LABEL: mlav24i32i8:
2796 ; CHECK: @ %bb.0: @ %entry
2797 ; CHECK-NEXT: vldrb.u16 q0, [r0]
2798 ; CHECK-NEXT: vldrb.u16 q1, [r1]
2799 ; CHECK-NEXT: vmlav.u16 r2, q1, q0
2800 ; CHECK-NEXT: vldrb.u8 q0, [r0, #8]
2801 ; CHECK-NEXT: vldrb.u8 q1, [r1, #8]
2802 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
2803 ; CHECK-NEXT: mov r0, r2
2806 %0 = bitcast i8* %x to <8 x i8>*
2807 %1 = load <8 x i8>, <8 x i8>* %0, align 1
2808 %2 = zext <8 x i8> %1 to <8 x i32>
2809 %3 = bitcast i8* %y to <8 x i8>*
2810 %4 = load <8 x i8>, <8 x i8>* %3, align 1
2811 %5 = zext <8 x i8> %4 to <8 x i32>
2812 %6 = mul nuw nsw <8 x i32> %5, %2
2813 %arrayidx.8 = getelementptr inbounds i8, i8* %x, i32 8
2814 %arrayidx1.8 = getelementptr inbounds i8, i8* %y, i32 8
2815 %7 = bitcast i8* %arrayidx.8 to <16 x i8>*
2816 %8 = load <16 x i8>, <16 x i8>* %7, align 1
2817 %9 = zext <16 x i8> %8 to <16 x i32>
2818 %10 = bitcast i8* %arrayidx1.8 to <16 x i8>*
2819 %11 = load <16 x i8>, <16 x i8>* %10, align 1
2820 %12 = zext <16 x i8> %11 to <16 x i32>
2821 %13 = mul nuw nsw <16 x i32> %12, %9
2822 %14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %13)
2823 %15 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
2824 %op.rdx = add nuw nsw i32 %14, %15
2828 define i32 @mlav32i32i8(i8* %x, i8* %y) {
2829 ; CHECK-LABEL: mlav32i32i8:
2830 ; CHECK: @ %bb.0: @ %entry
2831 ; CHECK-NEXT: vldrb.u32 q0, [r0]
2832 ; CHECK-NEXT: vldrb.u32 q1, [r1]
2833 ; CHECK-NEXT: mov r2, r0
2834 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
2835 ; CHECK-NEXT: vldrb.u32 q0, [r2, #4]
2836 ; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
2837 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2838 ; CHECK-NEXT: vldrb.u32 q0, [r2, #8]
2839 ; CHECK-NEXT: vldrb.u32 q1, [r1, #8]
2840 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2841 ; CHECK-NEXT: vldrb.u32 q0, [r2, #12]
2842 ; CHECK-NEXT: vldrb.u32 q1, [r1, #12]
2843 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2844 ; CHECK-NEXT: vldrb.u32 q0, [r2, #16]
2845 ; CHECK-NEXT: vldrb.u32 q1, [r1, #16]
2846 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2847 ; CHECK-NEXT: vldrb.u32 q0, [r2, #20]
2848 ; CHECK-NEXT: vldrb.u32 q1, [r1, #20]
2849 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2850 ; CHECK-NEXT: vldrb.u32 q0, [r2, #24]
2851 ; CHECK-NEXT: vldrb.u32 q1, [r1, #24]
2852 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2853 ; CHECK-NEXT: vldrb.u32 q0, [r2, #28]
2854 ; CHECK-NEXT: vldrb.u32 q1, [r1, #28]
2855 ; CHECK-NEXT: vmlava.u32 r0, q1, q0
2858 %0 = bitcast i8* %x to <32 x i8>*
2859 %1 = load <32 x i8>, <32 x i8>* %0, align 1
2860 %2 = zext <32 x i8> %1 to <32 x i32>
2861 %3 = bitcast i8* %y to <32 x i8>*
2862 %4 = load <32 x i8>, <32 x i8>* %3, align 1
2863 %5 = zext <32 x i8> %4 to <32 x i32>
2864 %6 = mul nuw nsw <32 x i32> %5, %2
2865 %7 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %6)
2869 define i32 @mlav64i32i8(i8* %x, i8* %y) {
2870 ; CHECK-LABEL: mlav64i32i8:
2871 ; CHECK: @ %bb.0: @ %entry
2872 ; CHECK-NEXT: vldrb.u8 q0, [r0]
2873 ; CHECK-NEXT: vldrb.u8 q1, [r1]
2874 ; CHECK-NEXT: vmlav.u8 r2, q1, q0
2875 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
2876 ; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
2877 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
2878 ; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
2879 ; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
2880 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
2881 ; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
2882 ; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
2883 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
2884 ; CHECK-NEXT: mov r0, r2
2887 %0 = bitcast i8* %x to <16 x i8>*
2888 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
2889 %1 = zext <16 x i8> %wide.load to <16 x i32>
2890 %2 = bitcast i8* %y to <16 x i8>*
2891 %wide.load11 = load <16 x i8>, <16 x i8>* %2, align 1
2892 %3 = zext <16 x i8> %wide.load11 to <16 x i32>
2893 %4 = mul nuw nsw <16 x i32> %3, %1
2894 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
2895 %6 = getelementptr inbounds i8, i8* %x, i32 16
2896 %7 = bitcast i8* %6 to <16 x i8>*
2897 %wide.load.1 = load <16 x i8>, <16 x i8>* %7, align 1
2898 %8 = zext <16 x i8> %wide.load.1 to <16 x i32>
2899 %9 = getelementptr inbounds i8, i8* %y, i32 16
2900 %10 = bitcast i8* %9 to <16 x i8>*
2901 %wide.load11.1 = load <16 x i8>, <16 x i8>* %10, align 1
2902 %11 = zext <16 x i8> %wide.load11.1 to <16 x i32>
2903 %12 = mul nuw nsw <16 x i32> %11, %8
2904 %13 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12)
2905 %14 = add i32 %13, %5
2906 %15 = getelementptr inbounds i8, i8* %x, i32 32
2907 %16 = bitcast i8* %15 to <16 x i8>*
2908 %wide.load.2 = load <16 x i8>, <16 x i8>* %16, align 1
2909 %17 = zext <16 x i8> %wide.load.2 to <16 x i32>
2910 %18 = getelementptr inbounds i8, i8* %y, i32 32
2911 %19 = bitcast i8* %18 to <16 x i8>*
2912 %wide.load11.2 = load <16 x i8>, <16 x i8>* %19, align 1
2913 %20 = zext <16 x i8> %wide.load11.2 to <16 x i32>
2914 %21 = mul nuw nsw <16 x i32> %20, %17
2915 %22 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %21)
2916 %23 = add i32 %22, %14
2917 %24 = getelementptr inbounds i8, i8* %x, i32 48
2918 %25 = bitcast i8* %24 to <16 x i8>*
2919 %wide.load.3 = load <16 x i8>, <16 x i8>* %25, align 1
2920 %26 = zext <16 x i8> %wide.load.3 to <16 x i32>
2921 %27 = getelementptr inbounds i8, i8* %y, i32 48
2922 %28 = bitcast i8* %27 to <16 x i8>*
2923 %wide.load11.3 = load <16 x i8>, <16 x i8>* %28, align 1
2924 %29 = zext <16 x i8> %wide.load11.3 to <16 x i32>
2925 %30 = mul nuw nsw <16 x i32> %29, %26
2926 %31 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %30)
2927 %32 = add i32 %31, %23
2931 define i32 @mlav128i32i8(i8* %x, i8* %y) {
2932 ; CHECK-LABEL: mlav128i32i8:
2933 ; CHECK: @ %bb.0: @ %entry
2934 ; CHECK-NEXT: vldrb.u8 q0, [r0]
2935 ; CHECK-NEXT: vldrb.u8 q1, [r1]
2936 ; CHECK-NEXT: mov r2, r0
2937 ; CHECK-NEXT: vmlav.u8 r0, q1, q0
2938 ; CHECK-NEXT: vldrb.u8 q0, [r2, #16]
2939 ; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
2940 ; CHECK-NEXT: vmlava.u8 r0, q1, q0
2941 ; CHECK-NEXT: vldrb.u8 q0, [r2, #32]
2942 ; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
2943 ; CHECK-NEXT: vmlava.u8 r0, q1, q0
2944 ; CHECK-NEXT: vldrb.u8 q0, [r2, #48]
2945 ; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
2946 ; CHECK-NEXT: vmlava.u8 r0, q1, q0
2947 ; CHECK-NEXT: vldrb.u8 q0, [r2, #64]
2948 ; CHECK-NEXT: vldrb.u8 q1, [r1, #64]
2949 ; CHECK-NEXT: vmlava.u8 r0, q1, q0
2950 ; CHECK-NEXT: vldrb.u8 q0, [r2, #80]
2951 ; CHECK-NEXT: vldrb.u8 q1, [r1, #80]
2952 ; CHECK-NEXT: vmlava.u8 r0, q1, q0
2953 ; CHECK-NEXT: vldrb.u8 q0, [r2, #96]
2954 ; CHECK-NEXT: vldrb.u8 q1, [r1, #96]
2955 ; CHECK-NEXT: vmlava.u8 r0, q1, q0
2956 ; CHECK-NEXT: vldrb.u8 q0, [r2, #112]
2957 ; CHECK-NEXT: vldrb.u8 q1, [r1, #112]
2958 ; CHECK-NEXT: vmlava.u8 r0, q1, q0
2961 %0 = bitcast i8* %x to <16 x i8>*
2962 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
2963 %1 = zext <16 x i8> %wide.load to <16 x i32>
2964 %2 = bitcast i8* %y to <16 x i8>*
2965 %wide.load11 = load <16 x i8>, <16 x i8>* %2, align 1
2966 %3 = zext <16 x i8> %wide.load11 to <16 x i32>
2967 %4 = mul nuw nsw <16 x i32> %3, %1
2968 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
2969 %6 = getelementptr inbounds i8, i8* %x, i32 16
2970 %7 = bitcast i8* %6 to <16 x i8>*
2971 %wide.load.1 = load <16 x i8>, <16 x i8>* %7, align 1
2972 %8 = zext <16 x i8> %wide.load.1 to <16 x i32>
2973 %9 = getelementptr inbounds i8, i8* %y, i32 16
2974 %10 = bitcast i8* %9 to <16 x i8>*
2975 %wide.load11.1 = load <16 x i8>, <16 x i8>* %10, align 1
2976 %11 = zext <16 x i8> %wide.load11.1 to <16 x i32>
2977 %12 = mul nuw nsw <16 x i32> %11, %8
2978 %13 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12)
2979 %14 = add i32 %13, %5
2980 %15 = getelementptr inbounds i8, i8* %x, i32 32
2981 %16 = bitcast i8* %15 to <16 x i8>*
2982 %wide.load.2 = load <16 x i8>, <16 x i8>* %16, align 1
2983 %17 = zext <16 x i8> %wide.load.2 to <16 x i32>
2984 %18 = getelementptr inbounds i8, i8* %y, i32 32
2985 %19 = bitcast i8* %18 to <16 x i8>*
2986 %wide.load11.2 = load <16 x i8>, <16 x i8>* %19, align 1
2987 %20 = zext <16 x i8> %wide.load11.2 to <16 x i32>
2988 %21 = mul nuw nsw <16 x i32> %20, %17
2989 %22 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %21)
2990 %23 = add i32 %22, %14
2991 %24 = getelementptr inbounds i8, i8* %x, i32 48
2992 %25 = bitcast i8* %24 to <16 x i8>*
2993 %wide.load.3 = load <16 x i8>, <16 x i8>* %25, align 1
2994 %26 = zext <16 x i8> %wide.load.3 to <16 x i32>
2995 %27 = getelementptr inbounds i8, i8* %y, i32 48
2996 %28 = bitcast i8* %27 to <16 x i8>*
2997 %wide.load11.3 = load <16 x i8>, <16 x i8>* %28, align 1
2998 %29 = zext <16 x i8> %wide.load11.3 to <16 x i32>
2999 %30 = mul nuw nsw <16 x i32> %29, %26
3000 %31 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %30)
3001 %32 = add i32 %31, %23
3002 %33 = getelementptr inbounds i8, i8* %x, i32 64
3003 %34 = bitcast i8* %33 to <16 x i8>*
3004 %wide.load.4 = load <16 x i8>, <16 x i8>* %34, align 1
3005 %35 = zext <16 x i8> %wide.load.4 to <16 x i32>
3006 %36 = getelementptr inbounds i8, i8* %y, i32 64
3007 %37 = bitcast i8* %36 to <16 x i8>*
3008 %wide.load11.4 = load <16 x i8>, <16 x i8>* %37, align 1
3009 %38 = zext <16 x i8> %wide.load11.4 to <16 x i32>
3010 %39 = mul nuw nsw <16 x i32> %38, %35
3011 %40 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %39)
3012 %41 = add i32 %40, %32
3013 %42 = getelementptr inbounds i8, i8* %x, i32 80
3014 %43 = bitcast i8* %42 to <16 x i8>*
3015 %wide.load.5 = load <16 x i8>, <16 x i8>* %43, align 1
3016 %44 = zext <16 x i8> %wide.load.5 to <16 x i32>
3017 %45 = getelementptr inbounds i8, i8* %y, i32 80
3018 %46 = bitcast i8* %45 to <16 x i8>*
3019 %wide.load11.5 = load <16 x i8>, <16 x i8>* %46, align 1
3020 %47 = zext <16 x i8> %wide.load11.5 to <16 x i32>
3021 %48 = mul nuw nsw <16 x i32> %47, %44
3022 %49 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %48)
3023 %50 = add i32 %49, %41
3024 %51 = getelementptr inbounds i8, i8* %x, i32 96
3025 %52 = bitcast i8* %51 to <16 x i8>*
3026 %wide.load.6 = load <16 x i8>, <16 x i8>* %52, align 1
3027 %53 = zext <16 x i8> %wide.load.6 to <16 x i32>
3028 %54 = getelementptr inbounds i8, i8* %y, i32 96
3029 %55 = bitcast i8* %54 to <16 x i8>*
3030 %wide.load11.6 = load <16 x i8>, <16 x i8>* %55, align 1
3031 %56 = zext <16 x i8> %wide.load11.6 to <16 x i32>
3032 %57 = mul nuw nsw <16 x i32> %56, %53
3033 %58 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %57)
3034 %59 = add i32 %58, %50
3035 %60 = getelementptr inbounds i8, i8* %x, i32 112
3036 %61 = bitcast i8* %60 to <16 x i8>*
3037 %wide.load.7 = load <16 x i8>, <16 x i8>* %61, align 1
3038 %62 = zext <16 x i8> %wide.load.7 to <16 x i32>
3039 %63 = getelementptr inbounds i8, i8* %y, i32 112
3040 %64 = bitcast i8* %63 to <16 x i8>*
3041 %wide.load11.7 = load <16 x i8>, <16 x i8>* %64, align 1
3042 %65 = zext <16 x i8> %wide.load11.7 to <16 x i32>
3043 %66 = mul nuw nsw <16 x i32> %65, %62
3044 %67 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %66)
3045 %68 = add i32 %67, %59
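; i16 x i16 -> i16 multiply-accumulate reductions: reduced with vmlav/vmlava
; and truncated back to i16 with sxth; the v2 case stays scalar (muls + mla).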
3049 define signext i16 @mlav2i16i16(i16* %x, i16* %y) {
3050 ; CHECK-LABEL: mlav2i16i16:
3051 ; CHECK: @ %bb.0: @ %entry
3052 ; CHECK-NEXT: ldrh r2, [r0]
3053 ; CHECK-NEXT: ldrh r3, [r1]
3054 ; CHECK-NEXT: ldrh r0, [r0, #2]
3055 ; CHECK-NEXT: ldrh r1, [r1, #2]
3056 ; CHECK-NEXT: muls r2, r3, r2
3057 ; CHECK-NEXT: mla r0, r1, r0, r2
3058 ; CHECK-NEXT: sxth r0, r0
3061 %0 = load i16, i16* %x, align 2
3062 %1 = load i16, i16* %y, align 2
3063 %mul = mul i16 %1, %0
3064 %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1
3065 %2 = load i16, i16* %arrayidx.1, align 2
3066 %arrayidx1.1 = getelementptr inbounds i16, i16* %y, i32 1
3067 %3 = load i16, i16* %arrayidx1.1, align 2
3068 %mul.1 = mul i16 %3, %2
3069 %add.1 = add i16 %mul.1, %mul
3073 define signext i16 @mlav4i16i16(i16* %x, i16* %y) {
3074 ; CHECK-LABEL: mlav4i16i16:
3075 ; CHECK: @ %bb.0: @ %entry
3076 ; CHECK-NEXT: vldrh.u32 q0, [r0]
3077 ; CHECK-NEXT: vldrh.u32 q1, [r1]
3078 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
3079 ; CHECK-NEXT: sxth r0, r0
3082 %0 = bitcast i16* %x to <4 x i16>*
3083 %1 = load <4 x i16>, <4 x i16>* %0, align 2
3084 %2 = bitcast i16* %y to <4 x i16>*
3085 %3 = load <4 x i16>, <4 x i16>* %2, align 2
3086 %4 = mul <4 x i16> %3, %1
3087 %5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %4)
3091 define signext i16 @mlav8i16i16(i16* %x, i16* %y) {
3092 ; CHECK-LABEL: mlav8i16i16:
3093 ; CHECK: @ %bb.0: @ %entry
3094 ; CHECK-NEXT: vldrh.u16 q0, [r0]
3095 ; CHECK-NEXT: vldrh.u16 q1, [r1]
3096 ; CHECK-NEXT: vmlav.u16 r0, q1, q0
3097 ; CHECK-NEXT: sxth r0, r0
3100 %0 = bitcast i16* %x to <8 x i16>*
3101 %1 = load <8 x i16>, <8 x i16>* %0, align 2
3102 %2 = bitcast i16* %y to <8 x i16>*
3103 %3 = load <8 x i16>, <8 x i16>* %2, align 2
3104 %4 = mul <8 x i16> %3, %1
3105 %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4)
3109 define signext i16 @mlav16i16i16(i16* %x, i16* %y) {
3110 ; CHECK-LABEL: mlav16i16i16:
3111 ; CHECK: @ %bb.0: @ %entry
3112 ; CHECK-NEXT: vldrh.u16 q0, [r0]
3113 ; CHECK-NEXT: vldrh.u16 q1, [r1]
3114 ; CHECK-NEXT: vmlav.u16 r2, q1, q0
3115 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
3116 ; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
3117 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3118 ; CHECK-NEXT: sxth r0, r2
3121 %0 = bitcast i16* %x to <16 x i16>*
3122 %1 = load <16 x i16>, <16 x i16>* %0, align 2
3123 %2 = bitcast i16* %y to <16 x i16>*
3124 %3 = load <16 x i16>, <16 x i16>* %2, align 2
3125 %4 = mul <16 x i16> %3, %1
3126 %5 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %4)
3130 define signext i16 @mlav24i16i16(i16* %x, i16* %y) {
3131 ; CHECK-LABEL: mlav24i16i16:
3132 ; CHECK: @ %bb.0: @ %entry
3133 ; CHECK-NEXT: vldrh.u16 q0, [r0]
3134 ; CHECK-NEXT: vldrh.u16 q1, [r1]
3135 ; CHECK-NEXT: vmlav.u16 r2, q1, q0
3136 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
3137 ; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
3138 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3139 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
3140 ; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
3141 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3142 ; CHECK-NEXT: sxth r0, r2
3145 %0 = bitcast i16* %x to <8 x i16>*
3146 %1 = load <8 x i16>, <8 x i16>* %0, align 2
3147 %2 = bitcast i16* %y to <8 x i16>*
3148 %3 = load <8 x i16>, <8 x i16>* %2, align 2
3149 %4 = mul <8 x i16> %3, %1
3150 %arrayidx.8 = getelementptr inbounds i16, i16* %x, i32 8
3151 %arrayidx1.8 = getelementptr inbounds i16, i16* %y, i32 8
3152 %5 = bitcast i16* %arrayidx.8 to <16 x i16>*
3153 %6 = load <16 x i16>, <16 x i16>* %5, align 2
3154 %7 = bitcast i16* %arrayidx1.8 to <16 x i16>*
3155 %8 = load <16 x i16>, <16 x i16>* %7, align 2
3156 %9 = mul <16 x i16> %8, %6
3157 %10 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %9)
3158 %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4)
3159 %op.rdx = add i16 %10, %11
3163 define signext i16 @mlav32i16i16(i16* %x, i16* %y) {
3164 ; CHECK-LABEL: mlav32i16i16:
3165 ; CHECK: @ %bb.0: @ %entry
3166 ; CHECK-NEXT: vldrh.u16 q0, [r0]
3167 ; CHECK-NEXT: vldrh.u16 q1, [r1]
3168 ; CHECK-NEXT: vmlav.u16 r2, q1, q0
3169 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
3170 ; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
3171 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3172 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
3173 ; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
3174 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3175 ; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
3176 ; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
3177 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3178 ; CHECK-NEXT: sxth r0, r2
3181 %0 = bitcast i16* %x to <32 x i16>*
3182 %1 = load <32 x i16>, <32 x i16>* %0, align 2
3183 %2 = bitcast i16* %y to <32 x i16>*
3184 %3 = load <32 x i16>, <32 x i16>* %2, align 2
3185 %4 = mul <32 x i16> %3, %1
3186 %5 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %4)
3190 define signext i16 @mlav64i16i16(i16* %x, i16* %y) {
3191 ; CHECK-LABEL: mlav64i16i16:
3192 ; CHECK: @ %bb.0: @ %entry
3193 ; CHECK-NEXT: vldrh.u16 q0, [r0]
3194 ; CHECK-NEXT: vldrh.u16 q1, [r1]
3195 ; CHECK-NEXT: vmlav.u16 r2, q1, q0
3196 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
3197 ; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
3198 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3199 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
3200 ; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
3201 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3202 ; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
3203 ; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
3204 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3205 ; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
3206 ; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
3207 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3208 ; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
3209 ; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
3210 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3211 ; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
3212 ; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
3213 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3214 ; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
3215 ; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
3216 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3217 ; CHECK-NEXT: sxth r0, r2
3220 %0 = bitcast i16* %x to <8 x i16>*
3221 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
3222 %1 = bitcast i16* %y to <8 x i16>*
3223 %wide.load13 = load <8 x i16>, <8 x i16>* %1, align 2
3224 %2 = mul <8 x i16> %wide.load13, %wide.load
3225 %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
3226 %4 = getelementptr inbounds i16, i16* %x, i32 8
3227 %5 = bitcast i16* %4 to <8 x i16>*
3228 %wide.load.1 = load <8 x i16>, <8 x i16>* %5, align 2
3229 %6 = getelementptr inbounds i16, i16* %y, i32 8
3230 %7 = bitcast i16* %6 to <8 x i16>*
3231 %wide.load13.1 = load <8 x i16>, <8 x i16>* %7, align 2
3232 %8 = mul <8 x i16> %wide.load13.1, %wide.load.1
3233 %9 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %8)
3234 %10 = add i16 %9, %3
3235 %11 = getelementptr inbounds i16, i16* %x, i32 16
3236 %12 = bitcast i16* %11 to <8 x i16>*
3237 %wide.load.2 = load <8 x i16>, <8 x i16>* %12, align 2
3238 %13 = getelementptr inbounds i16, i16* %y, i32 16
3239 %14 = bitcast i16* %13 to <8 x i16>*
3240 %wide.load13.2 = load <8 x i16>, <8 x i16>* %14, align 2
3241 %15 = mul <8 x i16> %wide.load13.2, %wide.load.2
3242 %16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %15)
3243 %17 = add i16 %16, %10
3244 %18 = getelementptr inbounds i16, i16* %x, i32 24
3245 %19 = bitcast i16* %18 to <8 x i16>*
3246 %wide.load.3 = load <8 x i16>, <8 x i16>* %19, align 2
3247 %20 = getelementptr inbounds i16, i16* %y, i32 24
3248 %21 = bitcast i16* %20 to <8 x i16>*
3249 %wide.load13.3 = load <8 x i16>, <8 x i16>* %21, align 2
3250 %22 = mul <8 x i16> %wide.load13.3, %wide.load.3
3251 %23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %22)
3252 %24 = add i16 %23, %17
3253 %25 = getelementptr inbounds i16, i16* %x, i32 32
3254 %26 = bitcast i16* %25 to <8 x i16>*
3255 %wide.load.4 = load <8 x i16>, <8 x i16>* %26, align 2
3256 %27 = getelementptr inbounds i16, i16* %y, i32 32
3257 %28 = bitcast i16* %27 to <8 x i16>*
3258 %wide.load13.4 = load <8 x i16>, <8 x i16>* %28, align 2
3259 %29 = mul <8 x i16> %wide.load13.4, %wide.load.4
3260 %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29)
3261 %31 = add i16 %30, %24
3262 %32 = getelementptr inbounds i16, i16* %x, i32 40
3263 %33 = bitcast i16* %32 to <8 x i16>*
3264 %wide.load.5 = load <8 x i16>, <8 x i16>* %33, align 2
3265 %34 = getelementptr inbounds i16, i16* %y, i32 40
3266 %35 = bitcast i16* %34 to <8 x i16>*
3267 %wide.load13.5 = load <8 x i16>, <8 x i16>* %35, align 2
3268 %36 = mul <8 x i16> %wide.load13.5, %wide.load.5
3269 %37 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %36)
3270 %38 = add i16 %37, %31
3271 %39 = getelementptr inbounds i16, i16* %x, i32 48
3272 %40 = bitcast i16* %39 to <8 x i16>*
3273 %wide.load.6 = load <8 x i16>, <8 x i16>* %40, align 2
3274 %41 = getelementptr inbounds i16, i16* %y, i32 48
3275 %42 = bitcast i16* %41 to <8 x i16>*
3276 %wide.load13.6 = load <8 x i16>, <8 x i16>* %42, align 2
3277 %43 = mul <8 x i16> %wide.load13.6, %wide.load.6
3278 %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %43)
3279 %45 = add i16 %44, %38
3280 %46 = getelementptr inbounds i16, i16* %x, i32 56
3281 %47 = bitcast i16* %46 to <8 x i16>*
3282 %wide.load.7 = load <8 x i16>, <8 x i16>* %47, align 2
3283 %48 = getelementptr inbounds i16, i16* %y, i32 56
3284 %49 = bitcast i16* %48 to <8 x i16>*
3285 %wide.load13.7 = load <8 x i16>, <8 x i16>* %49, align 2
3286 %50 = mul <8 x i16> %wide.load13.7, %wide.load.7
3287 %51 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %50)
3288 %52 = add i16 %51, %45
3292 define signext i16 @mlav128i16i16(i16* %x, i16* %y) {
3293 ; CHECK-LABEL: mlav128i16i16:
3294 ; CHECK: @ %bb.0: @ %entry
3295 ; CHECK-NEXT: vldrh.u16 q0, [r0]
3296 ; CHECK-NEXT: vldrh.u16 q1, [r1]
3297 ; CHECK-NEXT: vmlav.u16 r2, q1, q0
3298 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
3299 ; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
3300 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3301 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
3302 ; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
3303 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3304 ; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
3305 ; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
3306 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3307 ; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
3308 ; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
3309 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3310 ; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
3311 ; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
3312 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3313 ; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
3314 ; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
3315 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3316 ; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
3317 ; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
3318 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3319 ; CHECK-NEXT: vldrh.u16 q0, [r0, #128]
3320 ; CHECK-NEXT: vldrh.u16 q1, [r1, #128]
3321 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3322 ; CHECK-NEXT: vldrh.u16 q0, [r0, #144]
3323 ; CHECK-NEXT: vldrh.u16 q1, [r1, #144]
3324 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3325 ; CHECK-NEXT: vldrh.u16 q0, [r0, #160]
3326 ; CHECK-NEXT: vldrh.u16 q1, [r1, #160]
3327 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3328 ; CHECK-NEXT: vldrh.u16 q0, [r0, #176]
3329 ; CHECK-NEXT: vldrh.u16 q1, [r1, #176]
3330 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3331 ; CHECK-NEXT: vldrh.u16 q0, [r0, #192]
3332 ; CHECK-NEXT: vldrh.u16 q1, [r1, #192]
3333 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3334 ; CHECK-NEXT: vldrh.u16 q0, [r0, #208]
3335 ; CHECK-NEXT: vldrh.u16 q1, [r1, #208]
3336 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3337 ; CHECK-NEXT: vldrh.u16 q0, [r0, #224]
3338 ; CHECK-NEXT: vldrh.u16 q1, [r1, #224]
3339 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3340 ; CHECK-NEXT: vldrh.u16 q0, [r0, #240]
3341 ; CHECK-NEXT: vldrh.u16 q1, [r1, #240]
3342 ; CHECK-NEXT: vmlava.u16 r2, q1, q0
3343 ; CHECK-NEXT: sxth r0, r2
3346 %0 = bitcast i16* %x to <8 x i16>*
3347 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
3348 %1 = bitcast i16* %y to <8 x i16>*
3349 %wide.load13 = load <8 x i16>, <8 x i16>* %1, align 2
3350 %2 = mul <8 x i16> %wide.load13, %wide.load
3351 %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
3352 %4 = getelementptr inbounds i16, i16* %x, i32 8
3353 %5 = bitcast i16* %4 to <8 x i16>*
3354 %wide.load.1 = load <8 x i16>, <8 x i16>* %5, align 2
3355 %6 = getelementptr inbounds i16, i16* %y, i32 8
3356 %7 = bitcast i16* %6 to <8 x i16>*
3357 %wide.load13.1 = load <8 x i16>, <8 x i16>* %7, align 2
3358 %8 = mul <8 x i16> %wide.load13.1, %wide.load.1
3359 %9 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %8)
3360 %10 = add i16 %9, %3
3361 %11 = getelementptr inbounds i16, i16* %x, i32 16
3362 %12 = bitcast i16* %11 to <8 x i16>*
3363 %wide.load.2 = load <8 x i16>, <8 x i16>* %12, align 2
3364 %13 = getelementptr inbounds i16, i16* %y, i32 16
3365 %14 = bitcast i16* %13 to <8 x i16>*
3366 %wide.load13.2 = load <8 x i16>, <8 x i16>* %14, align 2
3367 %15 = mul <8 x i16> %wide.load13.2, %wide.load.2
3368 %16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %15)
3369 %17 = add i16 %16, %10
3370 %18 = getelementptr inbounds i16, i16* %x, i32 24
3371 %19 = bitcast i16* %18 to <8 x i16>*
3372 %wide.load.3 = load <8 x i16>, <8 x i16>* %19, align 2
3373 %20 = getelementptr inbounds i16, i16* %y, i32 24
3374 %21 = bitcast i16* %20 to <8 x i16>*
3375 %wide.load13.3 = load <8 x i16>, <8 x i16>* %21, align 2
3376 %22 = mul <8 x i16> %wide.load13.3, %wide.load.3
3377 %23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %22)
3378 %24 = add i16 %23, %17
3379 %25 = getelementptr inbounds i16, i16* %x, i32 32
3380 %26 = bitcast i16* %25 to <8 x i16>*
3381 %wide.load.4 = load <8 x i16>, <8 x i16>* %26, align 2
3382 %27 = getelementptr inbounds i16, i16* %y, i32 32
3383 %28 = bitcast i16* %27 to <8 x i16>*
3384 %wide.load13.4 = load <8 x i16>, <8 x i16>* %28, align 2
3385 %29 = mul <8 x i16> %wide.load13.4, %wide.load.4
3386 %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29)
3387 %31 = add i16 %30, %24
3388 %32 = getelementptr inbounds i16, i16* %x, i32 40
3389 %33 = bitcast i16* %32 to <8 x i16>*
3390 %wide.load.5 = load <8 x i16>, <8 x i16>* %33, align 2
3391 %34 = getelementptr inbounds i16, i16* %y, i32 40
3392 %35 = bitcast i16* %34 to <8 x i16>*
3393 %wide.load13.5 = load <8 x i16>, <8 x i16>* %35, align 2
3394 %36 = mul <8 x i16> %wide.load13.5, %wide.load.5
3395 %37 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %36)
3396 %38 = add i16 %37, %31
3397 %39 = getelementptr inbounds i16, i16* %x, i32 48
3398 %40 = bitcast i16* %39 to <8 x i16>*
3399 %wide.load.6 = load <8 x i16>, <8 x i16>* %40, align 2
3400 %41 = getelementptr inbounds i16, i16* %y, i32 48
3401 %42 = bitcast i16* %41 to <8 x i16>*
3402 %wide.load13.6 = load <8 x i16>, <8 x i16>* %42, align 2
3403 %43 = mul <8 x i16> %wide.load13.6, %wide.load.6
3404 %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %43)
3405 %45 = add i16 %44, %38
3406 %46 = getelementptr inbounds i16, i16* %x, i32 56
3407 %47 = bitcast i16* %46 to <8 x i16>*
3408 %wide.load.7 = load <8 x i16>, <8 x i16>* %47, align 2
3409 %48 = getelementptr inbounds i16, i16* %y, i32 56
3410 %49 = bitcast i16* %48 to <8 x i16>*
3411 %wide.load13.7 = load <8 x i16>, <8 x i16>* %49, align 2
3412 %50 = mul <8 x i16> %wide.load13.7, %wide.load.7
3413 %51 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %50)
3414 %52 = add i16 %51, %45
3415 %53 = getelementptr inbounds i16, i16* %x, i32 64
3416 %54 = bitcast i16* %53 to <8 x i16>*
3417 %wide.load.8 = load <8 x i16>, <8 x i16>* %54, align 2
3418 %55 = getelementptr inbounds i16, i16* %y, i32 64
3419 %56 = bitcast i16* %55 to <8 x i16>*
3420 %wide.load13.8 = load <8 x i16>, <8 x i16>* %56, align 2
3421 %57 = mul <8 x i16> %wide.load13.8, %wide.load.8
3422 %58 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %57)
3423 %59 = add i16 %58, %52
3424 %60 = getelementptr inbounds i16, i16* %x, i32 72
3425 %61 = bitcast i16* %60 to <8 x i16>*
3426 %wide.load.9 = load <8 x i16>, <8 x i16>* %61, align 2
3427 %62 = getelementptr inbounds i16, i16* %y, i32 72
3428 %63 = bitcast i16* %62 to <8 x i16>*
3429 %wide.load13.9 = load <8 x i16>, <8 x i16>* %63, align 2
3430 %64 = mul <8 x i16> %wide.load13.9, %wide.load.9
3431 %65 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %64)
3432 %66 = add i16 %65, %59
3433 %67 = getelementptr inbounds i16, i16* %x, i32 80
3434 %68 = bitcast i16* %67 to <8 x i16>*
3435 %wide.load.10 = load <8 x i16>, <8 x i16>* %68, align 2
3436 %69 = getelementptr inbounds i16, i16* %y, i32 80
3437 %70 = bitcast i16* %69 to <8 x i16>*
3438 %wide.load13.10 = load <8 x i16>, <8 x i16>* %70, align 2
3439 %71 = mul <8 x i16> %wide.load13.10, %wide.load.10
3440 %72 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %71)
3441 %73 = add i16 %72, %66
3442 %74 = getelementptr inbounds i16, i16* %x, i32 88
3443 %75 = bitcast i16* %74 to <8 x i16>*
3444 %wide.load.11 = load <8 x i16>, <8 x i16>* %75, align 2
3445 %76 = getelementptr inbounds i16, i16* %y, i32 88
3446 %77 = bitcast i16* %76 to <8 x i16>*
3447 %wide.load13.11 = load <8 x i16>, <8 x i16>* %77, align 2
3448 %78 = mul <8 x i16> %wide.load13.11, %wide.load.11
3449 %79 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %78)
3450 %80 = add i16 %79, %73
3451 %81 = getelementptr inbounds i16, i16* %x, i32 96
3452 %82 = bitcast i16* %81 to <8 x i16>*
3453 %wide.load.12 = load <8 x i16>, <8 x i16>* %82, align 2
3454 %83 = getelementptr inbounds i16, i16* %y, i32 96
3455 %84 = bitcast i16* %83 to <8 x i16>*
3456 %wide.load13.12 = load <8 x i16>, <8 x i16>* %84, align 2
3457 %85 = mul <8 x i16> %wide.load13.12, %wide.load.12
3458 %86 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %85)
3459 %87 = add i16 %86, %80
3460 %88 = getelementptr inbounds i16, i16* %x, i32 104
3461 %89 = bitcast i16* %88 to <8 x i16>*
3462 %wide.load.13 = load <8 x i16>, <8 x i16>* %89, align 2
3463 %90 = getelementptr inbounds i16, i16* %y, i32 104
3464 %91 = bitcast i16* %90 to <8 x i16>*
3465 %wide.load13.13 = load <8 x i16>, <8 x i16>* %91, align 2
3466 %92 = mul <8 x i16> %wide.load13.13, %wide.load.13
3467 %93 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %92)
3468 %94 = add i16 %93, %87
3469 %95 = getelementptr inbounds i16, i16* %x, i32 112
3470 %96 = bitcast i16* %95 to <8 x i16>*
3471 %wide.load.14 = load <8 x i16>, <8 x i16>* %96, align 2
3472 %97 = getelementptr inbounds i16, i16* %y, i32 112
3473 %98 = bitcast i16* %97 to <8 x i16>*
3474 %wide.load13.14 = load <8 x i16>, <8 x i16>* %98, align 2
3475 %99 = mul <8 x i16> %wide.load13.14, %wide.load.14
3476 %100 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %99)
3477 %101 = add i16 %100, %94
3478 %102 = getelementptr inbounds i16, i16* %x, i32 120
3479 %103 = bitcast i16* %102 to <8 x i16>*
3480 %wide.load.15 = load <8 x i16>, <8 x i16>* %103, align 2
3481 %104 = getelementptr inbounds i16, i16* %y, i32 120
3482 %105 = bitcast i16* %104 to <8 x i16>*
3483 %wide.load13.15 = load <8 x i16>, <8 x i16>* %105, align 2
3484 %106 = mul <8 x i16> %wide.load13.15, %wide.load.15
3485 %107 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %106)
3486 %108 = add i16 %107, %101
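; i8 x i8 -> i8 multiply-accumulate reductions: reduced with vmlav/vmlava
; and truncated back to i8 with uxtb; the v2 case stays scalar (muls + mla).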
3490 define zeroext i8 @mlav2i8i8(i8* %x, i8* %y) {
3491 ; CHECK-LABEL: mlav2i8i8:
3492 ; CHECK: @ %bb.0: @ %entry
3493 ; CHECK-NEXT: ldrb r2, [r0]
3494 ; CHECK-NEXT: ldrb r3, [r1]
3495 ; CHECK-NEXT: ldrb r0, [r0, #1]
3496 ; CHECK-NEXT: ldrb r1, [r1, #1]
3497 ; CHECK-NEXT: muls r2, r3, r2
3498 ; CHECK-NEXT: mla r0, r1, r0, r2
3499 ; CHECK-NEXT: uxtb r0, r0
3502 %0 = load i8, i8* %x, align 1
3503 %1 = load i8, i8* %y, align 1
3504 %mul = mul i8 %1, %0
3505 %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1
3506 %2 = load i8, i8* %arrayidx.1, align 1
3507 %arrayidx1.1 = getelementptr inbounds i8, i8* %y, i32 1
3508 %3 = load i8, i8* %arrayidx1.1, align 1
3509 %mul.1 = mul i8 %3, %2
3510 %add.1 = add i8 %mul.1, %mul
3514 define zeroext i8 @mlav4i8i8(i8* %x, i8* %y) {
3515 ; CHECK-LABEL: mlav4i8i8:
3516 ; CHECK: @ %bb.0: @ %entry
3517 ; CHECK-NEXT: vldrb.u32 q0, [r0]
3518 ; CHECK-NEXT: vldrb.u32 q1, [r1]
3519 ; CHECK-NEXT: vmlav.u32 r0, q1, q0
3520 ; CHECK-NEXT: uxtb r0, r0
3523 %0 = bitcast i8* %x to <4 x i8>*
3524 %1 = load <4 x i8>, <4 x i8>* %0, align 1
3525 %2 = bitcast i8* %y to <4 x i8>*
3526 %3 = load <4 x i8>, <4 x i8>* %2, align 1
3527 %4 = mul <4 x i8> %3, %1
3528 %5 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %4)
3532 define zeroext i8 @mlav8i8i8(i8* %x, i8* %y) {
3533 ; CHECK-LABEL: mlav8i8i8:
3534 ; CHECK: @ %bb.0: @ %entry
3535 ; CHECK-NEXT: vldrb.u16 q0, [r0]
3536 ; CHECK-NEXT: vldrb.u16 q1, [r1]
3537 ; CHECK-NEXT: vmlav.u16 r0, q1, q0
3538 ; CHECK-NEXT: uxtb r0, r0
3541 %0 = bitcast i8* %x to <8 x i8>*
3542 %1 = load <8 x i8>, <8 x i8>* %0, align 1
3543 %2 = bitcast i8* %y to <8 x i8>*
3544 %3 = load <8 x i8>, <8 x i8>* %2, align 1
3545 %4 = mul <8 x i8> %3, %1
3546 %5 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %4)
3550 define zeroext i8 @mlav16i8i8(i8* %x, i8* %y) {
3551 ; CHECK-LABEL: mlav16i8i8:
3552 ; CHECK: @ %bb.0: @ %entry
3553 ; CHECK-NEXT: vldrb.u8 q0, [r0]
3554 ; CHECK-NEXT: vldrb.u8 q1, [r1]
3555 ; CHECK-NEXT: vmlav.u8 r0, q1, q0
3556 ; CHECK-NEXT: uxtb r0, r0
3559 %0 = bitcast i8* %x to <16 x i8>*
3560 %1 = load <16 x i8>, <16 x i8>* %0, align 1
3561 %2 = bitcast i8* %y to <16 x i8>*
3562 %3 = load <16 x i8>, <16 x i8>* %2, align 1
3563 %4 = mul <16 x i8> %3, %1
3564 %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %4)
3568 define zeroext i8 @mlav24i8i8(i8* %x, i8* %y) {
3569 ; CHECK-LABEL: mlav24i8i8:
3570 ; CHECK: @ %bb.0: @ %entry
3571 ; CHECK-NEXT: vldrb.u16 q0, [r0]
3572 ; CHECK-NEXT: vldrb.u16 q1, [r1]
3573 ; CHECK-NEXT: vmlav.u16 r2, q1, q0
3574 ; CHECK-NEXT: vldrb.u8 q0, [r0, #8]
3575 ; CHECK-NEXT: vldrb.u8 q1, [r1, #8]
3576 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3577 ; CHECK-NEXT: uxtb r0, r2
3580 %0 = bitcast i8* %x to <8 x i8>*
3581 %1 = load <8 x i8>, <8 x i8>* %0, align 1
3582 %2 = bitcast i8* %y to <8 x i8>*
3583 %3 = load <8 x i8>, <8 x i8>* %2, align 1
3584 %4 = mul <8 x i8> %3, %1
3585 %arrayidx.8 = getelementptr inbounds i8, i8* %x, i32 8
3586 %arrayidx1.8 = getelementptr inbounds i8, i8* %y, i32 8
3587 %5 = bitcast i8* %arrayidx.8 to <16 x i8>*
3588 %6 = load <16 x i8>, <16 x i8>* %5, align 1
3589 %7 = bitcast i8* %arrayidx1.8 to <16 x i8>*
3590 %8 = load <16 x i8>, <16 x i8>* %7, align 1
3591 %9 = mul <16 x i8> %8, %6
3592 %10 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %9)
3593 %11 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %4)
3594 %op.rdx = add i8 %10, %11
3598 define zeroext i8 @mlav32i8i8(i8* %x, i8* %y) {
3599 ; CHECK-LABEL: mlav32i8i8:
3600 ; CHECK: @ %bb.0: @ %entry
3601 ; CHECK-NEXT: vldrb.u8 q0, [r0]
3602 ; CHECK-NEXT: vldrb.u8 q1, [r1]
3603 ; CHECK-NEXT: vmlav.u8 r2, q1, q0
3604 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
3605 ; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
3606 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3607 ; CHECK-NEXT: uxtb r0, r2
3610 %0 = bitcast i8* %x to <32 x i8>*
3611 %1 = load <32 x i8>, <32 x i8>* %0, align 1
3612 %2 = bitcast i8* %y to <32 x i8>*
3613 %3 = load <32 x i8>, <32 x i8>* %2, align 1
3614 %4 = mul <32 x i8> %3, %1
3615 %5 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %4)
3619 define zeroext i8 @mlav64i8i8(i8* %x, i8* %y) {
3620 ; CHECK-LABEL: mlav64i8i8:
3621 ; CHECK: @ %bb.0: @ %entry
3622 ; CHECK-NEXT: vldrb.u8 q0, [r0]
3623 ; CHECK-NEXT: vldrb.u8 q1, [r1]
3624 ; CHECK-NEXT: vmlav.u8 r2, q1, q0
3625 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
3626 ; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
3627 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3628 ; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
3629 ; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
3630 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3631 ; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
3632 ; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
3633 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3634 ; CHECK-NEXT: uxtb r0, r2
3637 %0 = bitcast i8* %x to <16 x i8>*
3638 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
3639 %1 = bitcast i8* %y to <16 x i8>*
3640 %wide.load12 = load <16 x i8>, <16 x i8>* %1, align 1
3641 %2 = mul <16 x i8> %wide.load12, %wide.load
3642 %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
3643 %4 = getelementptr inbounds i8, i8* %x, i32 16
3644 %5 = bitcast i8* %4 to <16 x i8>*
3645 %wide.load.1 = load <16 x i8>, <16 x i8>* %5, align 1
3646 %6 = getelementptr inbounds i8, i8* %y, i32 16
3647 %7 = bitcast i8* %6 to <16 x i8>*
3648 %wide.load12.1 = load <16 x i8>, <16 x i8>* %7, align 1
3649 %8 = mul <16 x i8> %wide.load12.1, %wide.load.1
3650 %9 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %8)
3651 %10 = add i8 %9, %3
3652 %11 = getelementptr inbounds i8, i8* %x, i32 32
3653 %12 = bitcast i8* %11 to <16 x i8>*
3654 %wide.load.2 = load <16 x i8>, <16 x i8>* %12, align 1
3655 %13 = getelementptr inbounds i8, i8* %y, i32 32
3656 %14 = bitcast i8* %13 to <16 x i8>*
3657 %wide.load12.2 = load <16 x i8>, <16 x i8>* %14, align 1
3658 %15 = mul <16 x i8> %wide.load12.2, %wide.load.2
3659 %16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %15)
3660 %17 = add i8 %16, %10
3661 %18 = getelementptr inbounds i8, i8* %x, i32 48
3662 %19 = bitcast i8* %18 to <16 x i8>*
3663 %wide.load.3 = load <16 x i8>, <16 x i8>* %19, align 1
3664 %20 = getelementptr inbounds i8, i8* %y, i32 48
3665 %21 = bitcast i8* %20 to <16 x i8>*
3666 %wide.load12.3 = load <16 x i8>, <16 x i8>* %21, align 1
3667 %22 = mul <16 x i8> %wide.load12.3, %wide.load.3
3668 %23 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %22)
3669 %24 = add i8 %23, %17
3673 define zeroext i8 @mlav128i8i8(i8* %x, i8* %y) {
3674 ; CHECK-LABEL: mlav128i8i8:
3675 ; CHECK: @ %bb.0: @ %entry
3676 ; CHECK-NEXT: vldrb.u8 q0, [r0]
3677 ; CHECK-NEXT: vldrb.u8 q1, [r1]
3678 ; CHECK-NEXT: vmlav.u8 r2, q1, q0
3679 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
3680 ; CHECK-NEXT: vldrb.u8 q1, [r1, #16]
3681 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3682 ; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
3683 ; CHECK-NEXT: vldrb.u8 q1, [r1, #32]
3684 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3685 ; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
3686 ; CHECK-NEXT: vldrb.u8 q1, [r1, #48]
3687 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3688 ; CHECK-NEXT: vldrb.u8 q0, [r0, #64]
3689 ; CHECK-NEXT: vldrb.u8 q1, [r1, #64]
3690 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3691 ; CHECK-NEXT: vldrb.u8 q0, [r0, #80]
3692 ; CHECK-NEXT: vldrb.u8 q1, [r1, #80]
3693 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3694 ; CHECK-NEXT: vldrb.u8 q0, [r0, #96]
3695 ; CHECK-NEXT: vldrb.u8 q1, [r1, #96]
3696 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3697 ; CHECK-NEXT: vldrb.u8 q0, [r0, #112]
3698 ; CHECK-NEXT: vldrb.u8 q1, [r1, #112]
3699 ; CHECK-NEXT: vmlava.u8 r2, q1, q0
3700 ; CHECK-NEXT: uxtb r0, r2
3703 %0 = bitcast i8* %x to <16 x i8>*
3704 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
3705 %1 = bitcast i8* %y to <16 x i8>*
3706 %wide.load12 = load <16 x i8>, <16 x i8>* %1, align 1
3707 %2 = mul <16 x i8> %wide.load12, %wide.load
3708 %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
3709 %4 = getelementptr inbounds i8, i8* %x, i32 16
3710 %5 = bitcast i8* %4 to <16 x i8>*
3711 %wide.load.1 = load <16 x i8>, <16 x i8>* %5, align 1
3712 %6 = getelementptr inbounds i8, i8* %y, i32 16
3713 %7 = bitcast i8* %6 to <16 x i8>*
3714 %wide.load12.1 = load <16 x i8>, <16 x i8>* %7, align 1
3715 %8 = mul <16 x i8> %wide.load12.1, %wide.load.1
3716 %9 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %8)
3717 %10 = add i8 %9, %3
3718 %11 = getelementptr inbounds i8, i8* %x, i32 32
3719 %12 = bitcast i8* %11 to <16 x i8>*
3720 %wide.load.2 = load <16 x i8>, <16 x i8>* %12, align 1
3721 %13 = getelementptr inbounds i8, i8* %y, i32 32
3722 %14 = bitcast i8* %13 to <16 x i8>*
3723 %wide.load12.2 = load <16 x i8>, <16 x i8>* %14, align 1
3724 %15 = mul <16 x i8> %wide.load12.2, %wide.load.2
3725 %16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %15)
3726 %17 = add i8 %16, %10
3727 %18 = getelementptr inbounds i8, i8* %x, i32 48
3728 %19 = bitcast i8* %18 to <16 x i8>*
3729 %wide.load.3 = load <16 x i8>, <16 x i8>* %19, align 1
3730 %20 = getelementptr inbounds i8, i8* %y, i32 48
3731 %21 = bitcast i8* %20 to <16 x i8>*
3732 %wide.load12.3 = load <16 x i8>, <16 x i8>* %21, align 1
3733 %22 = mul <16 x i8> %wide.load12.3, %wide.load.3
3734 %23 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %22)
3735 %24 = add i8 %23, %17
3736 %25 = getelementptr inbounds i8, i8* %x, i32 64
3737 %26 = bitcast i8* %25 to <16 x i8>*
3738 %wide.load.4 = load <16 x i8>, <16 x i8>* %26, align 1
3739 %27 = getelementptr inbounds i8, i8* %y, i32 64
3740 %28 = bitcast i8* %27 to <16 x i8>*
3741 %wide.load12.4 = load <16 x i8>, <16 x i8>* %28, align 1
3742 %29 = mul <16 x i8> %wide.load12.4, %wide.load.4
3743 %30 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %29)
3744 %31 = add i8 %30, %24
3745 %32 = getelementptr inbounds i8, i8* %x, i32 80
3746 %33 = bitcast i8* %32 to <16 x i8>*
3747 %wide.load.5 = load <16 x i8>, <16 x i8>* %33, align 1
3748 %34 = getelementptr inbounds i8, i8* %y, i32 80
3749 %35 = bitcast i8* %34 to <16 x i8>*
3750 %wide.load12.5 = load <16 x i8>, <16 x i8>* %35, align 1
3751 %36 = mul <16 x i8> %wide.load12.5, %wide.load.5
3752 %37 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %36)
3753 %38 = add i8 %37, %31
3754 %39 = getelementptr inbounds i8, i8* %x, i32 96
3755 %40 = bitcast i8* %39 to <16 x i8>*
3756 %wide.load.6 = load <16 x i8>, <16 x i8>* %40, align 1
3757 %41 = getelementptr inbounds i8, i8* %y, i32 96
3758 %42 = bitcast i8* %41 to <16 x i8>*
3759 %wide.load12.6 = load <16 x i8>, <16 x i8>* %42, align 1
3760 %43 = mul <16 x i8> %wide.load12.6, %wide.load.6
3761 %44 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %43)
3762 %45 = add i8 %44, %38
3763 %46 = getelementptr inbounds i8, i8* %x, i32 112
3764 %47 = bitcast i8* %46 to <16 x i8>*
3765 %wide.load.7 = load <16 x i8>, <16 x i8>* %47, align 1
3766 %48 = getelementptr inbounds i8, i8* %y, i32 112
3767 %49 = bitcast i8* %48 to <16 x i8>*
3768 %wide.load12.7 = load <16 x i8>, <16 x i8>* %49, align 1
3769 %50 = mul <16 x i8> %wide.load12.7, %wide.load.7
3770 %51 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %50)
3771 %52 = add i8 %51, %45
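; Reductions of two vectors plus constants: the two vaddv reductions are
; combined into vaddv + vaddva, and the constants fold into a single adds immediate.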
3776 define arm_aapcs_vfpcc i32 @add_two_const(<4 x i32> %x, <4 x i32> %y) {
3777 ; CHECK-LABEL: add_two_const:
3778 ; CHECK: @ %bb.0: @ %entry
3779 ; CHECK-NEXT: vaddv.u32 r0, q1
3780 ; CHECK-NEXT: vaddva.u32 r0, q0
3781 ; CHECK-NEXT: adds r0, #10
3784 %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
3785 %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
3791 define arm_aapcs_vfpcc i32 @add_two_const2(<4 x i32> %x, <4 x i32> %y) {
3792 ; CHECK-LABEL: add_two_const2:
3793 ; CHECK: @ %bb.0: @ %entry
3794 ; CHECK-NEXT: vaddv.u32 r0, q1
3795 ; CHECK-NEXT: vaddva.u32 r0, q0
3796 ; CHECK-NEXT: adds r0, #10
3799 %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
3800 %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
3806 define arm_aapcs_vfpcc i32 @add_two_const3(<4 x i32> %x, <4 x i32> %y) {
3807 ; CHECK-LABEL: add_two_const3:
3808 ; CHECK: @ %bb.0: @ %entry
3809 ; CHECK-NEXT: vaddv.u32 r0, q0
3810 ; CHECK-NEXT: vaddva.u32 r0, q1
3811 ; CHECK-NEXT: adds r0, #20
3814 %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
3815 %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
3822 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
3823 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
3824 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
3825 declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
3826 declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
3827 declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
3828 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
3829 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
3830 declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
3831 declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
3832 declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
3833 declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
3834 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
3835 declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
3836 declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)