1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+dotprod < %s | FileCheck %s
4 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
5 declare i32 @llvm.vector.reduce.add.v5i32(<5 x i32>)
6 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
7 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
8 declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
9 declare i32 @llvm.vector.reduce.add.v25i32(<25 x i32>)
10 declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
11 declare i32 @llvm.vector.reduce.add.v33i32(<33 x i32>)
12 declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)
13 declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
; Unsigned dot product of two <4 x i8> vectors loaded from %a and %b: both
; loads are zero-extended to <4 x i32>, multiplied element-wise, add-reduced,
; and %sum is added to the reduction (%op.extra).
; The expected assembly contains no udot here: the inputs are widened with
; ushll, multiplied with umull, and reduced with addv.
15 define i32 @test_udot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
16 ; CHECK-LABEL: test_udot_v4i8:
17 ; CHECK: // %bb.0: // %entry
18 ; CHECK-NEXT: ldr s0, [x0]
19 ; CHECK-NEXT: ldr s1, [x1]
20 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
21 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
22 ; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h
23 ; CHECK-NEXT: addv s0, v0.4s
24 ; CHECK-NEXT: fmov w8, s0
25 ; CHECK-NEXT: add w0, w8, w2
28 %0 = load <4 x i8>, ptr %a
29 %1 = zext <4 x i8> %0 to <4 x i32>
30 %2 = load <4 x i8>, ptr %b
31 %3 = zext <4 x i8> %2 to <4 x i32>
32 %4 = mul nuw nsw <4 x i32> %3, %1
33 %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
34 %op.extra = add i32 %5, %sum
; Add-reduction of a zero-extended <4 x i8> load with no multiply.
; Expected assembly: two ushll widenings (8b -> 8h -> 4s) followed by addv.
38 define i32 @test_udot_v4i8_nomla(ptr nocapture readonly %a1) {
39 ; CHECK-LABEL: test_udot_v4i8_nomla:
40 ; CHECK: // %bb.0: // %entry
41 ; CHECK-NEXT: ldr s0, [x0]
42 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
43 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
44 ; CHECK-NEXT: addv s0, v0.4s
45 ; CHECK-NEXT: fmov w0, s0
48 %0 = load <4 x i8>, ptr %a1
49 %1 = zext <4 x i8> %0 to <4 x i32>
50 %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
; Signed counterpart of the <4 x i8> dot product: both loads are sign-extended
; to <4 x i32>, multiplied (nsw), add-reduced, and %sum is added (%op.extra).
; The expected assembly contains no sdot here: it widens with sshll,
; multiplies with smull, and reduces with addv.
53 define i32 @test_sdot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
54 ; CHECK-LABEL: test_sdot_v4i8:
55 ; CHECK: // %bb.0: // %entry
56 ; CHECK-NEXT: ldr s0, [x0]
57 ; CHECK-NEXT: ldr s1, [x1]
58 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
59 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
60 ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
61 ; CHECK-NEXT: addv s0, v0.4s
62 ; CHECK-NEXT: fmov w8, s0
63 ; CHECK-NEXT: add w0, w8, w2
66 %0 = load <4 x i8>, ptr %a
67 %1 = sext <4 x i8> %0 to <4 x i32>
68 %2 = load <4 x i8>, ptr %b
69 %3 = sext <4 x i8> %2 to <4 x i32>
70 %4 = mul nsw <4 x i32> %3, %1
71 %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
72 %op.extra = add nsw i32 %5, %sum
; Sum of two signed <4 x i8> dot products, %a*%b and %c*%d, with the operands
; passed as vector arguments rather than loaded. Each product is add-reduced
; separately in the IR and the two scalars are added (%x).
; Expected assembly: every operand is widened with ushll and then sign-extended
; from 8 bits in each 32-bit lane by a shl #24 / sshr #24 pair; the two
; products are combined with mul + mla and reduced by a single addv.
; NOTE(review): both multiplies carry `nuw` although their operands are
; sign-extended; products involving negative lanes can overflow as unsigned
; values - confirm the flag is intended.
76 define i32 @test_sdot_v4i8_double(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
77 ; CHECK-LABEL: test_sdot_v4i8_double:
78 ; CHECK: // %bb.0: // %entry
79 ; CHECK-NEXT: ushll v3.4s, v3.4h, #0
80 ; CHECK-NEXT: ushll v2.4s, v2.4h, #0
81 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
82 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
83 ; CHECK-NEXT: shl v2.4s, v2.4s, #24
84 ; CHECK-NEXT: shl v3.4s, v3.4s, #24
85 ; CHECK-NEXT: shl v1.4s, v1.4s, #24
86 ; CHECK-NEXT: shl v0.4s, v0.4s, #24
87 ; CHECK-NEXT: sshr v2.4s, v2.4s, #24
88 ; CHECK-NEXT: sshr v3.4s, v3.4s, #24
89 ; CHECK-NEXT: sshr v1.4s, v1.4s, #24
90 ; CHECK-NEXT: sshr v0.4s, v0.4s, #24
91 ; CHECK-NEXT: mul v2.4s, v2.4s, v3.4s
92 ; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
93 ; CHECK-NEXT: addv s0, v2.4s
94 ; CHECK-NEXT: fmov w0, s0
97 %az = sext <4 x i8> %a to <4 x i32>
98 %bz = sext <4 x i8> %b to <4 x i32>
99 %m1 = mul nuw nsw <4 x i32> %az, %bz
100 %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m1)
101 %cz = sext <4 x i8> %c to <4 x i32>
102 %dz = sext <4 x i8> %d to <4 x i32>
103 %m2 = mul nuw nsw <4 x i32> %cz, %dz
104 %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m2)
105 %x = add i32 %r1, %r2
; Sum of two add-reductions of sign-extended <4 x i8> arguments with no
; multiply. Only %a and %c are used by the visible IR; %b and %d are unused.
; Expected assembly: ushll + shl #24 on both inputs, then sshr #24 on the first
; and ssra #24 (shift right and accumulate) for the second, so the vector add
; is folded into the sign extension before the single addv.
109 define i32 @test_sdot_v4i8_double_nomla(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
110 ; CHECK-LABEL: test_sdot_v4i8_double_nomla:
111 ; CHECK: // %bb.0: // %entry
112 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
113 ; CHECK-NEXT: ushll v1.4s, v2.4h, #0
114 ; CHECK-NEXT: shl v0.4s, v0.4s, #24
115 ; CHECK-NEXT: shl v1.4s, v1.4s, #24
116 ; CHECK-NEXT: sshr v0.4s, v0.4s, #24
117 ; CHECK-NEXT: ssra v0.4s, v1.4s, #24
118 ; CHECK-NEXT: addv s0, v0.4s
119 ; CHECK-NEXT: fmov w0, s0
122 %az = sext <4 x i8> %a to <4 x i32>
123 %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %az)
124 %cz = sext <4 x i8> %c to <4 x i32>
125 %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %cz)
126 %x = add i32 %r1, %r2
; Unsigned dot product over a non-power-of-two <5 x i8> vector type: loads from
; %a and %b are zero-extended, multiplied, add-reduced, and %sum is added.
; The expected assembly contains no udot: the inputs are loaded as 8 bytes
; (ldr d), widened with ushll, and multiplied in two halves. Only lane 0 of
; the high-half product (umull2) is moved into a zeroed accumulator, the low
; four lanes are accumulated with umlal, and the result is reduced by addv.
130 define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
131 ; CHECK-LABEL: test_udot_v5i8:
132 ; CHECK: // %bb.0: // %entry
133 ; CHECK-NEXT: ldr d0, [x0]
134 ; CHECK-NEXT: ldr d1, [x1]
135 ; CHECK-NEXT: movi v3.2d, #0000000000000000
136 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
137 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
138 ; CHECK-NEXT: umull2 v2.4s, v1.8h, v0.8h
139 ; CHECK-NEXT: mov v3.s[0], v2.s[0]
140 ; CHECK-NEXT: umlal v3.4s, v1.4h, v0.4h
141 ; CHECK-NEXT: addv s0, v3.4s
142 ; CHECK-NEXT: fmov w8, s0
143 ; CHECK-NEXT: add w0, w8, w2
146 %0 = load <5 x i8>, ptr %a
147 %1 = zext <5 x i8> %0 to <5 x i32>
148 %2 = load <5 x i8>, ptr %b
149 %3 = zext <5 x i8> %2 to <5 x i32>
150 %4 = mul nuw nsw <5 x i32> %3, %1
151 %5 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %4)
152 %op.extra = add i32 %5, %sum
; Add-reduction of a zero-extended <5 x i8> load with no multiply.
; Expected assembly: lane 0 of the widened high half (ushll2) is moved into a
; zeroed vector, the low four lanes are added with uaddw, then addv reduces.
156 define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
157 ; CHECK-LABEL: test_udot_v5i8_nomla:
158 ; CHECK: // %bb.0: // %entry
159 ; CHECK-NEXT: ldr d0, [x0]
160 ; CHECK-NEXT: movi v1.2d, #0000000000000000
161 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
162 ; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0
163 ; CHECK-NEXT: mov v1.s[0], v2.s[0]
164 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
165 ; CHECK-NEXT: addv s0, v0.4s
166 ; CHECK-NEXT: fmov w0, s0
169 %0 = load <5 x i8>, ptr %a1
170 %1 = zext <5 x i8> %0 to <5 x i32>
171 %2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %1)
; Signed dot product over <5 x i8>: loads from %a and %b are sign-extended,
; multiplied (nsw), add-reduced, and %sum is added (%op.extra).
; The expected assembly contains no sdot: it uses sshll widening, smull2 for
; the high half (only lane 0 is kept via a lane move into a zeroed
; accumulator), smlal for the low four lanes, and addv for the reduction.
174 define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
175 ; CHECK-LABEL: test_sdot_v5i8:
176 ; CHECK: // %bb.0: // %entry
177 ; CHECK-NEXT: ldr d0, [x0]
178 ; CHECK-NEXT: ldr d1, [x1]
179 ; CHECK-NEXT: movi v3.2d, #0000000000000000
180 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
181 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
182 ; CHECK-NEXT: smull2 v2.4s, v1.8h, v0.8h
183 ; CHECK-NEXT: mov v3.s[0], v2.s[0]
184 ; CHECK-NEXT: smlal v3.4s, v1.4h, v0.4h
185 ; CHECK-NEXT: addv s0, v3.4s
186 ; CHECK-NEXT: fmov w8, s0
187 ; CHECK-NEXT: add w0, w8, w2
190 %0 = load <5 x i8>, ptr %a
191 %1 = sext <5 x i8> %0 to <5 x i32>
192 %2 = load <5 x i8>, ptr %b
193 %3 = sext <5 x i8> %2 to <5 x i32>
194 %4 = mul nsw <5 x i32> %3, %1
195 %5 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %4)
196 %op.extra = add nsw i32 %5, %sum
; Sum of two signed <5 x i8> dot products, %a*%b and %c*%d, with operands
; passed as vector arguments. Each product is add-reduced separately in the IR
; and the two scalars are added (%x).
; Expected assembly: each product uses smull2 (lane 0 kept via a lane move into
; a zeroed accumulator) plus smlal for the low four lanes; the two accumulators
; are added as vectors and reduced by a single addv.
; NOTE(review): both multiplies carry `nuw` although their operands are
; sign-extended; products involving negative lanes can overflow as unsigned
; values - confirm the flag is intended.
200 define i32 @test_sdot_v5i8_double(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
201 ; CHECK-LABEL: test_sdot_v5i8_double:
202 ; CHECK: // %bb.0: // %entry
203 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
204 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
205 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0
206 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0
207 ; CHECK-NEXT: movi v5.2d, #0000000000000000
208 ; CHECK-NEXT: movi v6.2d, #0000000000000000
209 ; CHECK-NEXT: smull2 v4.4s, v0.8h, v1.8h
210 ; CHECK-NEXT: smull2 v7.4s, v2.8h, v3.8h
211 ; CHECK-NEXT: mov v6.s[0], v4.s[0]
212 ; CHECK-NEXT: mov v5.s[0], v7.s[0]
213 ; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h
214 ; CHECK-NEXT: smlal v5.4s, v2.4h, v3.4h
215 ; CHECK-NEXT: add v0.4s, v6.4s, v5.4s
216 ; CHECK-NEXT: addv s0, v0.4s
217 ; CHECK-NEXT: fmov w0, s0
220 %az = sext <5 x i8> %a to <5 x i32>
221 %bz = sext <5 x i8> %b to <5 x i32>
222 %m1 = mul nuw nsw <5 x i32> %az, %bz
223 %r1 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %m1)
224 %cz = sext <5 x i8> %c to <5 x i32>
225 %dz = sext <5 x i8> %d to <5 x i32>
226 %m2 = mul nuw nsw <5 x i32> %cz, %dz
227 %r2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %m2)
228 %x = add i32 %r1, %r2
; Sum of two add-reductions of sign-extended <5 x i8> arguments with no
; multiply. Only %a and %c are used by the visible IR; %b and %d are unused.
; Expected assembly: for each input, lane 0 of the widened high half (sshll2)
; is moved into a zeroed vector and the low four lanes are added with saddw;
; the two partial vectors are added and reduced by a single addv.
232 define i32 @test_sdot_v5i8_double_nomla(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
233 ; CHECK-LABEL: test_sdot_v5i8_double_nomla:
234 ; CHECK: // %bb.0: // %entry
235 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
236 ; CHECK-NEXT: sshll v1.8h, v2.8b, #0
237 ; CHECK-NEXT: movi v2.2d, #0000000000000000
238 ; CHECK-NEXT: movi v3.2d, #0000000000000000
239 ; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0
240 ; CHECK-NEXT: sshll2 v5.4s, v1.8h, #0
241 ; CHECK-NEXT: mov v3.s[0], v4.s[0]
242 ; CHECK-NEXT: mov v2.s[0], v5.s[0]
243 ; CHECK-NEXT: saddw v0.4s, v3.4s, v0.4h
244 ; CHECK-NEXT: saddw v1.4s, v2.4s, v1.4h
245 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
246 ; CHECK-NEXT: addv s0, v0.4s
247 ; CHECK-NEXT: fmov w0, s0
250 %az = sext <5 x i8> %a to <5 x i32>
251 %r1 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %az)
252 %cz = sext <5 x i8> %c to <5 x i32>
253 %r2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %cz)
254 %x = add i32 %r1, %r2
; Unsigned dot product of two <8 x i8> loads (zext, mul, add-reduce); this
; variant takes no %sum argument.
; Expected assembly: a single udot into a zeroed .2s accumulator, followed by
; addp to fold the two 32-bit lanes.
258 define i32 @test_udot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
259 ; CHECK-LABEL: test_udot_v8i8:
260 ; CHECK: // %bb.0: // %entry
261 ; CHECK-NEXT: movi v0.2d, #0000000000000000
262 ; CHECK-NEXT: ldr d1, [x0]
263 ; CHECK-NEXT: ldr d2, [x1]
264 ; CHECK-NEXT: udot v0.2s, v2.8b, v1.8b
265 ; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
266 ; CHECK-NEXT: fmov w0, s0
269 %0 = load <8 x i8>, ptr %a
270 %1 = zext <8 x i8> %0 to <8 x i32>
271 %2 = load <8 x i8>, ptr %b
272 %3 = zext <8 x i8> %2 to <8 x i32>
273 %4 = mul nuw nsw <8 x i32> %3, %1
274 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
; Add-reduction of a zero-extended <8 x i8> load with no multiply.
; Expected assembly: udot against a vector of all-ones bytes (movi #1) into a
; zeroed accumulator, then addp.
278 define i32 @test_udot_v8i8_nomla(ptr nocapture readonly %a1) {
279 ; CHECK-LABEL: test_udot_v8i8_nomla:
280 ; CHECK: // %bb.0: // %entry
281 ; CHECK-NEXT: movi v0.2d, #0000000000000000
282 ; CHECK-NEXT: movi v1.8b, #1
283 ; CHECK-NEXT: ldr d2, [x0]
284 ; CHECK-NEXT: udot v0.2s, v2.8b, v1.8b
285 ; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
286 ; CHECK-NEXT: fmov w0, s0
289 %0 = load <8 x i8>, ptr %a1
290 %1 = zext <8 x i8> %0 to <8 x i32>
291 %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
; Signed dot product of two <8 x i8> loads (sext, mul nsw, add-reduce); this
; variant takes no %sum argument.
; Expected assembly: a single sdot into a zeroed .2s accumulator, followed by
; addp to fold the two 32-bit lanes.
295 define i32 @test_sdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
296 ; CHECK-LABEL: test_sdot_v8i8:
297 ; CHECK: // %bb.0: // %entry
298 ; CHECK-NEXT: movi v0.2d, #0000000000000000
299 ; CHECK-NEXT: ldr d1, [x0]
300 ; CHECK-NEXT: ldr d2, [x1]
301 ; CHECK-NEXT: sdot v0.2s, v2.8b, v1.8b
302 ; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
303 ; CHECK-NEXT: fmov w0, s0
306 %0 = load <8 x i8>, ptr %a
307 %1 = sext <8 x i8> %0 to <8 x i32>
308 %2 = load <8 x i8>, ptr %b
309 %3 = sext <8 x i8> %2 to <8 x i32>
310 %4 = mul nsw <8 x i32> %3, %1
311 %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
; Add-reduction of a sign-extended <8 x i8> load with no multiply.
; Expected assembly: sdot against a vector of all-ones bytes (movi #1) into a
; zeroed accumulator, then addp.
315 define i32 @test_sdot_v8i8_nomla(ptr nocapture readonly %a1) {
316 ; CHECK-LABEL: test_sdot_v8i8_nomla:
317 ; CHECK: // %bb.0: // %entry
318 ; CHECK-NEXT: movi v0.2d, #0000000000000000
319 ; CHECK-NEXT: movi v1.8b, #1
320 ; CHECK-NEXT: ldr d2, [x0]
321 ; CHECK-NEXT: sdot v0.2s, v2.8b, v1.8b
322 ; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
323 ; CHECK-NEXT: fmov w0, s0
326 %0 = load <8 x i8>, ptr %a1
327 %1 = sext <8 x i8> %0 to <8 x i32>
328 %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
; Unsigned dot product of two <16 x i8> loads (zext, mul, add-reduce) with
; %sum added to the reduction (%op.extra).
; Expected assembly: a single udot into a zeroed .4s accumulator, addv to
; reduce, then a scalar add of w2 (%sum).
333 define i32 @test_udot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
334 ; CHECK-LABEL: test_udot_v16i8:
335 ; CHECK: // %bb.0: // %entry
336 ; CHECK-NEXT: movi v0.2d, #0000000000000000
337 ; CHECK-NEXT: ldr q1, [x0]
338 ; CHECK-NEXT: ldr q2, [x1]
339 ; CHECK-NEXT: udot v0.4s, v2.16b, v1.16b
340 ; CHECK-NEXT: addv s0, v0.4s
341 ; CHECK-NEXT: fmov w8, s0
342 ; CHECK-NEXT: add w0, w8, w2
345 %0 = load <16 x i8>, ptr %a
346 %1 = zext <16 x i8> %0 to <16 x i32>
347 %2 = load <16 x i8>, ptr %b
348 %3 = zext <16 x i8> %2 to <16 x i32>
349 %4 = mul nuw nsw <16 x i32> %3, %1
350 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
351 %op.extra = add i32 %5, %sum
; Add-reduction of a zero-extended <16 x i8> load with no multiply.
; Expected assembly: udot against a vector of all-ones bytes (movi #1) into a
; zeroed .4s accumulator, then addv.
355 define i32 @test_udot_v16i8_nomla(ptr nocapture readonly %a1) {
356 ; CHECK-LABEL: test_udot_v16i8_nomla:
357 ; CHECK: // %bb.0: // %entry
358 ; CHECK-NEXT: movi v0.16b, #1
359 ; CHECK-NEXT: movi v1.2d, #0000000000000000
360 ; CHECK-NEXT: ldr q2, [x0]
361 ; CHECK-NEXT: udot v1.4s, v2.16b, v0.16b
362 ; CHECK-NEXT: addv s0, v1.4s
363 ; CHECK-NEXT: fmov w0, s0
366 %0 = load <16 x i8>, ptr %a1
367 %1 = zext <16 x i8> %0 to <16 x i32>
368 %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
; Signed dot product of two <16 x i8> loads (sext, mul nsw, add-reduce) with
; %sum added to the reduction (%op.extra).
; Expected assembly: a single sdot into a zeroed .4s accumulator, addv to
; reduce, then a scalar add of w2 (%sum).
372 define i32 @test_sdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
373 ; CHECK-LABEL: test_sdot_v16i8:
374 ; CHECK: // %bb.0: // %entry
375 ; CHECK-NEXT: movi v0.2d, #0000000000000000
376 ; CHECK-NEXT: ldr q1, [x0]
377 ; CHECK-NEXT: ldr q2, [x1]
378 ; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b
379 ; CHECK-NEXT: addv s0, v0.4s
380 ; CHECK-NEXT: fmov w8, s0
381 ; CHECK-NEXT: add w0, w8, w2
384 %0 = load <16 x i8>, ptr %a
385 %1 = sext <16 x i8> %0 to <16 x i32>
386 %2 = load <16 x i8>, ptr %b
387 %3 = sext <16 x i8> %2 to <16 x i32>
388 %4 = mul nsw <16 x i32> %3, %1
389 %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
390 %op.extra = add nsw i32 %5, %sum
; Add-reduction of a sign-extended <16 x i8> load with no multiply.
; Expected assembly: sdot against a vector of all-ones bytes (movi #1) into a
; zeroed .4s accumulator, then addv.
394 define i32 @test_sdot_v16i8_nomla(ptr nocapture readonly %a1) {
395 ; CHECK-LABEL: test_sdot_v16i8_nomla:
396 ; CHECK: // %bb.0: // %entry
397 ; CHECK-NEXT: movi v0.16b, #1
398 ; CHECK-NEXT: movi v1.2d, #0000000000000000
399 ; CHECK-NEXT: ldr q2, [x0]
400 ; CHECK-NEXT: sdot v1.4s, v2.16b, v0.16b
401 ; CHECK-NEXT: addv s0, v1.4s
402 ; CHECK-NEXT: fmov w0, s0
405 %0 = load <16 x i8>, ptr %a1
406 %1 = sext <16 x i8> %0 to <16 x i32>
407 %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
; Sum of two unsigned <8 x i8> dot products, %a*%b and %c*%d, with operands
; passed as vector arguments. The IR reduces each product separately and adds
; the two scalars (%x).
; Expected assembly: both udot instructions accumulate into the same zeroed
; register (v4), so only one addp reduction is needed.
412 define i32 @test_udot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
413 ; CHECK-LABEL: test_udot_v8i8_double:
414 ; CHECK: // %bb.0: // %entry
415 ; CHECK-NEXT: movi v4.2d, #0000000000000000
416 ; CHECK-NEXT: udot v4.2s, v2.8b, v3.8b
417 ; CHECK-NEXT: udot v4.2s, v0.8b, v1.8b
418 ; CHECK-NEXT: addp v0.2s, v4.2s, v4.2s
419 ; CHECK-NEXT: fmov w0, s0
422 %az = zext <8 x i8> %a to <8 x i32>
423 %bz = zext <8 x i8> %b to <8 x i32>
424 %m1 = mul nuw nsw <8 x i32> %az, %bz
425 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
426 %cz = zext <8 x i8> %c to <8 x i32>
427 %dz = zext <8 x i8> %d to <8 x i32>
428 %m2 = mul nuw nsw <8 x i32> %cz, %dz
429 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
430 %x = add i32 %r1, %r2
; Sum of two add-reductions of zero-extended <8 x i8> arguments with no
; multiply. Only %a and %c are used by the visible IR; %b and %d are unused.
; Expected assembly: two udot instructions against one shared all-ones byte
; vector (v3) accumulate into the same register, followed by a single addp.
434 define i32 @test_udot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
435 ; CHECK-LABEL: test_udot_v8i8_double_nomla:
436 ; CHECK: // %bb.0: // %entry
437 ; CHECK-NEXT: movi v1.2d, #0000000000000000
438 ; CHECK-NEXT: movi v3.8b, #1
439 ; CHECK-NEXT: udot v1.2s, v2.8b, v3.8b
440 ; CHECK-NEXT: udot v1.2s, v0.8b, v3.8b
441 ; CHECK-NEXT: addp v0.2s, v1.2s, v1.2s
442 ; CHECK-NEXT: fmov w0, s0
445 %az = zext <8 x i8> %a to <8 x i32>
446 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %az)
447 %cz = zext <8 x i8> %c to <8 x i32>
448 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %cz)
449 %x = add i32 %r1, %r2
; Sum of two unsigned <16 x i8> dot products, %a*%b and %c*%d, with operands
; passed as vector arguments. The IR reduces each product separately and adds
; the two scalars (%x).
; Expected assembly: both udot instructions accumulate into the same zeroed
; .4s register (v4), followed by a single addv.
453 define i32 @test_udot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
454 ; CHECK-LABEL: test_udot_v16i8_double:
455 ; CHECK: // %bb.0: // %entry
456 ; CHECK-NEXT: movi v4.2d, #0000000000000000
457 ; CHECK-NEXT: udot v4.4s, v2.16b, v3.16b
458 ; CHECK-NEXT: udot v4.4s, v0.16b, v1.16b
459 ; CHECK-NEXT: addv s0, v4.4s
460 ; CHECK-NEXT: fmov w0, s0
463 %az = zext <16 x i8> %a to <16 x i32>
464 %bz = zext <16 x i8> %b to <16 x i32>
465 %m1 = mul nuw nsw <16 x i32> %az, %bz
466 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
467 %cz = zext <16 x i8> %c to <16 x i32>
468 %dz = zext <16 x i8> %d to <16 x i32>
469 %m2 = mul nuw nsw <16 x i32> %cz, %dz
470 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
471 %x = add i32 %r1, %r2
; Sum of two add-reductions of zero-extended <16 x i8> arguments with no
; multiply. Only %a and %c are used by the visible IR; %b and %d are unused.
; Expected assembly: two udot instructions against one shared all-ones byte
; vector (v1) accumulate into the same register, followed by a single addv.
475 define i32 @test_udot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
476 ; CHECK-LABEL: test_udot_v16i8_double_nomla:
477 ; CHECK: // %bb.0: // %entry
478 ; CHECK-NEXT: movi v1.16b, #1
479 ; CHECK-NEXT: movi v3.2d, #0000000000000000
480 ; CHECK-NEXT: udot v3.4s, v2.16b, v1.16b
481 ; CHECK-NEXT: udot v3.4s, v0.16b, v1.16b
482 ; CHECK-NEXT: addv s0, v3.4s
483 ; CHECK-NEXT: fmov w0, s0
486 %az = zext <16 x i8> %a to <16 x i32>
487 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az)
488 %cz = zext <16 x i8> %c to <16 x i32>
489 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz)
490 %x = add i32 %r1, %r2
; Sum of two signed <8 x i8> dot products, %a*%b and %c*%d, with operands
; passed as vector arguments. The IR reduces each product separately and adds
; the two scalars (%x).
; Expected assembly: both sdot instructions accumulate into the same zeroed
; register (v4), followed by a single addp.
; NOTE(review): both multiplies carry `nuw` although their operands are
; sign-extended; products involving negative lanes can overflow as unsigned
; values - confirm the flag is intended.
494 define i32 @test_sdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
495 ; CHECK-LABEL: test_sdot_v8i8_double:
496 ; CHECK: // %bb.0: // %entry
497 ; CHECK-NEXT: movi v4.2d, #0000000000000000
498 ; CHECK-NEXT: sdot v4.2s, v2.8b, v3.8b
499 ; CHECK-NEXT: sdot v4.2s, v0.8b, v1.8b
500 ; CHECK-NEXT: addp v0.2s, v4.2s, v4.2s
501 ; CHECK-NEXT: fmov w0, s0
504 %az = sext <8 x i8> %a to <8 x i32>
505 %bz = sext <8 x i8> %b to <8 x i32>
506 %m1 = mul nuw nsw <8 x i32> %az, %bz
507 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
508 %cz = sext <8 x i8> %c to <8 x i32>
509 %dz = sext <8 x i8> %d to <8 x i32>
510 %m2 = mul nuw nsw <8 x i32> %cz, %dz
511 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
512 %x = add i32 %r1, %r2
; Sum of two add-reductions of sign-extended <8 x i8> arguments with no
; multiply. Only %a and %c are used by the visible IR; %b and %d are unused.
; Expected assembly: two sdot instructions against one shared all-ones byte
; vector (v3) accumulate into the same register, followed by a single addp.
516 define i32 @test_sdot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
517 ; CHECK-LABEL: test_sdot_v8i8_double_nomla:
518 ; CHECK: // %bb.0: // %entry
519 ; CHECK-NEXT: movi v1.2d, #0000000000000000
520 ; CHECK-NEXT: movi v3.8b, #1
521 ; CHECK-NEXT: sdot v1.2s, v2.8b, v3.8b
522 ; CHECK-NEXT: sdot v1.2s, v0.8b, v3.8b
523 ; CHECK-NEXT: addp v0.2s, v1.2s, v1.2s
524 ; CHECK-NEXT: fmov w0, s0
527 %az = sext <8 x i8> %a to <8 x i32>
528 %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %az)
529 %cz = sext <8 x i8> %c to <8 x i32>
530 %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %cz)
531 %x = add i32 %r1, %r2
; Sum of two signed <16 x i8> dot products, %a*%b and %c*%d, with operands
; passed as vector arguments. The IR reduces each product separately and adds
; the two scalars (%x).
; Expected assembly: both sdot instructions accumulate into the same zeroed
; .4s register (v4), followed by a single addv.
; NOTE(review): both multiplies carry `nuw` although their operands are
; sign-extended; products involving negative lanes can overflow as unsigned
; values - confirm the flag is intended.
535 define i32 @test_sdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
536 ; CHECK-LABEL: test_sdot_v16i8_double:
537 ; CHECK: // %bb.0: // %entry
538 ; CHECK-NEXT: movi v4.2d, #0000000000000000
539 ; CHECK-NEXT: sdot v4.4s, v2.16b, v3.16b
540 ; CHECK-NEXT: sdot v4.4s, v0.16b, v1.16b
541 ; CHECK-NEXT: addv s0, v4.4s
542 ; CHECK-NEXT: fmov w0, s0
545 %az = sext <16 x i8> %a to <16 x i32>
546 %bz = sext <16 x i8> %b to <16 x i32>
547 %m1 = mul nuw nsw <16 x i32> %az, %bz
548 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
549 %cz = sext <16 x i8> %c to <16 x i32>
550 %dz = sext <16 x i8> %d to <16 x i32>
551 %m2 = mul nuw nsw <16 x i32> %cz, %dz
552 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
553 %x = add i32 %r1, %r2
; Sum of two add-reductions of sign-extended <16 x i8> arguments with no
; multiply. Only %a and %c are used by the visible IR; %b and %d are unused.
; Expected assembly: two sdot instructions against one shared all-ones byte
; vector (v1) accumulate into the same register, followed by a single addv.
557 define i32 @test_sdot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
558 ; CHECK-LABEL: test_sdot_v16i8_double_nomla:
559 ; CHECK: // %bb.0: // %entry
560 ; CHECK-NEXT: movi v1.16b, #1
561 ; CHECK-NEXT: movi v3.2d, #0000000000000000
562 ; CHECK-NEXT: sdot v3.4s, v2.16b, v1.16b
563 ; CHECK-NEXT: sdot v3.4s, v0.16b, v1.16b
564 ; CHECK-NEXT: addv s0, v3.4s
565 ; CHECK-NEXT: fmov w0, s0
568 %az = sext <16 x i8> %a to <16 x i32>
569 %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az)
570 %cz = sext <16 x i8> %c to <16 x i32>
571 %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz)
572 %x = add i32 %r1, %r2
; Unsigned dot product of two <24 x i8> loads (zext, mul, add-reduce) with
; %sum added to the reduction (%op.extra).
; Expected assembly: the 24 bytes are handled as a 16-byte part (ldr q at
; offset 0, udot .4s) and an 8-byte part (ldr d at offset 16, udot .2s). Each
; part has its own zeroed accumulator and reduction (addv / addp); the two
; scalars are added and then w2 (%sum) is added.
576 define i32 @test_udot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
577 ; CHECK-LABEL: test_udot_v24i8:
578 ; CHECK: // %bb.0: // %entry
579 ; CHECK-NEXT: movi v0.2d, #0000000000000000
580 ; CHECK-NEXT: movi v1.2d, #0000000000000000
581 ; CHECK-NEXT: ldr q2, [x0]
582 ; CHECK-NEXT: ldr q3, [x1]
583 ; CHECK-NEXT: ldr d4, [x0, #16]
584 ; CHECK-NEXT: ldr d5, [x1, #16]
585 ; CHECK-NEXT: udot v1.2s, v5.8b, v4.8b
586 ; CHECK-NEXT: udot v0.4s, v3.16b, v2.16b
587 ; CHECK-NEXT: addp v1.2s, v1.2s, v1.2s
588 ; CHECK-NEXT: addv s0, v0.4s
589 ; CHECK-NEXT: fmov w8, s1
590 ; CHECK-NEXT: fmov w9, s0
591 ; CHECK-NEXT: add w8, w9, w8
592 ; CHECK-NEXT: add w0, w8, w2
595 %0 = load <24 x i8>, ptr %a
596 %1 = zext <24 x i8> %0 to <24 x i32>
597 %2 = load <24 x i8>, ptr %b
598 %3 = zext <24 x i8> %2 to <24 x i32>
599 %4 = mul nuw nsw <24 x i32> %3, %1
600 %5 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %4)
601 %op.extra = add i32 %5, %sum
; Add-reduction of a zero-extended <24 x i8> load with no multiply.
; Expected assembly: the load is split into a 16-byte part and an 8-byte part
; (offset 16); each part is fed to udot against an all-ones byte vector of
; matching width (movi .16b / .8b, #1), reduced separately (addv / addp), and
; the two scalars are added.
605 define i32 @test_udot_v24i8_nomla(ptr nocapture readonly %a1) {
606 ; CHECK-LABEL: test_udot_v24i8_nomla:
607 ; CHECK: // %bb.0: // %entry
608 ; CHECK-NEXT: movi v0.2d, #0000000000000000
609 ; CHECK-NEXT: movi v1.8b, #1
610 ; CHECK-NEXT: ldr q4, [x0]
611 ; CHECK-NEXT: movi v2.2d, #0000000000000000
612 ; CHECK-NEXT: movi v3.16b, #1
613 ; CHECK-NEXT: ldr d5, [x0, #16]
614 ; CHECK-NEXT: udot v2.2s, v5.8b, v1.8b
615 ; CHECK-NEXT: udot v0.4s, v4.16b, v3.16b
616 ; CHECK-NEXT: addp v1.2s, v2.2s, v2.2s
617 ; CHECK-NEXT: addv s0, v0.4s
618 ; CHECK-NEXT: fmov w8, s1
619 ; CHECK-NEXT: fmov w9, s0
620 ; CHECK-NEXT: add w0, w9, w8
623 %0 = load <24 x i8>, ptr %a1
624 %1 = zext <24 x i8> %0 to <24 x i32>
625 %2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %1)
; Signed dot product of two <24 x i8> loads (sext, mul nsw, add-reduce) with
; %sum added to the reduction (%op.extra).
; Expected assembly: the 24 bytes are handled as a 16-byte part (ldr q at
; offset 0, sdot .4s) and an 8-byte part (ldr d at offset 16, sdot .2s). Each
; part has its own zeroed accumulator and reduction (addv / addp); the two
; scalars are added and then w2 (%sum) is added.
628 define i32 @test_sdot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
629 ; CHECK-LABEL: test_sdot_v24i8:
630 ; CHECK: // %bb.0: // %entry
631 ; CHECK-NEXT: movi v0.2d, #0000000000000000
632 ; CHECK-NEXT: movi v1.2d, #0000000000000000
633 ; CHECK-NEXT: ldr q2, [x0]
634 ; CHECK-NEXT: ldr q3, [x1]
635 ; CHECK-NEXT: ldr d4, [x0, #16]
636 ; CHECK-NEXT: ldr d5, [x1, #16]
637 ; CHECK-NEXT: sdot v1.2s, v5.8b, v4.8b
638 ; CHECK-NEXT: sdot v0.4s, v3.16b, v2.16b
639 ; CHECK-NEXT: addp v1.2s, v1.2s, v1.2s
640 ; CHECK-NEXT: addv s0, v0.4s
641 ; CHECK-NEXT: fmov w8, s1
642 ; CHECK-NEXT: fmov w9, s0
643 ; CHECK-NEXT: add w8, w9, w8
644 ; CHECK-NEXT: add w0, w8, w2
647 %0 = load <24 x i8>, ptr %a
648 %1 = sext <24 x i8> %0 to <24 x i32>
649 %2 = load <24 x i8>, ptr %b
650 %3 = sext <24 x i8> %2 to <24 x i32>
651 %4 = mul nsw <24 x i32> %3, %1
652 %5 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %4)
653 %op.extra = add nsw i32 %5, %sum
; Sum of two signed <24 x i8> dot products, %a*%b and %c*%d, with all four
; operands passed as <24 x i8> arguments. The IR reduces each product
; separately and adds the two scalars (%x).
; Expected assembly: the operand vectors are assembled one byte at a time,
; from w0-w7 (mov v0.b[n], wN) and from stack slots (ldr b / ld1 lane loads
; relative to sp). Four sdot instructions are issued - two .4s and two .2s,
; each into its own zeroed accumulator (v16-v19) - and reduced separately with
; addv / addp; the four scalars are combined with three scalar adds. x29 is
; spilled on entry and reloaded at the end.
; NOTE(review): both multiplies carry `nuw` although their operands are
; sign-extended; products involving negative lanes can overflow as unsigned
; values - confirm the flag is intended.
657 define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 x i8> %d) {
658 ; CHECK-LABEL: test_sdot_v24i8_double:
659 ; CHECK: // %bb.0: // %entry
660 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
661 ; CHECK-NEXT: .cfi_def_cfa_offset 16
662 ; CHECK-NEXT: .cfi_offset w29, -16
663 ; CHECK-NEXT: fmov s0, w0
664 ; CHECK-NEXT: ldr b1, [sp, #144]
665 ; CHECK-NEXT: add x10, sp, #152
666 ; CHECK-NEXT: add x9, sp, #160
667 ; CHECK-NEXT: add x8, sp, #168
668 ; CHECK-NEXT: ldr b2, [sp, #272]
669 ; CHECK-NEXT: ld1 { v1.b }[1], [x10]
670 ; CHECK-NEXT: add x11, sp, #280
671 ; CHECK-NEXT: ldr b3, [sp, #80]
672 ; CHECK-NEXT: mov v0.b[1], w1
673 ; CHECK-NEXT: ldr b4, [sp, #528]
674 ; CHECK-NEXT: add x10, sp, #88
675 ; CHECK-NEXT: ld1 { v2.b }[1], [x11]
676 ; CHECK-NEXT: add x11, sp, #536
677 ; CHECK-NEXT: ldr b5, [sp, #336]
678 ; CHECK-NEXT: ld1 { v1.b }[2], [x9]
679 ; CHECK-NEXT: ld1 { v3.b }[1], [x10]
680 ; CHECK-NEXT: add x10, sp, #344
681 ; CHECK-NEXT: ld1 { v4.b }[1], [x11]
682 ; CHECK-NEXT: add x11, sp, #176
683 ; CHECK-NEXT: ldr b6, [sp, #656]
684 ; CHECK-NEXT: mov v0.b[2], w2
685 ; CHECK-NEXT: ld1 { v5.b }[1], [x10]
686 ; CHECK-NEXT: ldr b7, [sp, #464]
687 ; CHECK-NEXT: ld1 { v1.b }[3], [x8]
688 ; CHECK-NEXT: add x12, sp, #664
689 ; CHECK-NEXT: add x9, sp, #472
690 ; CHECK-NEXT: ld1 { v6.b }[1], [x12]
691 ; CHECK-NEXT: add x8, sp, #96
692 ; CHECK-NEXT: add x10, sp, #184
693 ; CHECK-NEXT: add x12, sp, #288
694 ; CHECK-NEXT: ld1 { v7.b }[1], [x9]
695 ; CHECK-NEXT: ld1 { v3.b }[2], [x8]
696 ; CHECK-NEXT: mov v0.b[3], w3
697 ; CHECK-NEXT: ld1 { v1.b }[4], [x11]
698 ; CHECK-NEXT: add x8, sp, #352
699 ; CHECK-NEXT: ld1 { v2.b }[2], [x12]
700 ; CHECK-NEXT: add x13, sp, #544
701 ; CHECK-NEXT: ld1 { v5.b }[2], [x8]
702 ; CHECK-NEXT: add x8, sp, #672
703 ; CHECK-NEXT: ld1 { v4.b }[2], [x13]
704 ; CHECK-NEXT: add x9, sp, #192
705 ; CHECK-NEXT: ld1 { v1.b }[5], [x10]
706 ; CHECK-NEXT: ld1 { v6.b }[2], [x8]
707 ; CHECK-NEXT: add x8, sp, #480
708 ; CHECK-NEXT: mov v0.b[4], w4
709 ; CHECK-NEXT: ld1 { v7.b }[2], [x8]
710 ; CHECK-NEXT: add x8, sp, #296
711 ; CHECK-NEXT: ld1 { v2.b }[3], [x8]
712 ; CHECK-NEXT: add x8, sp, #552
713 ; CHECK-NEXT: add x12, sp, #200
714 ; CHECK-NEXT: ld1 { v1.b }[6], [x9]
715 ; CHECK-NEXT: ld1 { v4.b }[3], [x8]
716 ; CHECK-NEXT: add x8, sp, #360
717 ; CHECK-NEXT: ld1 { v5.b }[3], [x8]
718 ; CHECK-NEXT: add x8, sp, #104
719 ; CHECK-NEXT: add x9, sp, #560
720 ; CHECK-NEXT: mov v0.b[5], w5
721 ; CHECK-NEXT: ld1 { v3.b }[3], [x8]
722 ; CHECK-NEXT: add x8, sp, #368
723 ; CHECK-NEXT: ld1 { v1.b }[7], [x12]
724 ; CHECK-NEXT: ld1 { v4.b }[4], [x9]
725 ; CHECK-NEXT: add x13, sp, #208
726 ; CHECK-NEXT: ld1 { v5.b }[4], [x8]
727 ; CHECK-NEXT: add x12, sp, #304
728 ; CHECK-NEXT: add x8, sp, #568
729 ; CHECK-NEXT: ld1 { v2.b }[4], [x12]
730 ; CHECK-NEXT: add x12, sp, #16
731 ; CHECK-NEXT: add x17, sp, #376
732 ; CHECK-NEXT: mov v0.b[6], w6
733 ; CHECK-NEXT: ld1 { v1.b }[8], [x13]
734 ; CHECK-NEXT: ld1 { v4.b }[5], [x8]
735 ; CHECK-NEXT: add x14, sp, #216
736 ; CHECK-NEXT: ld1 { v5.b }[5], [x17]
737 ; CHECK-NEXT: add x13, sp, #576
738 ; CHECK-NEXT: add x11, sp, #224
739 ; CHECK-NEXT: add x10, sp, #232
740 ; CHECK-NEXT: add x15, sp, #240
741 ; CHECK-NEXT: ld1 { v1.b }[9], [x14]
742 ; CHECK-NEXT: ld1 { v4.b }[6], [x13]
743 ; CHECK-NEXT: add x13, sp, #384
744 ; CHECK-NEXT: mov v0.b[7], w7
745 ; CHECK-NEXT: ld1 { v5.b }[6], [x13]
746 ; CHECK-NEXT: add x13, sp, #112
747 ; CHECK-NEXT: ld1 { v3.b }[4], [x13]
748 ; CHECK-NEXT: add x13, sp, #32
749 ; CHECK-NEXT: add x14, sp, #584
750 ; CHECK-NEXT: ld1 { v1.b }[10], [x11]
751 ; CHECK-NEXT: ld1 { v4.b }[7], [x14]
752 ; CHECK-NEXT: add x11, sp, #312
753 ; CHECK-NEXT: add x14, sp, #40
754 ; CHECK-NEXT: ld1 { v2.b }[5], [x11]
755 ; CHECK-NEXT: add x11, sp, #592
756 ; CHECK-NEXT: ld1 { v0.b }[8], [x12]
757 ; CHECK-NEXT: add x12, sp, #24
758 ; CHECK-NEXT: add x16, sp, #248
759 ; CHECK-NEXT: ld1 { v1.b }[11], [x10]
760 ; CHECK-NEXT: ld1 { v4.b }[8], [x11]
761 ; CHECK-NEXT: add x11, sp, #400
762 ; CHECK-NEXT: add x9, sp, #256
763 ; CHECK-NEXT: add x8, sp, #264
764 ; CHECK-NEXT: add x10, sp, #72
765 ; CHECK-NEXT: ld1 { v0.b }[9], [x12]
766 ; CHECK-NEXT: add x12, sp, #392
767 ; CHECK-NEXT: movi v16.2d, #0000000000000000
768 ; CHECK-NEXT: ld1 { v5.b }[7], [x12]
769 ; CHECK-NEXT: add x12, sp, #48
770 ; CHECK-NEXT: ld1 { v1.b }[12], [x15]
771 ; CHECK-NEXT: add x15, sp, #120
772 ; CHECK-NEXT: movi v17.2d, #0000000000000000
773 ; CHECK-NEXT: movi v18.2d, #0000000000000000
774 ; CHECK-NEXT: ld1 { v0.b }[10], [x13]
775 ; CHECK-NEXT: ld1 { v3.b }[5], [x15]
776 ; CHECK-NEXT: add x15, sp, #408
777 ; CHECK-NEXT: ld1 { v5.b }[8], [x11]
778 ; CHECK-NEXT: add x13, sp, #56
779 ; CHECK-NEXT: ld1 { v1.b }[13], [x16]
780 ; CHECK-NEXT: add x11, sp, #64
781 ; CHECK-NEXT: add x16, sp, #616
782 ; CHECK-NEXT: movi v19.2d, #0000000000000000
783 ; CHECK-NEXT: ld1 { v0.b }[11], [x14]
784 ; CHECK-NEXT: add x14, sp, #600
785 ; CHECK-NEXT: ld1 { v4.b }[9], [x14]
786 ; CHECK-NEXT: ld1 { v5.b }[9], [x15]
787 ; CHECK-NEXT: add x15, sp, #608
788 ; CHECK-NEXT: ld1 { v1.b }[14], [x9]
789 ; CHECK-NEXT: add x9, sp, #488
790 ; CHECK-NEXT: add x14, sp, #320
791 ; CHECK-NEXT: ld1 { v0.b }[12], [x12]
792 ; CHECK-NEXT: ld1 { v7.b }[3], [x9]
793 ; CHECK-NEXT: ld1 { v2.b }[6], [x14]
794 ; CHECK-NEXT: ld1 { v4.b }[10], [x15]
795 ; CHECK-NEXT: add x14, sp, #624
796 ; CHECK-NEXT: add x9, sp, #688
797 ; CHECK-NEXT: ld1 { v1.b }[15], [x8]
798 ; CHECK-NEXT: add x8, sp, #432
799 ; CHECK-NEXT: add x12, sp, #328
800 ; CHECK-NEXT: ld1 { v0.b }[13], [x13]
801 ; CHECK-NEXT: add x13, sp, #416
802 ; CHECK-NEXT: ld1 { v2.b }[7], [x12]
803 ; CHECK-NEXT: ld1 { v5.b }[10], [x13]
804 ; CHECK-NEXT: ld1 { v4.b }[11], [x16]
805 ; CHECK-NEXT: add x16, sp, #680
806 ; CHECK-NEXT: ld1 { v6.b }[3], [x16]
807 ; CHECK-NEXT: add x13, sp, #632
808 ; CHECK-NEXT: add x12, sp, #504
809 ; CHECK-NEXT: ld1 { v0.b }[14], [x11]
810 ; CHECK-NEXT: add x11, sp, #424
811 ; CHECK-NEXT: add x15, sp, #128
812 ; CHECK-NEXT: ld1 { v5.b }[11], [x11]
813 ; CHECK-NEXT: ld1 { v4.b }[12], [x14]
814 ; CHECK-NEXT: add x11, sp, #696
815 ; CHECK-NEXT: ld1 { v6.b }[4], [x9]
816 ; CHECK-NEXT: ld1 { v3.b }[6], [x15]
817 ; CHECK-NEXT: add x9, sp, #640
818 ; CHECK-NEXT: ld1 { v0.b }[15], [x10]
819 ; CHECK-NEXT: add x10, sp, #496
820 ; CHECK-NEXT: ld1 { v5.b }[12], [x8]
821 ; CHECK-NEXT: ld1 { v7.b }[4], [x10]
822 ; CHECK-NEXT: ld1 { v4.b }[13], [x13]
823 ; CHECK-NEXT: add x10, sp, #440
824 ; CHECK-NEXT: ld1 { v6.b }[5], [x11]
825 ; CHECK-NEXT: add x11, sp, #512
826 ; CHECK-NEXT: add x8, sp, #136
827 ; CHECK-NEXT: sdot v17.4s, v0.16b, v1.16b
828 ; CHECK-NEXT: ld1 { v5.b }[13], [x10]
829 ; CHECK-NEXT: ld1 { v7.b }[5], [x12]
830 ; CHECK-NEXT: ld1 { v4.b }[14], [x9]
831 ; CHECK-NEXT: add x9, sp, #448
832 ; CHECK-NEXT: add x10, sp, #704
833 ; CHECK-NEXT: ld1 { v3.b }[7], [x8]
834 ; CHECK-NEXT: ld1 { v6.b }[6], [x10]
835 ; CHECK-NEXT: add x8, sp, #648
836 ; CHECK-NEXT: add x10, sp, #520
837 ; CHECK-NEXT: ld1 { v5.b }[14], [x9]
838 ; CHECK-NEXT: ld1 { v7.b }[6], [x11]
839 ; CHECK-NEXT: ld1 { v4.b }[15], [x8]
840 ; CHECK-NEXT: add x8, sp, #456
841 ; CHECK-NEXT: add x9, sp, #712
842 ; CHECK-NEXT: sdot v19.2s, v3.8b, v2.8b
843 ; CHECK-NEXT: ld1 { v6.b }[7], [x9]
844 ; CHECK-NEXT: addv s0, v17.4s
845 ; CHECK-NEXT: ld1 { v5.b }[15], [x8]
846 ; CHECK-NEXT: ld1 { v7.b }[7], [x10]
847 ; CHECK-NEXT: addp v1.2s, v19.2s, v19.2s
848 ; CHECK-NEXT: sdot v16.4s, v5.16b, v4.16b
849 ; CHECK-NEXT: sdot v18.2s, v7.8b, v6.8b
850 ; CHECK-NEXT: fmov w8, s0
851 ; CHECK-NEXT: fmov w9, s1
852 ; CHECK-NEXT: addv s2, v16.4s
853 ; CHECK-NEXT: addp v3.2s, v18.2s, v18.2s
854 ; CHECK-NEXT: add w8, w8, w9
855 ; CHECK-NEXT: fmov w10, s2
856 ; CHECK-NEXT: fmov w11, s3
857 ; CHECK-NEXT: add w9, w10, w11
858 ; CHECK-NEXT: add w0, w8, w9
859 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
862 %az = sext <24 x i8> %a to <24 x i32>
863 %bz = sext <24 x i8> %b to <24 x i32>
864 %m1 = mul nuw nsw <24 x i32> %az, %bz
865 %r1 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %m1)
866 %cz = sext <24 x i8> %c to <24 x i32>
867 %dz = sext <24 x i8> %d to <24 x i32>
868 %m2 = mul nuw nsw <24 x i32> %cz, %dz
869 %r2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %m2)
870 %x = add i32 %r1, %r2
; test_sdot_v24i8_double_nomla: sums two independent add-reductions of
; sign-extended <24 x i8> vectors (%a and %c); the IR contains no multiply.
; %b and %d are accepted but are not referenced by the body lines shown here.
; The expected code gathers each 24-byte input from w0-w7 and stack slots,
; feeds a 16-byte part and an 8-byte part to sdot against an all-ones vector
; (movi #1), and joins the four partial sums with scalar adds.
874 define i32 @test_sdot_v24i8_double_nomla(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 x i8> %d) {
875 ; CHECK-LABEL: test_sdot_v24i8_double_nomla:
876 ; CHECK: // %bb.0: // %entry
877 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
878 ; CHECK-NEXT: .cfi_def_cfa_offset 16
879 ; CHECK-NEXT: .cfi_offset w29, -16
880 ; CHECK-NEXT: fmov s0, w0
881 ; CHECK-NEXT: ldr b1, [sp, #336]
882 ; CHECK-NEXT: add x8, sp, #344
883 ; CHECK-NEXT: add x9, sp, #400
884 ; CHECK-NEXT: ldr b2, [sp, #80]
885 ; CHECK-NEXT: ldr b3, [sp, #464]
886 ; CHECK-NEXT: ld1 { v1.b }[1], [x8]
887 ; CHECK-NEXT: add x8, sp, #352
888 ; CHECK-NEXT: add x10, sp, #408
889 ; CHECK-NEXT: mov v0.b[1], w1
890 ; CHECK-NEXT: add x11, sp, #472
891 ; CHECK-NEXT: add x12, sp, #480
892 ; CHECK-NEXT: ld1 { v3.b }[1], [x11]
893 ; CHECK-NEXT: add x11, sp, #416
894 ; CHECK-NEXT: add x13, sp, #488
895 ; CHECK-NEXT: ld1 { v1.b }[2], [x8]
896 ; CHECK-NEXT: add x8, sp, #360
897 ; CHECK-NEXT: add x14, sp, #496
898 ; CHECK-NEXT: movi v4.16b, #1
899 ; CHECK-NEXT: movi v5.2d, #0000000000000000
900 ; CHECK-NEXT: movi v6.2d, #0000000000000000
901 ; CHECK-NEXT: mov v0.b[2], w2
902 ; CHECK-NEXT: ld1 { v3.b }[2], [x12]
903 ; CHECK-NEXT: add x12, sp, #424
904 ; CHECK-NEXT: ld1 { v1.b }[3], [x8]
905 ; CHECK-NEXT: add x8, sp, #368
906 ; CHECK-NEXT: movi v7.2d, #0000000000000000
907 ; CHECK-NEXT: movi v16.8b, #1
908 ; CHECK-NEXT: movi v17.2d, #0000000000000000
909 ; CHECK-NEXT: ld1 { v3.b }[3], [x13]
910 ; CHECK-NEXT: add x13, sp, #432
911 ; CHECK-NEXT: mov v0.b[3], w3
912 ; CHECK-NEXT: ld1 { v1.b }[4], [x8]
913 ; CHECK-NEXT: add x8, sp, #376
914 ; CHECK-NEXT: ld1 { v3.b }[4], [x14]
915 ; CHECK-NEXT: ld1 { v1.b }[5], [x8]
916 ; CHECK-NEXT: add x8, sp, #384
917 ; CHECK-NEXT: mov v0.b[4], w4
918 ; CHECK-NEXT: ld1 { v1.b }[6], [x8]
919 ; CHECK-NEXT: add x8, sp, #392
920 ; CHECK-NEXT: mov v0.b[5], w5
921 ; CHECK-NEXT: ld1 { v1.b }[7], [x8]
922 ; CHECK-NEXT: add x8, sp, #16
923 ; CHECK-NEXT: mov v0.b[6], w6
924 ; CHECK-NEXT: ld1 { v1.b }[8], [x9]
925 ; CHECK-NEXT: add x9, sp, #88
926 ; CHECK-NEXT: ld1 { v2.b }[1], [x9]
927 ; CHECK-NEXT: add x9, sp, #40
928 ; CHECK-NEXT: ld1 { v1.b }[9], [x10]
929 ; CHECK-NEXT: add x10, sp, #96
930 ; CHECK-NEXT: mov v0.b[7], w7
931 ; CHECK-NEXT: ld1 { v2.b }[2], [x10]
932 ; CHECK-NEXT: add x10, sp, #56
933 ; CHECK-NEXT: ld1 { v1.b }[10], [x11]
934 ; CHECK-NEXT: add x11, sp, #104
935 ; CHECK-NEXT: ld1 { v2.b }[3], [x11]
936 ; CHECK-NEXT: add x11, sp, #72
937 ; CHECK-NEXT: ld1 { v0.b }[8], [x8]
938 ; CHECK-NEXT: add x8, sp, #24
939 ; CHECK-NEXT: ld1 { v1.b }[11], [x12]
940 ; CHECK-NEXT: add x12, sp, #112
941 ; CHECK-NEXT: ld1 { v2.b }[4], [x12]
942 ; CHECK-NEXT: add x12, sp, #440
943 ; CHECK-NEXT: ld1 { v0.b }[9], [x8]
944 ; CHECK-NEXT: add x8, sp, #32
945 ; CHECK-NEXT: ld1 { v1.b }[12], [x13]
946 ; CHECK-NEXT: add x13, sp, #504
947 ; CHECK-NEXT: ld1 { v3.b }[5], [x13]
948 ; CHECK-NEXT: add x13, sp, #512
949 ; CHECK-NEXT: ld1 { v0.b }[10], [x8]
950 ; CHECK-NEXT: add x8, sp, #48
951 ; CHECK-NEXT: ld1 { v1.b }[13], [x12]
952 ; CHECK-NEXT: add x12, sp, #448
953 ; CHECK-NEXT: ld1 { v3.b }[6], [x13]
954 ; CHECK-NEXT: ld1 { v0.b }[11], [x9]
955 ; CHECK-NEXT: add x9, sp, #64
956 ; CHECK-NEXT: ld1 { v1.b }[14], [x12]
957 ; CHECK-NEXT: ld1 { v0.b }[12], [x8]
958 ; CHECK-NEXT: add x8, sp, #120
959 ; CHECK-NEXT: ld1 { v2.b }[5], [x8]
960 ; CHECK-NEXT: add x8, sp, #128
961 ; CHECK-NEXT: ld1 { v0.b }[13], [x10]
962 ; CHECK-NEXT: add x10, sp, #136
963 ; CHECK-NEXT: ld1 { v2.b }[6], [x8]
964 ; CHECK-NEXT: add x8, sp, #456
965 ; CHECK-NEXT: ld1 { v1.b }[15], [x8]
966 ; CHECK-NEXT: ld1 { v0.b }[14], [x9]
967 ; CHECK-NEXT: add x9, sp, #520
968 ; CHECK-NEXT: ld1 { v2.b }[7], [x10]
969 ; CHECK-NEXT: ld1 { v3.b }[7], [x9]
970 ; CHECK-NEXT: sdot v5.4s, v1.16b, v4.16b
971 ; CHECK-NEXT: ld1 { v0.b }[15], [x11]
972 ; CHECK-NEXT: sdot v17.2s, v2.8b, v16.8b
973 ; CHECK-NEXT: sdot v7.2s, v3.8b, v16.8b
974 ; CHECK-NEXT: sdot v6.4s, v0.16b, v4.16b
975 ; CHECK-NEXT: addv s3, v5.4s
976 ; CHECK-NEXT: addp v1.2s, v17.2s, v17.2s
977 ; CHECK-NEXT: addp v2.2s, v7.2s, v7.2s
978 ; CHECK-NEXT: addv s0, v6.4s
979 ; CHECK-NEXT: fmov w10, s3
980 ; CHECK-NEXT: fmov w9, s1
981 ; CHECK-NEXT: fmov w11, s2
982 ; CHECK-NEXT: fmov w8, s0
983 ; CHECK-NEXT: add w8, w8, w9
984 ; CHECK-NEXT: add w9, w10, w11
985 ; CHECK-NEXT: add w0, w8, w9
986 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; %a and %c are widened and reduced separately; only the final add joins them.
989 %az = sext <24 x i8> %a to <24 x i32>
990 %r1 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %az)
991 %cz = sext <24 x i8> %c to <24 x i32>
992 %r2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %cz)
993 %x = add i32 %r1, %r2
; test_udot_v25i8: unsigned multiply-accumulate reduction over two <25 x i8>
; vectors loaded from %a and %b, with %sum added to the reduced value.
; The expected code contains no udot: lanes are widened with ushll/ushll2 and
; multiplied with umull/umlal before the final addv.
; NOTE(review): presumably the odd lane count (25) is what keeps this off the
; dot-product path - confirm against the AArch64 lowering.
998 define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
999 ; CHECK-LABEL: test_udot_v25i8:
1000 ; CHECK: // %bb.0: // %entry
1001 ; CHECK-NEXT: ldp q4, q0, [x0]
1002 ; CHECK-NEXT: ldp q5, q1, [x1]
1003 ; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0
1004 ; CHECK-NEXT: ushll v6.8h, v4.8b, #0
1005 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1006 ; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0
1007 ; CHECK-NEXT: ushll v7.8h, v5.8b, #0
1008 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
1009 ; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h
1010 ; CHECK-NEXT: movi v3.2d, #0000000000000000
1011 ; CHECK-NEXT: umull2 v16.4s, v7.8h, v6.8h
1012 ; CHECK-NEXT: umull v6.4s, v7.4h, v6.4h
1013 ; CHECK-NEXT: mov v3.s[0], v2.s[0]
1014 ; CHECK-NEXT: ushll2 v2.8h, v4.16b, #0
1015 ; CHECK-NEXT: ushll2 v4.8h, v5.16b, #0
1016 ; CHECK-NEXT: umlal v6.4s, v1.4h, v0.4h
1017 ; CHECK-NEXT: umlal2 v16.4s, v1.8h, v0.8h
1018 ; CHECK-NEXT: umlal v3.4s, v4.4h, v2.4h
1019 ; CHECK-NEXT: umlal2 v16.4s, v4.8h, v2.8h
1020 ; CHECK-NEXT: add v0.4s, v6.4s, v3.4s
1021 ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s
1022 ; CHECK-NEXT: addv s0, v0.4s
1023 ; CHECK-NEXT: fmov w8, s0
1024 ; CHECK-NEXT: add w0, w8, w2
; Zero-extend both inputs to i32 lanes, multiply lane-wise, reduce, add %sum.
1027 %0 = load <25 x i8>, ptr %a
1028 %1 = zext <25 x i8> %0 to <25 x i32>
1029 %2 = load <25 x i8>, ptr %b
1030 %3 = zext <25 x i8> %2 to <25 x i32>
1031 %4 = mul nuw nsw <25 x i32> %3, %1
1032 %5 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %4)
1033 %op.extra = add i32 %5, %sum
; test_udot_v25i8_nomla: add-reduction of a zero-extended <25 x i8> vector
; loaded from %a1, with no multiply in the IR.
; The expected code widens with ushll/ushll2 and accumulates with
; uaddl/uaddw before addv; no udot is emitted.
1037 define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) {
1038 ; CHECK-LABEL: test_udot_v25i8_nomla:
1039 ; CHECK: // %bb.0: // %entry
1040 ; CHECK-NEXT: ldp q2, q1, [x0]
1041 ; CHECK-NEXT: movi v0.2d, #0000000000000000
1042 ; CHECK-NEXT: ushll v3.8h, v1.8b, #0
1043 ; CHECK-NEXT: ushll v4.8h, v2.8b, #0
1044 ; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
1045 ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0
1046 ; CHECK-NEXT: uaddl2 v5.4s, v4.8h, v3.8h
1047 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
1048 ; CHECK-NEXT: uaddl v3.4s, v4.4h, v3.4h
1049 ; CHECK-NEXT: mov v0.s[0], v1.s[0]
1050 ; CHECK-NEXT: uaddw2 v1.4s, v5.4s, v2.8h
1051 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
1052 ; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
1053 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
1054 ; CHECK-NEXT: addv s0, v0.4s
1055 ; CHECK-NEXT: fmov w0, s0
; Zero-extend the loaded bytes to i32 lanes and reduce them directly.
1058 %0 = load <25 x i8>, ptr %a1
1059 %1 = zext <25 x i8> %0 to <25 x i32>
1060 %2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %1)
; test_sdot_v25i8: signed multiply-accumulate reduction over two <25 x i8>
; vectors loaded from %a and %b, with %sum added to the reduced value.
; The expected code contains no sdot: lanes are widened with sshll/sshll2 and
; multiplied with smull/smlal before the final addv.
1063 define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
1064 ; CHECK-LABEL: test_sdot_v25i8:
1065 ; CHECK: // %bb.0: // %entry
1066 ; CHECK-NEXT: ldp q4, q0, [x0]
1067 ; CHECK-NEXT: ldp q5, q1, [x1]
1068 ; CHECK-NEXT: sshll2 v2.8h, v0.16b, #0
1069 ; CHECK-NEXT: sshll v6.8h, v4.8b, #0
1070 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
1071 ; CHECK-NEXT: sshll2 v3.8h, v1.16b, #0
1072 ; CHECK-NEXT: sshll v7.8h, v5.8b, #0
1073 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
1074 ; CHECK-NEXT: smull v2.4s, v3.4h, v2.4h
1075 ; CHECK-NEXT: movi v3.2d, #0000000000000000
1076 ; CHECK-NEXT: smull2 v16.4s, v7.8h, v6.8h
1077 ; CHECK-NEXT: smull v6.4s, v7.4h, v6.4h
1078 ; CHECK-NEXT: mov v3.s[0], v2.s[0]
1079 ; CHECK-NEXT: sshll2 v2.8h, v4.16b, #0
1080 ; CHECK-NEXT: sshll2 v4.8h, v5.16b, #0
1081 ; CHECK-NEXT: smlal v6.4s, v1.4h, v0.4h
1082 ; CHECK-NEXT: smlal2 v16.4s, v1.8h, v0.8h
1083 ; CHECK-NEXT: smlal v3.4s, v4.4h, v2.4h
1084 ; CHECK-NEXT: smlal2 v16.4s, v4.8h, v2.8h
1085 ; CHECK-NEXT: add v0.4s, v6.4s, v3.4s
1086 ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s
1087 ; CHECK-NEXT: addv s0, v0.4s
1088 ; CHECK-NEXT: fmov w8, s0
1089 ; CHECK-NEXT: add w0, w8, w2
; Sign-extend both inputs to i32 lanes, multiply lane-wise, reduce, add %sum.
1092 %0 = load <25 x i8>, ptr %a
1093 %1 = sext <25 x i8> %0 to <25 x i32>
1094 %2 = load <25 x i8>, ptr %b
1095 %3 = sext <25 x i8> %2 to <25 x i32>
1096 %4 = mul nsw <25 x i32> %3, %1
1097 %5 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %4)
1098 %op.extra = add nsw i32 %5, %sum
; test_sdot_v25i8_double: two signed multiply-reductions over by-value
; <25 x i8> arguments, reduce(sext(%a) * sext(%b)) + reduce(sext(%c) * sext(%d)).
; The expected code gathers the byte lanes from w0-w7 and stack slots, widens
; them with sshll and multiplies with smull/smlal; no sdot is emitted.
; NOTE(review): both multiplies carry nuw although their operands are
; sign-extended, whereas test_sdot_v25i8 uses nsw only - confirm the nuw flag
; is intended.
1102 define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) {
1103 ; CHECK-LABEL: test_sdot_v25i8_double:
1104 ; CHECK: // %bb.0: // %entry
1105 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1106 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1107 ; CHECK-NEXT: .cfi_offset w29, -16
1108 ; CHECK-NEXT: fmov s4, w0
1109 ; CHECK-NEXT: ldr b0, [sp, #80]
1110 ; CHECK-NEXT: add x8, sp, #88
1111 ; CHECK-NEXT: ldr b1, [sp, #16]
1112 ; CHECK-NEXT: add x10, sp, #24
1113 ; CHECK-NEXT: ldr b2, [sp, #280]
1114 ; CHECK-NEXT: ld1 { v0.b }[1], [x8]
1115 ; CHECK-NEXT: ldr b3, [sp, #216]
1116 ; CHECK-NEXT: add x11, sp, #224
1117 ; CHECK-NEXT: mov v4.b[1], w1
1118 ; CHECK-NEXT: ld1 { v1.b }[1], [x10]
1119 ; CHECK-NEXT: add x10, sp, #288
1120 ; CHECK-NEXT: ldr b5, [sp, #152]
1121 ; CHECK-NEXT: add x9, sp, #96
1122 ; CHECK-NEXT: ld1 { v2.b }[1], [x10]
1123 ; CHECK-NEXT: ld1 { v3.b }[1], [x11]
1124 ; CHECK-NEXT: add x10, sp, #160
1125 ; CHECK-NEXT: ld1 { v0.b }[2], [x9]
1126 ; CHECK-NEXT: ld1 { v5.b }[1], [x10]
1127 ; CHECK-NEXT: add x10, sp, #32
1128 ; CHECK-NEXT: add x11, sp, #296
1129 ; CHECK-NEXT: mov v4.b[2], w2
1130 ; CHECK-NEXT: ld1 { v1.b }[2], [x10]
1131 ; CHECK-NEXT: add x10, sp, #232
1132 ; CHECK-NEXT: add x8, sp, #104
1133 ; CHECK-NEXT: ld1 { v2.b }[2], [x11]
1134 ; CHECK-NEXT: ld1 { v3.b }[2], [x10]
1135 ; CHECK-NEXT: add x11, sp, #168
1136 ; CHECK-NEXT: ld1 { v0.b }[3], [x8]
1137 ; CHECK-NEXT: add x8, sp, #40
1138 ; CHECK-NEXT: ld1 { v5.b }[2], [x11]
1139 ; CHECK-NEXT: ld1 { v1.b }[3], [x8]
1140 ; CHECK-NEXT: add x8, sp, #240
1141 ; CHECK-NEXT: mov v4.b[3], w3
1142 ; CHECK-NEXT: ld1 { v3.b }[3], [x8]
1143 ; CHECK-NEXT: add x8, sp, #176
1144 ; CHECK-NEXT: add x12, sp, #112
1145 ; CHECK-NEXT: add x13, sp, #48
1146 ; CHECK-NEXT: add x9, sp, #120
1147 ; CHECK-NEXT: ld1 { v5.b }[3], [x8]
1148 ; CHECK-NEXT: ld1 { v0.b }[4], [x12]
1149 ; CHECK-NEXT: add x12, sp, #184
1150 ; CHECK-NEXT: ld1 { v1.b }[4], [x13]
1151 ; CHECK-NEXT: add x15, sp, #56
1152 ; CHECK-NEXT: add x14, sp, #128
1153 ; CHECK-NEXT: mov v4.b[4], w4
1154 ; CHECK-NEXT: add x11, sp, #304
1155 ; CHECK-NEXT: add x13, sp, #256
1156 ; CHECK-NEXT: ld1 { v5.b }[4], [x12]
1157 ; CHECK-NEXT: ld1 { v0.b }[5], [x9]
1158 ; CHECK-NEXT: add x9, sp, #192
1159 ; CHECK-NEXT: add x12, sp, #248
1160 ; CHECK-NEXT: ld1 { v1.b }[5], [x15]
1161 ; CHECK-NEXT: add x15, sp, #200
1162 ; CHECK-NEXT: ld1 { v3.b }[4], [x12]
1163 ; CHECK-NEXT: ld1 { v2.b }[3], [x11]
1164 ; CHECK-NEXT: add x11, sp, #64
1165 ; CHECK-NEXT: mov v4.b[5], w5
1166 ; CHECK-NEXT: ld1 { v5.b }[5], [x9]
1167 ; CHECK-NEXT: ld1 { v0.b }[6], [x14]
1168 ; CHECK-NEXT: ldr b6, [sp, #352]
1169 ; CHECK-NEXT: add x10, sp, #136
1170 ; CHECK-NEXT: ld1 { v1.b }[6], [x11]
1171 ; CHECK-NEXT: add x11, sp, #360
1172 ; CHECK-NEXT: ld1 { v3.b }[5], [x13]
1173 ; CHECK-NEXT: ldr b18, [sp, #552]
1174 ; CHECK-NEXT: ld1 { v5.b }[6], [x15]
1175 ; CHECK-NEXT: add x14, sp, #208
1176 ; CHECK-NEXT: ld1 { v6.b }[1], [x11]
1177 ; CHECK-NEXT: mov v4.b[6], w6
1178 ; CHECK-NEXT: ld1 { v0.b }[7], [x10]
1179 ; CHECK-NEXT: add x10, sp, #560
1180 ; CHECK-NEXT: add x9, sp, #264
1181 ; CHECK-NEXT: ld1 { v18.b }[1], [x10]
1182 ; CHECK-NEXT: add x10, sp, #568
1183 ; CHECK-NEXT: ld1 { v5.b }[7], [x14]
1184 ; CHECK-NEXT: ld1 { v3.b }[6], [x9]
1185 ; CHECK-NEXT: add x9, sp, #368
1186 ; CHECK-NEXT: ld1 { v6.b }[2], [x9]
1187 ; CHECK-NEXT: add x11, sp, #488
1188 ; CHECK-NEXT: ldr b7, [sp, #144]
1189 ; CHECK-NEXT: mov v4.b[7], w7
1190 ; CHECK-NEXT: ld1 { v18.b }[2], [x10]
1191 ; CHECK-NEXT: add x10, sp, #376
1192 ; CHECK-NEXT: sshll v17.8h, v5.8b, #0
1193 ; CHECK-NEXT: ldr b5, [sp, #480]
1194 ; CHECK-NEXT: sshll v7.8h, v7.8b, #0
1195 ; CHECK-NEXT: ld1 { v6.b }[3], [x10]
1196 ; CHECK-NEXT: add x10, sp, #576
1197 ; CHECK-NEXT: add x8, sp, #312
1198 ; CHECK-NEXT: ld1 { v5.b }[1], [x11]
1199 ; CHECK-NEXT: ld1 { v18.b }[3], [x10]
1200 ; CHECK-NEXT: add x11, sp, #496
1201 ; CHECK-NEXT: sshll v16.8h, v4.8b, #0
1202 ; CHECK-NEXT: ldr b4, [sp, #344]
1203 ; CHECK-NEXT: add x10, sp, #384
1204 ; CHECK-NEXT: ld1 { v6.b }[4], [x10]
1205 ; CHECK-NEXT: add x10, sp, #584
1206 ; CHECK-NEXT: ld1 { v2.b }[4], [x8]
1207 ; CHECK-NEXT: sshll v19.8h, v4.8b, #0
1208 ; CHECK-NEXT: ld1 { v5.b }[2], [x11]
1209 ; CHECK-NEXT: ld1 { v18.b }[4], [x10]
1210 ; CHECK-NEXT: smull2 v4.4s, v16.8h, v17.8h
1211 ; CHECK-NEXT: smull v16.4s, v16.4h, v17.4h
1212 ; CHECK-NEXT: ldr b17, [sp, #416]
1213 ; CHECK-NEXT: add x11, sp, #504
1214 ; CHECK-NEXT: add x10, sp, #424
1215 ; CHECK-NEXT: add x16, sp, #320
1216 ; CHECK-NEXT: smull v19.4s, v7.4h, v19.4h
1217 ; CHECK-NEXT: movi v7.2d, #0000000000000000
1218 ; CHECK-NEXT: ld1 { v5.b }[3], [x11]
1219 ; CHECK-NEXT: add x11, sp, #392
1220 ; CHECK-NEXT: ld1 { v17.b }[1], [x10]
1221 ; CHECK-NEXT: add x10, sp, #592
1222 ; CHECK-NEXT: ld1 { v2.b }[5], [x16]
1223 ; CHECK-NEXT: ld1 { v6.b }[5], [x11]
1224 ; CHECK-NEXT: ld1 { v18.b }[5], [x10]
1225 ; CHECK-NEXT: add x11, sp, #512
1226 ; CHECK-NEXT: add x10, sp, #432
1227 ; CHECK-NEXT: add x12, sp, #328
1228 ; CHECK-NEXT: mov v7.s[0], v19.s[0]
1229 ; CHECK-NEXT: ld1 { v5.b }[4], [x11]
1230 ; CHECK-NEXT: add x11, sp, #400
1231 ; CHECK-NEXT: ld1 { v17.b }[2], [x10]
1232 ; CHECK-NEXT: add x10, sp, #600
1233 ; CHECK-NEXT: ldr b19, [sp, #680]
1234 ; CHECK-NEXT: ldr b20, [sp, #616]
1235 ; CHECK-NEXT: ld1 { v2.b }[6], [x12]
1236 ; CHECK-NEXT: ld1 { v6.b }[6], [x11]
1237 ; CHECK-NEXT: ld1 { v18.b }[6], [x10]
1238 ; CHECK-NEXT: add x11, sp, #688
1239 ; CHECK-NEXT: add x12, sp, #624
1240 ; CHECK-NEXT: ld1 { v19.b }[1], [x11]
1241 ; CHECK-NEXT: ld1 { v20.b }[1], [x12]
1242 ; CHECK-NEXT: add x10, sp, #408
1243 ; CHECK-NEXT: add x11, sp, #608
1244 ; CHECK-NEXT: add x12, sp, #440
1245 ; CHECK-NEXT: ld1 { v6.b }[7], [x10]
1246 ; CHECK-NEXT: ld1 { v18.b }[7], [x11]
1247 ; CHECK-NEXT: ld1 { v17.b }[3], [x12]
1248 ; CHECK-NEXT: add x10, sp, #696
1249 ; CHECK-NEXT: add x11, sp, #632
1250 ; CHECK-NEXT: ld1 { v19.b }[2], [x10]
1251 ; CHECK-NEXT: add x10, sp, #448
1252 ; CHECK-NEXT: ld1 { v20.b }[2], [x11]
1253 ; CHECK-NEXT: add x11, sp, #640
1254 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0
1255 ; CHECK-NEXT: ld1 { v17.b }[4], [x10]
1256 ; CHECK-NEXT: add x10, sp, #704
1257 ; CHECK-NEXT: sshll v18.8h, v18.8b, #0
1258 ; CHECK-NEXT: ld1 { v19.b }[3], [x10]
1259 ; CHECK-NEXT: add x10, sp, #712
1260 ; CHECK-NEXT: add x12, sp, #520
1261 ; CHECK-NEXT: ld1 { v20.b }[3], [x11]
1262 ; CHECK-NEXT: add x11, sp, #648
1263 ; CHECK-NEXT: ldr b21, [sp, #544]
1264 ; CHECK-NEXT: smull2 v22.4s, v6.8h, v18.8h
1265 ; CHECK-NEXT: smull v6.4s, v6.4h, v18.4h
1266 ; CHECK-NEXT: ldr b18, [sp, #744]
1267 ; CHECK-NEXT: ld1 { v19.b }[4], [x10]
1268 ; CHECK-NEXT: ld1 { v5.b }[5], [x12]
1269 ; CHECK-NEXT: add x12, sp, #656
1270 ; CHECK-NEXT: ld1 { v20.b }[4], [x11]
1271 ; CHECK-NEXT: add x11, sp, #456
1272 ; CHECK-NEXT: sshll v21.8h, v21.8b, #0
1273 ; CHECK-NEXT: ld1 { v17.b }[5], [x11]
1274 ; CHECK-NEXT: add x11, sp, #720
1275 ; CHECK-NEXT: sshll v18.8h, v18.8b, #0
1276 ; CHECK-NEXT: ld1 { v19.b }[5], [x11]
1277 ; CHECK-NEXT: add x10, sp, #528
1278 ; CHECK-NEXT: add x11, sp, #464
1279 ; CHECK-NEXT: ld1 { v20.b }[5], [x12]
1280 ; CHECK-NEXT: ld1 { v5.b }[6], [x10]
1281 ; CHECK-NEXT: add x12, sp, #728
1282 ; CHECK-NEXT: add x13, sp, #664
1283 ; CHECK-NEXT: add x8, sp, #72
1284 ; CHECK-NEXT: ld1 { v17.b }[6], [x11]
1285 ; CHECK-NEXT: ld1 { v19.b }[6], [x12]
1286 ; CHECK-NEXT: ld1 { v1.b }[7], [x8]
1287 ; CHECK-NEXT: add x8, sp, #336
1288 ; CHECK-NEXT: ld1 { v20.b }[6], [x13]
1289 ; CHECK-NEXT: add x9, sp, #272
1290 ; CHECK-NEXT: smull v18.4s, v21.4h, v18.4h
1291 ; CHECK-NEXT: movi v21.2d, #0000000000000000
1292 ; CHECK-NEXT: add x10, sp, #536
1293 ; CHECK-NEXT: ld1 { v2.b }[7], [x8]
1294 ; CHECK-NEXT: ld1 { v3.b }[7], [x9]
1295 ; CHECK-NEXT: ld1 { v5.b }[7], [x10]
1296 ; CHECK-NEXT: add x8, sp, #472
1297 ; CHECK-NEXT: add x9, sp, #736
1298 ; CHECK-NEXT: add x10, sp, #672
1299 ; CHECK-NEXT: ld1 { v17.b }[7], [x8]
1300 ; CHECK-NEXT: ld1 { v19.b }[7], [x9]
1301 ; CHECK-NEXT: ld1 { v20.b }[7], [x10]
1302 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
1303 ; CHECK-NEXT: mov v21.s[0], v18.s[0]
1304 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
1305 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0
1306 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0
1307 ; CHECK-NEXT: sshll v5.8h, v5.8b, #0
1308 ; CHECK-NEXT: sshll v17.8h, v17.8b, #0
1309 ; CHECK-NEXT: sshll v18.8h, v19.8b, #0
1310 ; CHECK-NEXT: sshll v19.8h, v20.8b, #0
1311 ; CHECK-NEXT: smlal v16.4s, v0.4h, v2.4h
1312 ; CHECK-NEXT: smlal2 v4.4s, v0.8h, v2.8h
1313 ; CHECK-NEXT: smlal v7.4s, v1.4h, v3.4h
1314 ; CHECK-NEXT: smlal v6.4s, v5.4h, v18.4h
1315 ; CHECK-NEXT: smlal2 v22.4s, v5.8h, v18.8h
1316 ; CHECK-NEXT: smlal v21.4s, v17.4h, v19.4h
1317 ; CHECK-NEXT: smlal2 v4.4s, v1.8h, v3.8h
1318 ; CHECK-NEXT: add v0.4s, v16.4s, v7.4s
1319 ; CHECK-NEXT: add v1.4s, v6.4s, v21.4s
1320 ; CHECK-NEXT: smlal2 v22.4s, v17.8h, v19.8h
1321 ; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
1322 ; CHECK-NEXT: add v1.4s, v1.4s, v22.4s
1323 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
1324 ; CHECK-NEXT: addv s0, v0.4s
1325 ; CHECK-NEXT: fmov w0, s0
1326 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; Each pair (%a,%b) and (%c,%d) is widened, multiplied and reduced on its own;
; the two scalar results are then added.
1329 %az = sext <25 x i8> %a to <25 x i32>
1330 %bz = sext <25 x i8> %b to <25 x i32>
1331 %m1 = mul nuw nsw <25 x i32> %az, %bz
1332 %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m1)
1333 %cz = sext <25 x i8> %c to <25 x i32>
1334 %dz = sext <25 x i8> %d to <25 x i32>
1335 %m2 = mul nuw nsw <25 x i32> %cz, %dz
1336 %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m2)
1337 %x = add i32 %r1, %r2
; test_sdot_v25i8_double_nomla: sums two independent add-reductions of
; sign-extended <25 x i8> vectors (%a and %c); the IR contains no multiply.
; %b and %d are accepted but are not referenced by the body lines shown here.
; The expected code gathers the byte lanes from w0-w7 and stack slots, widens
; them with sshll and accumulates with saddl/saddw; no sdot is emitted.
1341 define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) {
1342 ; CHECK-LABEL: test_sdot_v25i8_double_nomla:
1343 ; CHECK: // %bb.0: // %entry
1344 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1345 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1346 ; CHECK-NEXT: .cfi_offset w29, -16
1347 ; CHECK-NEXT: fmov s0, w0
1348 ; CHECK-NEXT: ldr b1, [sp, #80]
1349 ; CHECK-NEXT: add x10, sp, #88
1350 ; CHECK-NEXT: ldr b2, [sp, #16]
1351 ; CHECK-NEXT: add x9, sp, #96
1352 ; CHECK-NEXT: ldr b3, [sp, #480]
1353 ; CHECK-NEXT: ld1 { v1.b }[1], [x10]
1354 ; CHECK-NEXT: add x10, sp, #24
1355 ; CHECK-NEXT: ldr b4, [sp, #352]
1356 ; CHECK-NEXT: mov v0.b[1], w1
1357 ; CHECK-NEXT: ld1 { v2.b }[1], [x10]
1358 ; CHECK-NEXT: add x11, sp, #488
1359 ; CHECK-NEXT: add x10, sp, #360
1360 ; CHECK-NEXT: ldr b5, [sp, #416]
1361 ; CHECK-NEXT: add x8, sp, #104
1362 ; CHECK-NEXT: ld1 { v1.b }[2], [x9]
1363 ; CHECK-NEXT: add x9, sp, #32
1364 ; CHECK-NEXT: ld1 { v3.b }[1], [x11]
1365 ; CHECK-NEXT: ld1 { v2.b }[2], [x9]
1366 ; CHECK-NEXT: add x11, sp, #424
1367 ; CHECK-NEXT: ld1 { v4.b }[1], [x10]
1368 ; CHECK-NEXT: mov v0.b[2], w2
1369 ; CHECK-NEXT: ld1 { v5.b }[1], [x11]
1370 ; CHECK-NEXT: add x9, sp, #368
1371 ; CHECK-NEXT: ld1 { v1.b }[3], [x8]
1372 ; CHECK-NEXT: add x8, sp, #40
1373 ; CHECK-NEXT: add x12, sp, #496
1374 ; CHECK-NEXT: ld1 { v2.b }[3], [x8]
1375 ; CHECK-NEXT: ld1 { v4.b }[2], [x9]
1376 ; CHECK-NEXT: add x8, sp, #432
1377 ; CHECK-NEXT: ld1 { v3.b }[2], [x12]
1378 ; CHECK-NEXT: add x13, sp, #48
1379 ; CHECK-NEXT: ld1 { v5.b }[2], [x8]
1380 ; CHECK-NEXT: mov v0.b[3], w3
1381 ; CHECK-NEXT: add x10, sp, #112
1382 ; CHECK-NEXT: add x8, sp, #504
1383 ; CHECK-NEXT: ld1 { v2.b }[4], [x13]
1384 ; CHECK-NEXT: add x13, sp, #376
1385 ; CHECK-NEXT: ld1 { v1.b }[4], [x10]
1386 ; CHECK-NEXT: ld1 { v4.b }[3], [x13]
1387 ; CHECK-NEXT: add x13, sp, #440
1388 ; CHECK-NEXT: ld1 { v3.b }[3], [x8]
1389 ; CHECK-NEXT: ld1 { v5.b }[3], [x13]
1390 ; CHECK-NEXT: add x11, sp, #120
1391 ; CHECK-NEXT: add x8, sp, #56
1392 ; CHECK-NEXT: mov v0.b[4], w4
1393 ; CHECK-NEXT: add x13, sp, #512
1394 ; CHECK-NEXT: ld1 { v1.b }[5], [x11]
1395 ; CHECK-NEXT: ld1 { v2.b }[5], [x8]
1396 ; CHECK-NEXT: add x8, sp, #384
1397 ; CHECK-NEXT: add x11, sp, #448
1398 ; CHECK-NEXT: ld1 { v3.b }[4], [x13]
1399 ; CHECK-NEXT: ld1 { v4.b }[4], [x8]
1400 ; CHECK-NEXT: ld1 { v5.b }[4], [x11]
1401 ; CHECK-NEXT: add x12, sp, #128
1402 ; CHECK-NEXT: add x10, sp, #64
1403 ; CHECK-NEXT: add x8, sp, #520
1404 ; CHECK-NEXT: mov v0.b[5], w5
1405 ; CHECK-NEXT: ld1 { v1.b }[6], [x12]
1406 ; CHECK-NEXT: ld1 { v2.b }[6], [x10]
1407 ; CHECK-NEXT: add x10, sp, #392
1408 ; CHECK-NEXT: add x11, sp, #456
1409 ; CHECK-NEXT: ldr b6, [sp, #144]
1410 ; CHECK-NEXT: ldr b7, [sp, #544]
1411 ; CHECK-NEXT: ld1 { v3.b }[5], [x8]
1412 ; CHECK-NEXT: ld1 { v4.b }[5], [x10]
1413 ; CHECK-NEXT: ld1 { v5.b }[5], [x11]
1414 ; CHECK-NEXT: add x9, sp, #136
1415 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0
1416 ; CHECK-NEXT: mov v0.b[6], w6
1417 ; CHECK-NEXT: ld1 { v1.b }[7], [x9]
1418 ; CHECK-NEXT: add x8, sp, #528
1419 ; CHECK-NEXT: add x9, sp, #400
1420 ; CHECK-NEXT: add x10, sp, #464
1421 ; CHECK-NEXT: sshll v7.8h, v7.8b, #0
1422 ; CHECK-NEXT: ld1 { v3.b }[6], [x8]
1423 ; CHECK-NEXT: ld1 { v4.b }[6], [x9]
1424 ; CHECK-NEXT: ld1 { v5.b }[6], [x10]
1425 ; CHECK-NEXT: movi v16.2d, #0000000000000000
1426 ; CHECK-NEXT: movi v17.2d, #0000000000000000
1427 ; CHECK-NEXT: add x14, sp, #72
1428 ; CHECK-NEXT: mov v0.b[7], w7
1429 ; CHECK-NEXT: sshll v6.4s, v6.4h, #0
1430 ; CHECK-NEXT: add x8, sp, #536
1431 ; CHECK-NEXT: add x9, sp, #408
1432 ; CHECK-NEXT: add x10, sp, #472
1433 ; CHECK-NEXT: sshll v7.4s, v7.4h, #0
1434 ; CHECK-NEXT: ld1 { v2.b }[7], [x14]
1435 ; CHECK-NEXT: ld1 { v3.b }[7], [x8]
1436 ; CHECK-NEXT: ld1 { v4.b }[7], [x9]
1437 ; CHECK-NEXT: ld1 { v5.b }[7], [x10]
1438 ; CHECK-NEXT: mov v16.s[0], v6.s[0]
1439 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
1440 ; CHECK-NEXT: mov v17.s[0], v7.s[0]
1441 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
1442 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0
1443 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0
1444 ; CHECK-NEXT: sshll v4.8h, v4.8b, #0
1445 ; CHECK-NEXT: sshll v5.8h, v5.8b, #0
1446 ; CHECK-NEXT: saddl v7.4s, v0.4h, v1.4h
1447 ; CHECK-NEXT: saddl2 v0.4s, v0.8h, v1.8h
1448 ; CHECK-NEXT: saddw v6.4s, v16.4s, v2.4h
1449 ; CHECK-NEXT: saddl v1.4s, v4.4h, v3.4h
1450 ; CHECK-NEXT: saddl2 v3.4s, v4.8h, v3.8h
1451 ; CHECK-NEXT: saddw v4.4s, v17.4s, v5.4h
1452 ; CHECK-NEXT: saddw2 v0.4s, v0.4s, v2.8h
1453 ; CHECK-NEXT: add v6.4s, v7.4s, v6.4s
1454 ; CHECK-NEXT: saddw2 v2.4s, v3.4s, v5.8h
1455 ; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
1456 ; CHECK-NEXT: add v0.4s, v6.4s, v0.4s
1457 ; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
1458 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
1459 ; CHECK-NEXT: addv s0, v0.4s
1460 ; CHECK-NEXT: fmov w0, s0
1461 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; %a and %c are widened and reduced separately; only the final add joins them.
1464 %az = sext <25 x i8> %a to <25 x i32>
1465 %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %az)
1466 %cz = sext <25 x i8> %c to <25 x i32>
1467 %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %cz)
1468 %x = add i32 %r1, %r2
; test_udot_v32i8: unsigned multiply-accumulate reduction over two <32 x i8>
; vectors loaded from %a and %b, with %sum added to the reduced value.
; The expected code issues two udot instructions, one per 16-byte half of the
; inputs, into a single zero-initialised accumulator, then addv and a scalar
; add of w2.
1472 define i32 @test_udot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
1473 ; CHECK-LABEL: test_udot_v32i8:
1474 ; CHECK: // %bb.0: // %entry
1475 ; CHECK-NEXT: movi v0.2d, #0000000000000000
1476 ; CHECK-NEXT: ldr q1, [x0, #16]
1477 ; CHECK-NEXT: ldr q2, [x1, #16]
1478 ; CHECK-NEXT: udot v0.4s, v2.16b, v1.16b
1479 ; CHECK-NEXT: ldr q1, [x0]
1480 ; CHECK-NEXT: ldr q2, [x1]
1481 ; CHECK-NEXT: udot v0.4s, v2.16b, v1.16b
1482 ; CHECK-NEXT: addv s0, v0.4s
1483 ; CHECK-NEXT: fmov w8, s0
1484 ; CHECK-NEXT: add w0, w8, w2
; Zero-extend both inputs to i32 lanes, multiply lane-wise, reduce, add %sum.
1487 %0 = load <32 x i8>, ptr %a
1488 %1 = zext <32 x i8> %0 to <32 x i32>
1489 %2 = load <32 x i8>, ptr %b
1490 %3 = zext <32 x i8> %2 to <32 x i32>
1491 %4 = mul nuw nsw <32 x i32> %3, %1
1492 %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
1493 %op.extra = add i32 %5, %sum
; test_udot_v32i8_nomla: add-reduction of a zero-extended <32 x i8> vector
; loaded from %a1, with no multiply in the IR.
; The expected code still uses udot, with an all-ones vector (movi #1) as the
; second operand, once per 16-byte half of the input.
1497 define i32 @test_udot_v32i8_nomla(ptr nocapture readonly %a1) {
1498 ; CHECK-LABEL: test_udot_v32i8_nomla:
1499 ; CHECK: // %bb.0: // %entry
1500 ; CHECK-NEXT: movi v0.16b, #1
1501 ; CHECK-NEXT: movi v1.2d, #0000000000000000
1502 ; CHECK-NEXT: ldr q2, [x0, #16]
1503 ; CHECK-NEXT: udot v1.4s, v2.16b, v0.16b
1504 ; CHECK-NEXT: ldr q2, [x0]
1505 ; CHECK-NEXT: udot v1.4s, v2.16b, v0.16b
1506 ; CHECK-NEXT: addv s0, v1.4s
1507 ; CHECK-NEXT: fmov w0, s0
; Zero-extend the loaded bytes to i32 lanes and reduce them directly.
1510 %0 = load <32 x i8>, ptr %a1
1511 %1 = zext <32 x i8> %0 to <32 x i32>
1512 %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
; test_sdot_v32i8: signed multiply-accumulate reduction over two <32 x i8>
; vectors loaded from %a and %b, with %sum added to the reduced value.
; The expected code issues two sdot instructions, one per 16-byte half of the
; inputs, into a single zero-initialised accumulator, then addv and a scalar
; add of w2.
1515 define i32 @test_sdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
1516 ; CHECK-LABEL: test_sdot_v32i8:
1517 ; CHECK: // %bb.0: // %entry
1518 ; CHECK-NEXT: movi v0.2d, #0000000000000000
1519 ; CHECK-NEXT: ldr q1, [x0, #16]
1520 ; CHECK-NEXT: ldr q2, [x1, #16]
1521 ; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b
1522 ; CHECK-NEXT: ldr q1, [x0]
1523 ; CHECK-NEXT: ldr q2, [x1]
1524 ; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b
1525 ; CHECK-NEXT: addv s0, v0.4s
1526 ; CHECK-NEXT: fmov w8, s0
1527 ; CHECK-NEXT: add w0, w8, w2
; Sign-extend both inputs to i32 lanes, multiply lane-wise, reduce, add %sum.
1530 %0 = load <32 x i8>, ptr %a
1531 %1 = sext <32 x i8> %0 to <32 x i32>
1532 %2 = load <32 x i8>, ptr %b
1533 %3 = sext <32 x i8> %2 to <32 x i32>
1534 %4 = mul nsw <32 x i32> %3, %1
1535 %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
1536 %op.extra = add nsw i32 %5, %sum
; test_sdot_v32i8_double: two signed multiply-reductions over by-value
; <32 x i8> arguments, reduce(sext(%a) * sext(%b)) + reduce(sext(%c) * sext(%d)).
; The expected code reads the operands straight from v0-v7 and uses two
; zero-initialised accumulators with two sdot instructions each, followed by a
; vector add and addv.
; NOTE(review): both multiplies carry nuw although their operands are
; sign-extended, whereas test_sdot_v32i8 uses nsw only - confirm the nuw flag
; is intended.
1540 define i32 @test_sdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
1541 ; CHECK-LABEL: test_sdot_v32i8_double:
1542 ; CHECK: // %bb.0: // %entry
1543 ; CHECK-NEXT: movi v16.2d, #0000000000000000
1544 ; CHECK-NEXT: movi v17.2d, #0000000000000000
1545 ; CHECK-NEXT: sdot v17.4s, v1.16b, v3.16b
1546 ; CHECK-NEXT: sdot v16.4s, v5.16b, v7.16b
1547 ; CHECK-NEXT: sdot v17.4s, v0.16b, v2.16b
1548 ; CHECK-NEXT: sdot v16.4s, v4.16b, v6.16b
1549 ; CHECK-NEXT: add v0.4s, v17.4s, v16.4s
1550 ; CHECK-NEXT: addv s0, v0.4s
1551 ; CHECK-NEXT: fmov w0, s0
; Each pair (%a,%b) and (%c,%d) is widened, multiplied and reduced on its own;
; the two scalar results are then added.
1554 %az = sext <32 x i8> %a to <32 x i32>
1555 %bz = sext <32 x i8> %b to <32 x i32>
1556 %m1 = mul nuw nsw <32 x i32> %az, %bz
1557 %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1)
1558 %cz = sext <32 x i8> %c to <32 x i32>
1559 %dz = sext <32 x i8> %d to <32 x i32>
1560 %m2 = mul nuw nsw <32 x i32> %cz, %dz
1561 %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2)
1562 %x = add i32 %r1, %r2
; test_sdot_v32i8_double_nomla: sums two independent add-reductions of
; sign-extended <32 x i8> vectors (%a and %c); the IR contains no multiply.
; %b and %d are accepted but are not referenced by the body lines shown here.
; The expected code uses sdot against an all-ones vector (movi #1), with one
; accumulator per input and two sdot instructions per accumulator.
1566 define i32 @test_sdot_v32i8_double_nomla(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
1567 ; CHECK-LABEL: test_sdot_v32i8_double_nomla:
1568 ; CHECK: // %bb.0: // %entry
1569 ; CHECK-NEXT: movi v2.16b, #1
1570 ; CHECK-NEXT: movi v3.2d, #0000000000000000
1571 ; CHECK-NEXT: movi v6.2d, #0000000000000000
1572 ; CHECK-NEXT: sdot v6.4s, v1.16b, v2.16b
1573 ; CHECK-NEXT: sdot v3.4s, v5.16b, v2.16b
1574 ; CHECK-NEXT: sdot v6.4s, v0.16b, v2.16b
1575 ; CHECK-NEXT: sdot v3.4s, v4.16b, v2.16b
1576 ; CHECK-NEXT: add v0.4s, v6.4s, v3.4s
1577 ; CHECK-NEXT: addv s0, v0.4s
1578 ; CHECK-NEXT: fmov w0, s0
; %a and %c are widened and reduced separately; only the final add joins them.
1581 %az = sext <32 x i8> %a to <32 x i32>
1582 %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %az)
1583 %cz = sext <32 x i8> %c to <32 x i32>
1584 %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %cz)
1585 %x = add i32 %r1, %r2
; test_udot_v33i8: unsigned multiply-accumulate reduction over two <33 x i8>
; vectors loaded from %a and %b, with %sum added to the reduced value.
; The expected code loads 32 bytes with ldp plus the last byte with ldr b,
; widens with ushll/ushll2 and multiplies with umull/umlal; no udot is emitted.
; NOTE(review): presumably the odd lane count (33) is what keeps this off the
; dot-product path - confirm against the AArch64 lowering.
1589 define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
1590 ; CHECK-LABEL: test_udot_v33i8:
1591 ; CHECK: // %bb.0: // %entry
1592 ; CHECK-NEXT: ldp q2, q3, [x0]
1593 ; CHECK-NEXT: movi v18.2d, #0000000000000000
1594 ; CHECK-NEXT: ldp q4, q5, [x1]
1595 ; CHECK-NEXT: ldr b0, [x0, #32]
1596 ; CHECK-NEXT: ldr b1, [x1, #32]
1597 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1598 ; CHECK-NEXT: ushll v6.8h, v2.8b, #0
1599 ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0
1600 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
1601 ; CHECK-NEXT: ushll v7.8h, v4.8b, #0
1602 ; CHECK-NEXT: ushll2 v4.8h, v4.16b, #0
1603 ; CHECK-NEXT: ushll2 v16.8h, v3.16b, #0
1604 ; CHECK-NEXT: ushll v3.8h, v3.8b, #0
1605 ; CHECK-NEXT: ushll2 v19.8h, v5.16b, #0
1606 ; CHECK-NEXT: ushll v5.8h, v5.8b, #0
1607 ; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h
1608 ; CHECK-NEXT: umull2 v1.4s, v7.8h, v6.8h
1609 ; CHECK-NEXT: umull2 v17.4s, v4.8h, v2.8h
1610 ; CHECK-NEXT: umull v2.4s, v4.4h, v2.4h
1611 ; CHECK-NEXT: umlal2 v17.4s, v19.8h, v16.8h
1612 ; CHECK-NEXT: umlal2 v1.4s, v5.8h, v3.8h
1613 ; CHECK-NEXT: mov v18.s[0], v0.s[0]
1614 ; CHECK-NEXT: umlal v2.4s, v19.4h, v16.4h
1615 ; CHECK-NEXT: add v0.4s, v1.4s, v17.4s
1616 ; CHECK-NEXT: umlal v18.4s, v7.4h, v6.4h
1617 ; CHECK-NEXT: umlal v18.4s, v5.4h, v3.4h
1618 ; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
1619 ; CHECK-NEXT: add v0.4s, v18.4s, v0.4s
1620 ; CHECK-NEXT: addv s0, v0.4s
1621 ; CHECK-NEXT: fmov w8, s0
1622 ; CHECK-NEXT: add w0, w8, w2
; Zero-extend both inputs to i32 lanes, multiply lane-wise, reduce, add %sum.
1625 %0 = load <33 x i8>, ptr %a
1626 %1 = zext <33 x i8> %0 to <33 x i32>
1627 %2 = load <33 x i8>, ptr %b
1628 %3 = zext <33 x i8> %2 to <33 x i32>
1629 %4 = mul nuw nsw <33 x i32> %3, %1
1630 %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4)
1631 %op.extra = add i32 %5, %sum
; test_udot_v33i8_nomla: add-reduction of a zero-extended <33 x i8> vector
; loaded from %a1, with no multiply in the IR.
; The expected code loads 32 bytes with ldp plus the last byte with ldr b,
; widens with ushll/ushll2 and accumulates with uaddl/uaddw; no udot is
; emitted.
1635 define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) {
1636 ; CHECK-LABEL: test_udot_v33i8_nomla:
1637 ; CHECK: // %bb.0: // %entry
1638 ; CHECK-NEXT: ldr b1, [x0, #32]
1639 ; CHECK-NEXT: ldp q3, q2, [x0]
1640 ; CHECK-NEXT: movi v0.2d, #0000000000000000
1641 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
1642 ; CHECK-NEXT: ushll v4.8h, v2.8b, #0
1643 ; CHECK-NEXT: ushll v5.8h, v3.8b, #0
1644 ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0
1645 ; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0
1646 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0
1647 ; CHECK-NEXT: uaddl2 v6.4s, v3.8h, v2.8h
1648 ; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h
1649 ; CHECK-NEXT: mov v0.s[0], v1.s[0]
1650 ; CHECK-NEXT: uaddl2 v1.4s, v5.8h, v4.8h
1651 ; CHECK-NEXT: add v1.4s, v1.4s, v6.4s
1652 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v5.4h
1653 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v4.4h
1654 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
1655 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
1656 ; CHECK-NEXT: addv s0, v0.4s
1657 ; CHECK-NEXT: fmov w0, s0
; Zero-extend the loaded bytes to i32 lanes and reduce them directly.
1660 %0 = load <33 x i8>, ptr %a1
1661 %1 = zext <33 x i8> %0 to <33 x i32>
1662 %2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %1)
; test_sdot_v33i8: signed multiply-accumulate reduction over two <33 x i8>
; vectors loaded from %a and %b, with %sum added to the reduced value.
; The expected code loads 32 bytes with ldp plus the last byte with ldr b,
; widens with sshll/sshll2 and multiplies with smull/smlal; no sdot is emitted.
1665 define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
1666 ; CHECK-LABEL: test_sdot_v33i8:
1667 ; CHECK: // %bb.0: // %entry
1668 ; CHECK-NEXT: ldp q2, q3, [x0]
1669 ; CHECK-NEXT: movi v18.2d, #0000000000000000
1670 ; CHECK-NEXT: ldp q4, q5, [x1]
1671 ; CHECK-NEXT: ldr b0, [x0, #32]
1672 ; CHECK-NEXT: ldr b1, [x1, #32]
1673 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
1674 ; CHECK-NEXT: sshll v6.8h, v2.8b, #0
1675 ; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0
1676 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
1677 ; CHECK-NEXT: sshll v7.8h, v4.8b, #0
1678 ; CHECK-NEXT: sshll2 v4.8h, v4.16b, #0
1679 ; CHECK-NEXT: sshll2 v16.8h, v3.16b, #0
1680 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0
1681 ; CHECK-NEXT: sshll2 v19.8h, v5.16b, #0
1682 ; CHECK-NEXT: sshll v5.8h, v5.8b, #0
1683 ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
1684 ; CHECK-NEXT: smull2 v1.4s, v7.8h, v6.8h
1685 ; CHECK-NEXT: smull2 v17.4s, v4.8h, v2.8h
1686 ; CHECK-NEXT: smull v2.4s, v4.4h, v2.4h
1687 ; CHECK-NEXT: smlal2 v17.4s, v19.8h, v16.8h
1688 ; CHECK-NEXT: smlal2 v1.4s, v5.8h, v3.8h
1689 ; CHECK-NEXT: mov v18.s[0], v0.s[0]
1690 ; CHECK-NEXT: smlal v2.4s, v19.4h, v16.4h
1691 ; CHECK-NEXT: add v0.4s, v1.4s, v17.4s
1692 ; CHECK-NEXT: smlal v18.4s, v7.4h, v6.4h
1693 ; CHECK-NEXT: smlal v18.4s, v5.4h, v3.4h
1694 ; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
1695 ; CHECK-NEXT: add v0.4s, v18.4s, v0.4s
1696 ; CHECK-NEXT: addv s0, v0.4s
1697 ; CHECK-NEXT: fmov w8, s0
1698 ; CHECK-NEXT: add w0, w8, w2
; Sign-extend both inputs to i32 lanes, multiply lane-wise, reduce, add %sum.
1701 %0 = load <33 x i8>, ptr %a
1702 %1 = sext <33 x i8> %0 to <33 x i32>
1703 %2 = load <33 x i8>, ptr %b
1704 %3 = sext <33 x i8> %2 to <33 x i32>
1705 %4 = mul nsw <33 x i32> %3, %1
1706 %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4)
1707 %op.extra = add nsw i32 %5, %sum
; Signed dot-product reduction over two pairs of <33 x i8> vector arguments:
;   reduce.add(sext(a) * sext(b)) + reduce.add(sext(c) * sext(d)).
; The expected code below contains no sdot instruction for this 33-element
; width: lanes are assembled from w0-w7 and from stack slots, widened with
; sshll, multiplied/accumulated with smull/smlal (and the "2" high-half forms),
; and folded with vector add plus a final addv.
; The CHECK lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
1711 define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 x i8> %d) {
1712 ; CHECK-LABEL: test_sdot_v33i8_double:
1713 ; CHECK: // %bb.0: // %entry
1714 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1715 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1716 ; CHECK-NEXT: .cfi_offset w29, -16
1717 ; CHECK-NEXT: ldr b0, [sp, #80]
1718 ; CHECK-NEXT: add x8, sp, #88
1719 ; CHECK-NEXT: ldr b2, [sp, #144]
1720 ; CHECK-NEXT: fmov s4, w0
1721 ; CHECK-NEXT: add x10, sp, #152
1722 ; CHECK-NEXT: ldr b3, [sp, #16]
1723 ; CHECK-NEXT: ld1 { v0.b }[1], [x8]
1724 ; CHECK-NEXT: ld1 { v2.b }[1], [x10]
1725 ; CHECK-NEXT: add x10, sp, #24
1726 ; CHECK-NEXT: ldr b1, [sp, #344]
1727 ; CHECK-NEXT: add x9, sp, #96
1728 ; CHECK-NEXT: ld1 { v3.b }[1], [x10]
1729 ; CHECK-NEXT: add x10, sp, #352
1730 ; CHECK-NEXT: mov v4.b[1], w1
1731 ; CHECK-NEXT: add x8, sp, #104
1732 ; CHECK-NEXT: ld1 { v0.b }[2], [x9]
1733 ; CHECK-NEXT: add x9, sp, #160
1734 ; CHECK-NEXT: ld1 { v1.b }[1], [x10]
1735 ; CHECK-NEXT: ld1 { v2.b }[2], [x9]
1736 ; CHECK-NEXT: add x9, sp, #32
1737 ; CHECK-NEXT: add x12, sp, #360
1738 ; CHECK-NEXT: ld1 { v3.b }[2], [x9]
1739 ; CHECK-NEXT: add x11, sp, #112
1740 ; CHECK-NEXT: add x10, sp, #120
1741 ; CHECK-NEXT: ld1 { v1.b }[2], [x12]
1742 ; CHECK-NEXT: add x12, sp, #168
1743 ; CHECK-NEXT: ld1 { v0.b }[3], [x8]
1744 ; CHECK-NEXT: mov v4.b[2], w2
1745 ; CHECK-NEXT: ld1 { v2.b }[3], [x12]
1746 ; CHECK-NEXT: add x12, sp, #40
1747 ; CHECK-NEXT: ld1 { v3.b }[3], [x12]
1748 ; CHECK-NEXT: add x13, sp, #176
1749 ; CHECK-NEXT: ldr b16, [sp, #216]
1750 ; CHECK-NEXT: ld1 { v0.b }[4], [x11]
1751 ; CHECK-NEXT: add x11, sp, #48
1752 ; CHECK-NEXT: add x12, sp, #368
1753 ; CHECK-NEXT: ld1 { v2.b }[4], [x13]
1754 ; CHECK-NEXT: add x13, sp, #224
1755 ; CHECK-NEXT: add x9, sp, #128
1756 ; CHECK-NEXT: mov v4.b[3], w3
1757 ; CHECK-NEXT: ld1 { v3.b }[4], [x11]
1758 ; CHECK-NEXT: ld1 { v16.b }[1], [x13]
1759 ; CHECK-NEXT: ld1 { v0.b }[5], [x10]
1760 ; CHECK-NEXT: add x10, sp, #56
1761 ; CHECK-NEXT: ld1 { v1.b }[3], [x12]
1762 ; CHECK-NEXT: add x12, sp, #184
1763 ; CHECK-NEXT: ldr b5, [sp, #280]
1764 ; CHECK-NEXT: add x11, sp, #376
1765 ; CHECK-NEXT: ld1 { v3.b }[5], [x10]
1766 ; CHECK-NEXT: ld1 { v2.b }[5], [x12]
1767 ; CHECK-NEXT: add x10, sp, #232
1768 ; CHECK-NEXT: mov v4.b[4], w4
1769 ; CHECK-NEXT: ld1 { v0.b }[6], [x9]
1770 ; CHECK-NEXT: add x9, sp, #288
1771 ; CHECK-NEXT: add x15, sp, #64
1772 ; CHECK-NEXT: ld1 { v16.b }[2], [x10]
1773 ; CHECK-NEXT: ldr b17, [sp, #408]
1774 ; CHECK-NEXT: ld1 { v5.b }[1], [x9]
1775 ; CHECK-NEXT: add x14, sp, #192
1776 ; CHECK-NEXT: ld1 { v1.b }[4], [x11]
1777 ; CHECK-NEXT: ld1 { v3.b }[6], [x15]
1778 ; CHECK-NEXT: add x15, sp, #416
1779 ; CHECK-NEXT: ld1 { v2.b }[6], [x14]
1780 ; CHECK-NEXT: add x14, sp, #240
1781 ; CHECK-NEXT: ld1 { v17.b }[1], [x15]
1782 ; CHECK-NEXT: add x9, sp, #296
1783 ; CHECK-NEXT: add x8, sp, #136
1784 ; CHECK-NEXT: mov v4.b[5], w5
1785 ; CHECK-NEXT: add x13, sp, #384
1786 ; CHECK-NEXT: ld1 { v16.b }[3], [x14]
1787 ; CHECK-NEXT: ld1 { v5.b }[2], [x9]
1788 ; CHECK-NEXT: ld1 { v1.b }[5], [x13]
1789 ; CHECK-NEXT: ld1 { v0.b }[7], [x8]
1790 ; CHECK-NEXT: add x8, sp, #424
1791 ; CHECK-NEXT: add x9, sp, #248
1792 ; CHECK-NEXT: ld1 { v17.b }[2], [x8]
1793 ; CHECK-NEXT: add x8, sp, #304
1794 ; CHECK-NEXT: add x10, sp, #392
1795 ; CHECK-NEXT: ld1 { v16.b }[4], [x9]
1796 ; CHECK-NEXT: ld1 { v5.b }[3], [x8]
1797 ; CHECK-NEXT: mov v4.b[6], w6
1798 ; CHECK-NEXT: ld1 { v1.b }[6], [x10]
1799 ; CHECK-NEXT: add x10, sp, #432
1800 ; CHECK-NEXT: add x9, sp, #256
1801 ; CHECK-NEXT: ld1 { v17.b }[3], [x10]
1802 ; CHECK-NEXT: add x10, sp, #312
1803 ; CHECK-NEXT: ldr b22, [sp, #608]
1804 ; CHECK-NEXT: add x8, sp, #400
1805 ; CHECK-NEXT: ld1 { v16.b }[5], [x9]
1806 ; CHECK-NEXT: ld1 { v5.b }[4], [x10]
1807 ; CHECK-NEXT: add x9, sp, #616
1808 ; CHECK-NEXT: ld1 { v1.b }[7], [x8]
1809 ; CHECK-NEXT: add x8, sp, #440
1810 ; CHECK-NEXT: ld1 { v22.b }[1], [x9]
1811 ; CHECK-NEXT: mov v4.b[7], w7
1812 ; CHECK-NEXT: ld1 { v17.b }[4], [x8]
1813 ; CHECK-NEXT: add x8, sp, #320
1814 ; CHECK-NEXT: add x10, sp, #448
1815 ; CHECK-NEXT: ldr b6, [sp, #208]
1816 ; CHECK-NEXT: ld1 { v5.b }[5], [x8]
1817 ; CHECK-NEXT: add x8, sp, #624
1818 ; CHECK-NEXT: ldr b7, [sp, #472]
1819 ; CHECK-NEXT: ld1 { v22.b }[2], [x8]
1820 ; CHECK-NEXT: ld1 { v17.b }[5], [x10]
1821 ; CHECK-NEXT: add x10, sp, #328
1822 ; CHECK-NEXT: sshll v20.8h, v4.8b, #0
1823 ; CHECK-NEXT: ldr b4, [sp, #480]
1824 ; CHECK-NEXT: add x8, sp, #456
1825 ; CHECK-NEXT: ld1 { v5.b }[6], [x10]
1826 ; CHECK-NEXT: add x10, sp, #632
1827 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0
1828 ; CHECK-NEXT: ld1 { v22.b }[3], [x10]
1829 ; CHECK-NEXT: add x10, sp, #488
1830 ; CHECK-NEXT: ld1 { v17.b }[6], [x8]
1831 ; CHECK-NEXT: add x8, sp, #336
1832 ; CHECK-NEXT: ld1 { v4.b }[1], [x10]
1833 ; CHECK-NEXT: sshll v7.8h, v7.8b, #0
1834 ; CHECK-NEXT: ld1 { v5.b }[7], [x8]
1835 ; CHECK-NEXT: add x8, sp, #640
1836 ; CHECK-NEXT: add x9, sp, #264
1837 ; CHECK-NEXT: ld1 { v22.b }[4], [x8]
1838 ; CHECK-NEXT: add x8, sp, #496
1839 ; CHECK-NEXT: ld1 { v16.b }[6], [x9]
1840 ; CHECK-NEXT: ld1 { v4.b }[2], [x8]
1841 ; CHECK-NEXT: add x8, sp, #648
1842 ; CHECK-NEXT: smull v18.4s, v6.4h, v7.4h
1843 ; CHECK-NEXT: ldr b7, [sp, #544]
1844 ; CHECK-NEXT: add x9, sp, #272
1845 ; CHECK-NEXT: movi v6.2d, #0000000000000000
1846 ; CHECK-NEXT: ld1 { v22.b }[5], [x8]
1847 ; CHECK-NEXT: add x8, sp, #504
1848 ; CHECK-NEXT: ld1 { v16.b }[7], [x9]
1849 ; CHECK-NEXT: ld1 { v4.b }[3], [x8]
1850 ; CHECK-NEXT: add x8, sp, #552
1851 ; CHECK-NEXT: add x9, sp, #656
1852 ; CHECK-NEXT: ld1 { v7.b }[1], [x8]
1853 ; CHECK-NEXT: add x8, sp, #512
1854 ; CHECK-NEXT: ldr b21, [sp, #672]
1855 ; CHECK-NEXT: ld1 { v22.b }[6], [x9]
1856 ; CHECK-NEXT: mov v6.s[0], v18.s[0]
1857 ; CHECK-NEXT: add x9, sp, #664
1858 ; CHECK-NEXT: ld1 { v4.b }[4], [x8]
1859 ; CHECK-NEXT: add x8, sp, #560
1860 ; CHECK-NEXT: sshll v23.8h, v16.8b, #0
1861 ; CHECK-NEXT: ld1 { v7.b }[2], [x8]
1862 ; CHECK-NEXT: add x8, sp, #520
1863 ; CHECK-NEXT: movi v19.2d, #0000000000000000
1864 ; CHECK-NEXT: ld1 { v22.b }[7], [x9]
1865 ; CHECK-NEXT: add x9, sp, #528
1866 ; CHECK-NEXT: add x10, sp, #464
1867 ; CHECK-NEXT: ld1 { v4.b }[5], [x8]
1868 ; CHECK-NEXT: add x8, sp, #568
1869 ; CHECK-NEXT: smull2 v18.4s, v20.8h, v23.8h
1870 ; CHECK-NEXT: ld1 { v7.b }[3], [x8]
1871 ; CHECK-NEXT: add x8, sp, #680
1872 ; CHECK-NEXT: smlal v6.4s, v20.4h, v23.4h
1873 ; CHECK-NEXT: ld1 { v21.b }[1], [x8]
1874 ; CHECK-NEXT: sshll v20.8h, v22.8b, #0
1875 ; CHECK-NEXT: ldr b22, [sp, #736]
1876 ; CHECK-NEXT: ld1 { v4.b }[6], [x9]
1877 ; CHECK-NEXT: add x9, sp, #576
1878 ; CHECK-NEXT: ldr b23, [sp, #1000]
1879 ; CHECK-NEXT: ld1 { v7.b }[4], [x9]
1880 ; CHECK-NEXT: add x9, sp, #688
1881 ; CHECK-NEXT: sshll v24.8h, v22.8b, #0
1882 ; CHECK-NEXT: ld1 { v21.b }[2], [x9]
1883 ; CHECK-NEXT: add x9, sp, #696
1884 ; CHECK-NEXT: sshll v25.8h, v23.8b, #0
1885 ; CHECK-NEXT: add x8, sp, #536
1886 ; CHECK-NEXT: ldr b22, [sp, #872]
1887 ; CHECK-NEXT: ldr b23, [sp, #936]
1888 ; CHECK-NEXT: ld1 { v4.b }[7], [x8]
1889 ; CHECK-NEXT: add x8, sp, #584
1890 ; CHECK-NEXT: ld1 { v17.b }[7], [x10]
1891 ; CHECK-NEXT: ld1 { v21.b }[3], [x9]
1892 ; CHECK-NEXT: ld1 { v7.b }[5], [x8]
1893 ; CHECK-NEXT: add x8, sp, #880
1894 ; CHECK-NEXT: add x9, sp, #704
1895 ; CHECK-NEXT: smull v25.4s, v24.4h, v25.4h
1896 ; CHECK-NEXT: ldr b24, [sp, #744]
1897 ; CHECK-NEXT: ld1 { v22.b }[1], [x8]
1898 ; CHECK-NEXT: add x8, sp, #944
1899 ; CHECK-NEXT: add x10, sp, #888
1900 ; CHECK-NEXT: ld1 { v21.b }[4], [x9]
1901 ; CHECK-NEXT: add x9, sp, #752
1902 ; CHECK-NEXT: ld1 { v23.b }[1], [x8]
1903 ; CHECK-NEXT: ld1 { v24.b }[1], [x9]
1904 ; CHECK-NEXT: add x8, sp, #712
1905 ; CHECK-NEXT: add x9, sp, #760
1906 ; CHECK-NEXT: ld1 { v22.b }[2], [x10]
1907 ; CHECK-NEXT: add x10, sp, #952
1908 ; CHECK-NEXT: mov v19.s[0], v25.s[0]
1909 ; CHECK-NEXT: ldr b25, [sp, #808]
1910 ; CHECK-NEXT: ld1 { v23.b }[2], [x10]
1911 ; CHECK-NEXT: ld1 { v21.b }[5], [x8]
1912 ; CHECK-NEXT: ld1 { v24.b }[2], [x9]
1913 ; CHECK-NEXT: add x8, sp, #816
1914 ; CHECK-NEXT: add x9, sp, #896
1915 ; CHECK-NEXT: ld1 { v25.b }[1], [x8]
1916 ; CHECK-NEXT: add x8, sp, #960
1917 ; CHECK-NEXT: ld1 { v22.b }[3], [x9]
1918 ; CHECK-NEXT: add x9, sp, #768
1919 ; CHECK-NEXT: ld1 { v23.b }[3], [x8]
1920 ; CHECK-NEXT: add x10, sp, #904
1921 ; CHECK-NEXT: ld1 { v24.b }[3], [x9]
1922 ; CHECK-NEXT: add x9, sp, #824
1923 ; CHECK-NEXT: add x8, sp, #720
1924 ; CHECK-NEXT: ld1 { v25.b }[2], [x9]
1925 ; CHECK-NEXT: add x9, sp, #968
1926 ; CHECK-NEXT: ld1 { v22.b }[4], [x10]
1927 ; CHECK-NEXT: add x10, sp, #776
1928 ; CHECK-NEXT: ld1 { v23.b }[4], [x9]
1929 ; CHECK-NEXT: ld1 { v21.b }[6], [x8]
1930 ; CHECK-NEXT: ld1 { v24.b }[4], [x10]
1931 ; CHECK-NEXT: add x8, sp, #832
1932 ; CHECK-NEXT: add x9, sp, #912
1933 ; CHECK-NEXT: ld1 { v25.b }[3], [x8]
1934 ; CHECK-NEXT: add x8, sp, #976
1935 ; CHECK-NEXT: ld1 { v22.b }[5], [x9]
1936 ; CHECK-NEXT: add x9, sp, #784
1937 ; CHECK-NEXT: ld1 { v23.b }[5], [x8]
1938 ; CHECK-NEXT: add x10, sp, #920
1939 ; CHECK-NEXT: ld1 { v24.b }[5], [x9]
1940 ; CHECK-NEXT: add x9, sp, #840
1941 ; CHECK-NEXT: add x8, sp, #728
1942 ; CHECK-NEXT: ld1 { v25.b }[4], [x9]
1943 ; CHECK-NEXT: add x9, sp, #984
1944 ; CHECK-NEXT: ld1 { v22.b }[6], [x10]
1945 ; CHECK-NEXT: add x10, sp, #792
1946 ; CHECK-NEXT: ld1 { v23.b }[6], [x9]
1947 ; CHECK-NEXT: ld1 { v21.b }[7], [x8]
1948 ; CHECK-NEXT: ld1 { v24.b }[6], [x10]
1949 ; CHECK-NEXT: add x8, sp, #848
1950 ; CHECK-NEXT: add x9, sp, #928
1951 ; CHECK-NEXT: ld1 { v25.b }[5], [x8]
1952 ; CHECK-NEXT: add x12, sp, #72
1953 ; CHECK-NEXT: add x8, sp, #992
1954 ; CHECK-NEXT: ld1 { v22.b }[7], [x9]
1955 ; CHECK-NEXT: add x9, sp, #800
1956 ; CHECK-NEXT: ld1 { v3.b }[7], [x12]
1957 ; CHECK-NEXT: ld1 { v23.b }[7], [x8]
1958 ; CHECK-NEXT: add x8, sp, #592
1959 ; CHECK-NEXT: ld1 { v24.b }[7], [x9]
1960 ; CHECK-NEXT: add x9, sp, #856
1961 ; CHECK-NEXT: ld1 { v7.b }[6], [x8]
1962 ; CHECK-NEXT: add x11, sp, #200
1963 ; CHECK-NEXT: ld1 { v25.b }[6], [x9]
1964 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0
1965 ; CHECK-NEXT: sshll v5.8h, v5.8b, #0
1966 ; CHECK-NEXT: sshll v4.8h, v4.8b, #0
1967 ; CHECK-NEXT: sshll v21.8h, v21.8b, #0
1968 ; CHECK-NEXT: sshll v22.8h, v22.8b, #0
1969 ; CHECK-NEXT: sshll v23.8h, v23.8b, #0
1970 ; CHECK-NEXT: add x8, sp, #600
1971 ; CHECK-NEXT: sshll v24.8h, v24.8b, #0
1972 ; CHECK-NEXT: add x9, sp, #864
1973 ; CHECK-NEXT: ld1 { v2.b }[7], [x11]
1974 ; CHECK-NEXT: ld1 { v7.b }[7], [x8]
1975 ; CHECK-NEXT: ld1 { v25.b }[7], [x9]
1976 ; CHECK-NEXT: smull v16.4s, v3.4h, v5.4h
1977 ; CHECK-NEXT: smull2 v3.4s, v3.8h, v5.8h
1978 ; CHECK-NEXT: smull v5.4s, v21.4h, v23.4h
1979 ; CHECK-NEXT: smull2 v21.4s, v21.8h, v23.8h
1980 ; CHECK-NEXT: smull2 v23.4s, v20.8h, v22.8h
1981 ; CHECK-NEXT: smlal v19.4s, v4.4h, v24.4h
1982 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0
1983 ; CHECK-NEXT: sshll v17.8h, v17.8b, #0
1984 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
1985 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
1986 ; CHECK-NEXT: sshll v7.8h, v7.8b, #0
1987 ; CHECK-NEXT: sshll v25.8h, v25.8b, #0
1988 ; CHECK-NEXT: smlal2 v3.4s, v2.8h, v17.8h
1989 ; CHECK-NEXT: smlal v16.4s, v2.4h, v17.4h
1990 ; CHECK-NEXT: smlal2 v23.4s, v4.8h, v24.8h
1991 ; CHECK-NEXT: smlal2 v18.4s, v0.8h, v1.8h
1992 ; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h
1993 ; CHECK-NEXT: smlal v19.4s, v20.4h, v22.4h
1994 ; CHECK-NEXT: smlal2 v21.4s, v7.8h, v25.8h
1995 ; CHECK-NEXT: smlal v5.4s, v7.4h, v25.4h
1996 ; CHECK-NEXT: add v0.4s, v18.4s, v3.4s
1997 ; CHECK-NEXT: add v1.4s, v6.4s, v16.4s
1998 ; CHECK-NEXT: add v2.4s, v23.4s, v21.4s
1999 ; CHECK-NEXT: add v3.4s, v19.4s, v5.4s
2000 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
2001 ; CHECK-NEXT: add v1.4s, v3.4s, v2.4s
2002 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
2003 ; CHECK-NEXT: addv s0, v0.4s
2004 ; CHECK-NEXT: fmov w0, s0
2005 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; IR under test: each input pair is sign-extended to i32 and multiplied
; element-wise, each product vector is reduced on its own, and the two scalar
; sums are added into %x.
; NOTE(review): the extended values are named %az/%bz/%cz/%dz and both
; multiplies carry `nuw` even though the operands come from sext - this looks
; inherited from the unsigned variants of the test; confirm it is intended.
2008 %az = sext <33 x i8> %a to <33 x i32>
2009 %bz = sext <33 x i8> %b to <33 x i32>
2010 %m1 = mul nuw nsw <33 x i32> %az, %bz
2011 %r1 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %m1)
2012 %cz = sext <33 x i8> %c to <33 x i32>
2013 %dz = sext <33 x i8> %d to <33 x i32>
2014 %m2 = mul nuw nsw <33 x i32> %cz, %dz
2015 %r2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %m2)
2016 %x = add i32 %r1, %r2
; "No multiply" variant of the double <33 x i8> signed reduction: only %a and
; %c are used by the visible IR (sign-extended to i32 and reduce-added), and
; the two scalar sums are added together; %b and %d are not referenced.
; The expected code contains no sdot instruction: lanes are assembled from
; w0-w7 and stack slots, widened with sshll, and summed with the widening
; adds saddl/saddl2/saddw followed by vector add and a final addv.
; The CHECK lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
2020 define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 x i8> %d) {
2021 ; CHECK-LABEL: test_sdot_v33i8_double_nomla:
2022 ; CHECK: // %bb.0: // %entry
2023 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
2024 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2025 ; CHECK-NEXT: .cfi_offset w29, -16
2026 ; CHECK-NEXT: ldr b1, [sp, #80]
2027 ; CHECK-NEXT: add x8, sp, #88
2028 ; CHECK-NEXT: ldr b2, [sp, #144]
2029 ; CHECK-NEXT: add x9, sp, #152
2030 ; CHECK-NEXT: ldr b3, [sp, #16]
2031 ; CHECK-NEXT: add x12, sp, #32
2032 ; CHECK-NEXT: ld1 { v1.b }[1], [x8]
2033 ; CHECK-NEXT: ld1 { v2.b }[1], [x9]
2034 ; CHECK-NEXT: add x9, sp, #96
2035 ; CHECK-NEXT: add x8, sp, #24
2036 ; CHECK-NEXT: add x11, sp, #112
2037 ; CHECK-NEXT: fmov s0, w0
2038 ; CHECK-NEXT: ld1 { v3.b }[1], [x8]
2039 ; CHECK-NEXT: add x8, sp, #160
2040 ; CHECK-NEXT: ldr b4, [sp, #480]
2041 ; CHECK-NEXT: ld1 { v1.b }[2], [x9]
2042 ; CHECK-NEXT: add x9, sp, #104
2043 ; CHECK-NEXT: ld1 { v2.b }[2], [x8]
2044 ; CHECK-NEXT: add x8, sp, #168
2045 ; CHECK-NEXT: add x10, sp, #120
2046 ; CHECK-NEXT: add x13, sp, #48
2047 ; CHECK-NEXT: ld1 { v3.b }[2], [x12]
2048 ; CHECK-NEXT: add x12, sp, #40
2049 ; CHECK-NEXT: ldr b5, [sp, #608]
2050 ; CHECK-NEXT: ld1 { v1.b }[3], [x9]
2051 ; CHECK-NEXT: ld1 { v2.b }[3], [x8]
2052 ; CHECK-NEXT: mov v0.b[1], w1
2053 ; CHECK-NEXT: add x9, sp, #128
2054 ; CHECK-NEXT: add x14, sp, #184
2055 ; CHECK-NEXT: ldr b16, [sp, #544]
2056 ; CHECK-NEXT: ld1 { v3.b }[3], [x12]
2057 ; CHECK-NEXT: add x12, sp, #176
2058 ; CHECK-NEXT: ldr b17, [sp, #672]
2059 ; CHECK-NEXT: ld1 { v1.b }[4], [x11]
2060 ; CHECK-NEXT: add x11, sp, #488
2061 ; CHECK-NEXT: ld1 { v2.b }[4], [x12]
2062 ; CHECK-NEXT: ld1 { v4.b }[1], [x11]
2063 ; CHECK-NEXT: mov v0.b[2], w2
2064 ; CHECK-NEXT: add x11, sp, #192
2065 ; CHECK-NEXT: ld1 { v3.b }[4], [x13]
2066 ; CHECK-NEXT: add x13, sp, #616
2067 ; CHECK-NEXT: add x12, sp, #56
2068 ; CHECK-NEXT: ld1 { v1.b }[5], [x10]
2069 ; CHECK-NEXT: ld1 { v5.b }[1], [x13]
2070 ; CHECK-NEXT: add x13, sp, #496
2071 ; CHECK-NEXT: ld1 { v4.b }[2], [x13]
2072 ; CHECK-NEXT: ld1 { v2.b }[5], [x14]
2073 ; CHECK-NEXT: add x14, sp, #680
2074 ; CHECK-NEXT: ld1 { v17.b }[1], [x14]
2075 ; CHECK-NEXT: add x13, sp, #504
2076 ; CHECK-NEXT: ld1 { v3.b }[5], [x12]
2077 ; CHECK-NEXT: ld1 { v1.b }[6], [x9]
2078 ; CHECK-NEXT: add x9, sp, #552
2079 ; CHECK-NEXT: add x12, sp, #688
2080 ; CHECK-NEXT: ld1 { v16.b }[1], [x9]
2081 ; CHECK-NEXT: add x9, sp, #624
2082 ; CHECK-NEXT: ld1 { v4.b }[3], [x13]
2083 ; CHECK-NEXT: ld1 { v2.b }[6], [x11]
2084 ; CHECK-NEXT: add x11, sp, #560
2085 ; CHECK-NEXT: add x8, sp, #136
2086 ; CHECK-NEXT: ld1 { v17.b }[2], [x12]
2087 ; CHECK-NEXT: ld1 { v5.b }[2], [x9]
2088 ; CHECK-NEXT: ld1 { v1.b }[7], [x8]
2089 ; CHECK-NEXT: ld1 { v16.b }[2], [x11]
2090 ; CHECK-NEXT: add x8, sp, #512
2091 ; CHECK-NEXT: mov v0.b[3], w3
2092 ; CHECK-NEXT: ld1 { v4.b }[4], [x8]
2093 ; CHECK-NEXT: add x8, sp, #568
2094 ; CHECK-NEXT: add x9, sp, #696
2095 ; CHECK-NEXT: add x11, sp, #632
2096 ; CHECK-NEXT: ld1 { v17.b }[3], [x9]
2097 ; CHECK-NEXT: add x9, sp, #520
2098 ; CHECK-NEXT: ld1 { v16.b }[3], [x8]
2099 ; CHECK-NEXT: ld1 { v5.b }[3], [x11]
2100 ; CHECK-NEXT: add x8, sp, #640
2101 ; CHECK-NEXT: ld1 { v4.b }[5], [x9]
2102 ; CHECK-NEXT: add x9, sp, #576
2103 ; CHECK-NEXT: add x11, sp, #704
2104 ; CHECK-NEXT: ldr b18, [sp, #736]
2105 ; CHECK-NEXT: mov v0.b[4], w4
2106 ; CHECK-NEXT: ld1 { v17.b }[4], [x11]
2107 ; CHECK-NEXT: ld1 { v16.b }[4], [x9]
2108 ; CHECK-NEXT: ld1 { v5.b }[4], [x8]
2109 ; CHECK-NEXT: add x9, sp, #528
2110 ; CHECK-NEXT: sshll v18.8h, v18.8b, #0
2111 ; CHECK-NEXT: add x8, sp, #648
2112 ; CHECK-NEXT: add x11, sp, #584
2113 ; CHECK-NEXT: add x12, sp, #712
2114 ; CHECK-NEXT: ld1 { v4.b }[6], [x9]
2115 ; CHECK-NEXT: movi v7.2d, #0000000000000000
2116 ; CHECK-NEXT: ld1 { v16.b }[5], [x11]
2117 ; CHECK-NEXT: ld1 { v17.b }[5], [x12]
2118 ; CHECK-NEXT: ld1 { v5.b }[5], [x8]
2119 ; CHECK-NEXT: mov v0.b[5], w5
2120 ; CHECK-NEXT: add x9, sp, #536
2121 ; CHECK-NEXT: sshll v18.4s, v18.4h, #0
2122 ; CHECK-NEXT: add x8, sp, #656
2123 ; CHECK-NEXT: add x11, sp, #592
2124 ; CHECK-NEXT: add x12, sp, #720
2125 ; CHECK-NEXT: ld1 { v4.b }[7], [x9]
2126 ; CHECK-NEXT: ld1 { v16.b }[6], [x11]
2127 ; CHECK-NEXT: ld1 { v17.b }[6], [x12]
2128 ; CHECK-NEXT: ld1 { v5.b }[6], [x8]
2129 ; CHECK-NEXT: ldr b6, [sp, #208]
2130 ; CHECK-NEXT: add x10, sp, #64
2131 ; CHECK-NEXT: mov v7.s[0], v18.s[0]
2132 ; CHECK-NEXT: mov v0.b[6], w6
2133 ; CHECK-NEXT: ld1 { v3.b }[6], [x10]
2134 ; CHECK-NEXT: add x8, sp, #664
2135 ; CHECK-NEXT: add x9, sp, #600
2136 ; CHECK-NEXT: add x10, sp, #728
2137 ; CHECK-NEXT: sshll v4.8h, v4.8b, #0
2138 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0
2139 ; CHECK-NEXT: ld1 { v16.b }[7], [x9]
2140 ; CHECK-NEXT: ld1 { v17.b }[7], [x10]
2141 ; CHECK-NEXT: ld1 { v5.b }[7], [x8]
2142 ; CHECK-NEXT: movi v18.2d, #0000000000000000
2143 ; CHECK-NEXT: mov v0.b[7], w7
2144 ; CHECK-NEXT: add x9, sp, #200
2145 ; CHECK-NEXT: add x10, sp, #72
2146 ; CHECK-NEXT: saddw v7.4s, v7.4s, v4.4h
2147 ; CHECK-NEXT: sshll v6.4s, v6.4h, #0
2148 ; CHECK-NEXT: sshll v16.8h, v16.8b, #0
2149 ; CHECK-NEXT: sshll v17.8h, v17.8b, #0
2150 ; CHECK-NEXT: sshll v5.8h, v5.8b, #0
2151 ; CHECK-NEXT: ld1 { v2.b }[7], [x9]
2152 ; CHECK-NEXT: ld1 { v3.b }[7], [x10]
2153 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
2154 ; CHECK-NEXT: mov v18.s[0], v6.s[0]
2155 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
2156 ; CHECK-NEXT: saddl2 v6.4s, v17.8h, v16.8h
2157 ; CHECK-NEXT: saddl2 v4.4s, v5.8h, v4.8h
2158 ; CHECK-NEXT: saddl v16.4s, v17.4h, v16.4h
2159 ; CHECK-NEXT: saddw v5.4s, v7.4s, v5.4h
2160 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0
2161 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0
2162 ; CHECK-NEXT: saddl2 v17.4s, v0.8h, v1.8h
2163 ; CHECK-NEXT: saddw v0.4s, v18.4s, v0.4h
2164 ; CHECK-NEXT: saddl2 v7.4s, v3.8h, v2.8h
2165 ; CHECK-NEXT: add v4.4s, v4.4s, v6.4s
2166 ; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h
2167 ; CHECK-NEXT: add v5.4s, v5.4s, v16.4s
2168 ; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h
2169 ; CHECK-NEXT: add v6.4s, v17.4s, v7.4s
2170 ; CHECK-NEXT: add v1.4s, v5.4s, v4.4s
2171 ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
2172 ; CHECK-NEXT: add v1.4s, v6.4s, v1.4s
2173 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
2174 ; CHECK-NEXT: addv s0, v0.4s
2175 ; CHECK-NEXT: fmov w0, s0
2176 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; IR under test: plain widening sums of %a and %c, with no multiply.
; NOTE(review): %az/%cz are sext results despite the "z" suffix - presumably
; copied from the unsigned variant; naming only, no functional effect.
2179 %az = sext <33 x i8> %a to <33 x i32>
2180 %r1 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %az)
2181 %cz = sext <33 x i8> %c to <33 x i32>
2182 %r2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %cz)
2183 %x = add i32 %r1, %r2
; Unsigned dot product of two 48-byte vectors loaded from %a and %b, with the
; reduced result added to %sum.
; 48 is a multiple of 16, and the expected code uses the dot-product
; instruction directly: three udot operations (byte offsets #32, #0 and #16)
; accumulate into one zeroed v0.4s, which is then folded with addv and added
; to w2.
; The CHECK lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
2186 define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
2187 ; CHECK-LABEL: test_udot_v48i8:
2188 ; CHECK: // %bb.0: // %entry
2189 ; CHECK-NEXT: movi v0.2d, #0000000000000000
2190 ; CHECK-NEXT: ldr q1, [x0, #32]
2191 ; CHECK-NEXT: ldr q2, [x1, #32]
2192 ; CHECK-NEXT: udot v0.4s, v2.16b, v1.16b
2193 ; CHECK-NEXT: ldr q1, [x0]
2194 ; CHECK-NEXT: ldr q2, [x1]
2195 ; CHECK-NEXT: udot v0.4s, v2.16b, v1.16b
2196 ; CHECK-NEXT: ldr q1, [x0, #16]
2197 ; CHECK-NEXT: ldr q2, [x1, #16]
2198 ; CHECK-NEXT: udot v0.4s, v2.16b, v1.16b
2199 ; CHECK-NEXT: addv s0, v0.4s
2200 ; CHECK-NEXT: fmov w8, s0
2201 ; CHECK-NEXT: add w0, w8, w2
; IR under test: zero-extend both loaded vectors to i32, multiply
; element-wise, reduce-add, then add the scalar %sum.
2204 %0 = load <48 x i8>, ptr %a
2205 %1 = zext <48 x i8> %0 to <48 x i32>
2206 %2 = load <48 x i8>, ptr %b
2207 %3 = zext <48 x i8> %2 to <48 x i32>
2208 %4 = mul nuw nsw <48 x i32> %3, %1
2209 %5 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %4)
2210 %op.extra = add i32 %5, %sum
; "No multiply" variant: a plain zero-extending sum of the 48 bytes loaded
; from %a1.
; The expected code still uses udot: the second operand is a vector of
; all-ones bytes (movi v0.16b, #1), so each udot just sums the loaded bytes
; into the zeroed accumulator v1.4s. Three udot operations cover byte offsets
; #32, #0 and #16, followed by addv.
; The CHECK lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
2214 define i32 @test_udot_v48i8_nomla(ptr nocapture readonly %a1) {
2215 ; CHECK-LABEL: test_udot_v48i8_nomla:
2216 ; CHECK: // %bb.0: // %entry
2217 ; CHECK-NEXT: movi v0.16b, #1
2218 ; CHECK-NEXT: movi v1.2d, #0000000000000000
2219 ; CHECK-NEXT: ldr q2, [x0, #32]
2220 ; CHECK-NEXT: udot v1.4s, v2.16b, v0.16b
2221 ; CHECK-NEXT: ldr q2, [x0]
2222 ; CHECK-NEXT: udot v1.4s, v2.16b, v0.16b
2223 ; CHECK-NEXT: ldr q2, [x0, #16]
2224 ; CHECK-NEXT: udot v1.4s, v2.16b, v0.16b
2225 ; CHECK-NEXT: addv s0, v1.4s
2226 ; CHECK-NEXT: fmov w0, s0
; IR under test: load, zero-extend to i32, reduce-add (no multiply).
2229 %0 = load <48 x i8>, ptr %a1
2230 %1 = zext <48 x i8> %0 to <48 x i32>
2231 %2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %1)
; Signed dot product of two 48-byte vectors loaded from %a and %b, with the
; reduced result added to %sum. This is the sext/sdot counterpart of
; test_udot_v48i8.
; The expected code uses three sdot operations (byte offsets #32, #0 and #16)
; accumulating into one zeroed v0.4s, which is then folded with addv and
; added to w2.
; The CHECK lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
2234 define i32 @test_sdot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
2235 ; CHECK-LABEL: test_sdot_v48i8:
2236 ; CHECK: // %bb.0: // %entry
2237 ; CHECK-NEXT: movi v0.2d, #0000000000000000
2238 ; CHECK-NEXT: ldr q1, [x0, #32]
2239 ; CHECK-NEXT: ldr q2, [x1, #32]
2240 ; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b
2241 ; CHECK-NEXT: ldr q1, [x0]
2242 ; CHECK-NEXT: ldr q2, [x1]
2243 ; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b
2244 ; CHECK-NEXT: ldr q1, [x0, #16]
2245 ; CHECK-NEXT: ldr q2, [x1, #16]
2246 ; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b
2247 ; CHECK-NEXT: addv s0, v0.4s
2248 ; CHECK-NEXT: fmov w8, s0
2249 ; CHECK-NEXT: add w0, w8, w2
; IR under test: sign-extend both loaded vectors to i32, multiply
; element-wise (nsw only), reduce-add, then add the scalar %sum.
2252 %0 = load <48 x i8>, ptr %a
2253 %1 = sext <48 x i8> %0 to <48 x i32>
2254 %2 = load <48 x i8>, ptr %b
2255 %3 = sext <48 x i8> %2 to <48 x i32>
2256 %4 = mul nsw <48 x i32> %3, %1
2257 %5 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %4)
2258 %op.extra = add nsw i32 %5, %sum
2262 define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 x i8> %d) {
2263 ; CHECK-LABEL: test_sdot_v48i8_double:
2264 ; CHECK: // %bb.0: // %entry
2265 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
2266 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2267 ; CHECK-NEXT: .cfi_offset w29, -16
2268 ; CHECK-NEXT: ldr b3, [sp, #592]
2269 ; CHECK-NEXT: add x8, sp, #600
2270 ; CHECK-NEXT: ldr b6, [sp, #208]
2271 ; CHECK-NEXT: ldr b0, [sp, #336]
2272 ; CHECK-NEXT: add x9, sp, #344
2273 ; CHECK-NEXT: ldr b2, [sp, #464]
2274 ; CHECK-NEXT: ld1 { v3.b }[1], [x8]
2275 ; CHECK-NEXT: add x8, sp, #216
2276 ; CHECK-NEXT: add x10, sp, #624
2277 ; CHECK-NEXT: ld1 { v6.b }[1], [x8]
2278 ; CHECK-NEXT: add x8, sp, #608
2279 ; CHECK-NEXT: ld1 { v0.b }[1], [x9]
2280 ; CHECK-NEXT: add x9, sp, #232
2281 ; CHECK-NEXT: fmov s1, w0
2282 ; CHECK-NEXT: ldr b7, [sp, #1360]
2283 ; CHECK-NEXT: ld1 { v3.b }[2], [x8]
2284 ; CHECK-NEXT: add x8, sp, #224
2285 ; CHECK-NEXT: add x11, sp, #648
2286 ; CHECK-NEXT: ld1 { v6.b }[2], [x8]
2287 ; CHECK-NEXT: add x8, sp, #616
2288 ; CHECK-NEXT: add x12, sp, #376
2289 ; CHECK-NEXT: mov v1.b[1], w1
2290 ; CHECK-NEXT: ldr b16, [sp, #976]
2291 ; CHECK-NEXT: add x14, sp, #288
2292 ; CHECK-NEXT: ld1 { v3.b }[3], [x8]
2293 ; CHECK-NEXT: add x8, sp, #632
2294 ; CHECK-NEXT: add x15, sp, #408
2295 ; CHECK-NEXT: ld1 { v6.b }[3], [x9]
2296 ; CHECK-NEXT: add x9, sp, #472
2297 ; CHECK-NEXT: add x13, sp, #696
2298 ; CHECK-NEXT: ld1 { v2.b }[1], [x9]
2299 ; CHECK-NEXT: add x9, sp, #240
2300 ; CHECK-NEXT: add x16, sp, #448
2301 ; CHECK-NEXT: ld1 { v3.b }[4], [x10]
2302 ; CHECK-NEXT: add x10, sp, #352
2303 ; CHECK-NEXT: mov v1.b[2], w2
2304 ; CHECK-NEXT: ld1 { v6.b }[4], [x9]
2305 ; CHECK-NEXT: ld1 { v0.b }[2], [x10]
2306 ; CHECK-NEXT: add x10, sp, #1368
2307 ; CHECK-NEXT: ld1 { v7.b }[1], [x10]
2308 ; CHECK-NEXT: add x10, sp, #248
2309 ; CHECK-NEXT: add x9, sp, #640
2310 ; CHECK-NEXT: ld1 { v3.b }[5], [x8]
2311 ; CHECK-NEXT: add x8, sp, #656
2312 ; CHECK-NEXT: movi v5.2d, #0000000000000000
2313 ; CHECK-NEXT: ld1 { v6.b }[5], [x10]
2314 ; CHECK-NEXT: add x10, sp, #360
2315 ; CHECK-NEXT: mov v1.b[3], w3
2316 ; CHECK-NEXT: ld1 { v0.b }[3], [x10]
2317 ; CHECK-NEXT: add x10, sp, #256
2318 ; CHECK-NEXT: movi v4.2d, #0000000000000000
2319 ; CHECK-NEXT: ld1 { v3.b }[6], [x9]
2320 ; CHECK-NEXT: add x9, sp, #368
2321 ; CHECK-NEXT: ldr b17, [sp, #720]
2322 ; CHECK-NEXT: ld1 { v6.b }[6], [x10]
2323 ; CHECK-NEXT: add x10, sp, #984
2324 ; CHECK-NEXT: ld1 { v0.b }[4], [x9]
2325 ; CHECK-NEXT: ld1 { v16.b }[1], [x10]
2326 ; CHECK-NEXT: add x10, sp, #664
2327 ; CHECK-NEXT: ld1 { v3.b }[7], [x11]
2328 ; CHECK-NEXT: add x11, sp, #264
2329 ; CHECK-NEXT: mov v1.b[4], w4
2330 ; CHECK-NEXT: ld1 { v6.b }[7], [x11]
2331 ; CHECK-NEXT: add x9, sp, #672
2332 ; CHECK-NEXT: add x11, sp, #680
2333 ; CHECK-NEXT: ld1 { v0.b }[5], [x12]
2334 ; CHECK-NEXT: add x12, sp, #480
2335 ; CHECK-NEXT: ld1 { v2.b }[2], [x12]
2336 ; CHECK-NEXT: add x12, sp, #272
2337 ; CHECK-NEXT: ld1 { v3.b }[8], [x8]
2338 ; CHECK-NEXT: ld1 { v6.b }[8], [x12]
2339 ; CHECK-NEXT: add x12, sp, #384
2340 ; CHECK-NEXT: mov v1.b[5], w5
2341 ; CHECK-NEXT: ld1 { v0.b }[6], [x12]
2342 ; CHECK-NEXT: add x12, sp, #280
2343 ; CHECK-NEXT: add x8, sp, #688
2344 ; CHECK-NEXT: ld1 { v3.b }[9], [x10]
2345 ; CHECK-NEXT: add x10, sp, #1376
2346 ; CHECK-NEXT: ld1 { v7.b }[2], [x10]
2347 ; CHECK-NEXT: add x10, sp, #392
2348 ; CHECK-NEXT: ld1 { v6.b }[9], [x12]
2349 ; CHECK-NEXT: ld1 { v0.b }[7], [x10]
2350 ; CHECK-NEXT: mov v1.b[6], w6
2351 ; CHECK-NEXT: add x12, sp, #704
2352 ; CHECK-NEXT: ld1 { v3.b }[10], [x9]
2353 ; CHECK-NEXT: add x9, sp, #400
2354 ; CHECK-NEXT: add x10, sp, #712
2355 ; CHECK-NEXT: ld1 { v6.b }[10], [x14]
2356 ; CHECK-NEXT: add x14, sp, #992
2357 ; CHECK-NEXT: ld1 { v0.b }[8], [x9]
2358 ; CHECK-NEXT: ld1 { v16.b }[2], [x14]
2359 ; CHECK-NEXT: add x14, sp, #296
2360 ; CHECK-NEXT: ld1 { v3.b }[11], [x11]
2361 ; CHECK-NEXT: add x9, sp, #304
2362 ; CHECK-NEXT: add x11, sp, #312
2363 ; CHECK-NEXT: ld1 { v6.b }[11], [x14]
2364 ; CHECK-NEXT: mov v1.b[7], w7
2365 ; CHECK-NEXT: add x14, sp, #320
2366 ; CHECK-NEXT: ld1 { v0.b }[9], [x15]
2367 ; CHECK-NEXT: add x15, sp, #328
2368 ; CHECK-NEXT: ld1 { v3.b }[12], [x8]
2369 ; CHECK-NEXT: add x8, sp, #416
2370 ; CHECK-NEXT: ld1 { v6.b }[12], [x9]
2371 ; CHECK-NEXT: add x9, sp, #1384
2372 ; CHECK-NEXT: ld1 { v0.b }[10], [x8]
2373 ; CHECK-NEXT: ld1 { v7.b }[3], [x9]
2374 ; CHECK-NEXT: add x9, sp, #424
2375 ; CHECK-NEXT: ld1 { v3.b }[13], [x13]
2376 ; CHECK-NEXT: add x8, sp, #432
2377 ; CHECK-NEXT: add x13, sp, #440
2378 ; CHECK-NEXT: ld1 { v6.b }[13], [x11]
2379 ; CHECK-NEXT: add x11, sp, #16
2380 ; CHECK-NEXT: ld1 { v0.b }[11], [x9]
2381 ; CHECK-NEXT: add x9, sp, #1000
2382 ; CHECK-NEXT: ld1 { v1.b }[8], [x11]
2383 ; CHECK-NEXT: ld1 { v16.b }[3], [x9]
2384 ; CHECK-NEXT: ld1 { v3.b }[14], [x12]
2385 ; CHECK-NEXT: add x12, sp, #488
2386 ; CHECK-NEXT: ld1 { v6.b }[14], [x14]
2387 ; CHECK-NEXT: add x14, sp, #1392
2388 ; CHECK-NEXT: ld1 { v2.b }[3], [x12]
2389 ; CHECK-NEXT: ld1 { v7.b }[4], [x14]
2390 ; CHECK-NEXT: add x11, sp, #1008
2391 ; CHECK-NEXT: ld1 { v0.b }[12], [x8]
2392 ; CHECK-NEXT: ld1 { v16.b }[4], [x11]
2393 ; CHECK-NEXT: add x8, sp, #1400
2394 ; CHECK-NEXT: ld1 { v3.b }[15], [x10]
2395 ; CHECK-NEXT: add x10, sp, #496
2396 ; CHECK-NEXT: add x9, sp, #24
2397 ; CHECK-NEXT: ld1 { v6.b }[15], [x15]
2398 ; CHECK-NEXT: ld1 { v7.b }[5], [x8]
2399 ; CHECK-NEXT: ld1 { v2.b }[4], [x10]
2400 ; CHECK-NEXT: add x10, sp, #1016
2401 ; CHECK-NEXT: ld1 { v16.b }[5], [x10]
2402 ; CHECK-NEXT: ld1 { v0.b }[13], [x13]
2403 ; CHECK-NEXT: add x8, sp, #1408
2404 ; CHECK-NEXT: ld1 { v1.b }[9], [x9]
2405 ; CHECK-NEXT: add x9, sp, #504
2406 ; CHECK-NEXT: add x10, sp, #512
2407 ; CHECK-NEXT: ld1 { v7.b }[6], [x8]
2408 ; CHECK-NEXT: ld1 { v2.b }[5], [x9]
2409 ; CHECK-NEXT: add x9, sp, #1024
2410 ; CHECK-NEXT: add x8, sp, #32
2411 ; CHECK-NEXT: ld1 { v16.b }[6], [x9]
2412 ; CHECK-NEXT: ld1 { v0.b }[14], [x16]
2413 ; CHECK-NEXT: ld1 { v1.b }[10], [x8]
2414 ; CHECK-NEXT: add x8, sp, #1416
2415 ; CHECK-NEXT: add x9, sp, #456
2416 ; CHECK-NEXT: ld1 { v7.b }[7], [x8]
2417 ; CHECK-NEXT: ld1 { v2.b }[6], [x10]
2418 ; CHECK-NEXT: add x10, sp, #1032
2419 ; CHECK-NEXT: add x8, sp, #40
2420 ; CHECK-NEXT: ld1 { v16.b }[7], [x10]
2421 ; CHECK-NEXT: ld1 { v0.b }[15], [x9]
2422 ; CHECK-NEXT: ld1 { v1.b }[11], [x8]
2423 ; CHECK-NEXT: add x8, sp, #1424
2424 ; CHECK-NEXT: add x9, sp, #520
2425 ; CHECK-NEXT: ld1 { v7.b }[8], [x8]
2426 ; CHECK-NEXT: ld1 { v2.b }[7], [x9]
2427 ; CHECK-NEXT: add x9, sp, #1040
2428 ; CHECK-NEXT: add x8, sp, #48
2429 ; CHECK-NEXT: ld1 { v16.b }[8], [x9]
2430 ; CHECK-NEXT: add x10, sp, #528
2431 ; CHECK-NEXT: ld1 { v1.b }[12], [x8]
2432 ; CHECK-NEXT: add x8, sp, #1432
2433 ; CHECK-NEXT: sdot v5.4s, v6.16b, v3.16b
2434 ; CHECK-NEXT: ld1 { v7.b }[9], [x8]
2435 ; CHECK-NEXT: ld1 { v2.b }[8], [x10]
2436 ; CHECK-NEXT: add x8, sp, #1048
2437 ; CHECK-NEXT: ldr b3, [sp, #80]
2438 ; CHECK-NEXT: ld1 { v16.b }[9], [x8]
2439 ; CHECK-NEXT: add x10, sp, #88
2440 ; CHECK-NEXT: add x8, sp, #536
2441 ; CHECK-NEXT: add x11, sp, #1440
2442 ; CHECK-NEXT: add x9, sp, #56
2443 ; CHECK-NEXT: ld1 { v3.b }[1], [x10]
2444 ; CHECK-NEXT: ld1 { v2.b }[9], [x8]
2445 ; CHECK-NEXT: add x8, sp, #1056
2446 ; CHECK-NEXT: ld1 { v7.b }[10], [x11]
2447 ; CHECK-NEXT: ld1 { v16.b }[10], [x8]
2448 ; CHECK-NEXT: ld1 { v1.b }[13], [x9]
2449 ; CHECK-NEXT: add x9, sp, #96
2450 ; CHECK-NEXT: add x8, sp, #544
2451 ; CHECK-NEXT: add x10, sp, #1448
2452 ; CHECK-NEXT: ld1 { v3.b }[2], [x9]
2453 ; CHECK-NEXT: ld1 { v2.b }[10], [x8]
2454 ; CHECK-NEXT: add x8, sp, #1064
2455 ; CHECK-NEXT: ld1 { v7.b }[11], [x10]
2456 ; CHECK-NEXT: ld1 { v16.b }[11], [x8]
2457 ; CHECK-NEXT: add x10, sp, #104
2458 ; CHECK-NEXT: add x8, sp, #552
2459 ; CHECK-NEXT: add x11, sp, #1456
2460 ; CHECK-NEXT: add x9, sp, #64
2461 ; CHECK-NEXT: ld1 { v3.b }[3], [x10]
2462 ; CHECK-NEXT: ld1 { v2.b }[11], [x8]
2463 ; CHECK-NEXT: add x8, sp, #1072
2464 ; CHECK-NEXT: ld1 { v7.b }[12], [x11]
2465 ; CHECK-NEXT: ld1 { v16.b }[12], [x8]
2466 ; CHECK-NEXT: ld1 { v1.b }[14], [x9]
2467 ; CHECK-NEXT: add x9, sp, #112
2468 ; CHECK-NEXT: add x8, sp, #560
2469 ; CHECK-NEXT: add x10, sp, #1464
2470 ; CHECK-NEXT: ld1 { v3.b }[4], [x9]
2471 ; CHECK-NEXT: ld1 { v2.b }[12], [x8]
2472 ; CHECK-NEXT: add x8, sp, #1080
2473 ; CHECK-NEXT: ld1 { v7.b }[13], [x10]
2474 ; CHECK-NEXT: ld1 { v16.b }[13], [x8]
2475 ; CHECK-NEXT: add x10, sp, #120
2476 ; CHECK-NEXT: add x8, sp, #568
2477 ; CHECK-NEXT: add x11, sp, #1472
2478 ; CHECK-NEXT: add x9, sp, #72
2479 ; CHECK-NEXT: ld1 { v3.b }[5], [x10]
2480 ; CHECK-NEXT: ld1 { v2.b }[13], [x8]
2481 ; CHECK-NEXT: add x8, sp, #1088
2482 ; CHECK-NEXT: ld1 { v7.b }[14], [x11]
2483 ; CHECK-NEXT: ld1 { v16.b }[14], [x8]
2484 ; CHECK-NEXT: ld1 { v1.b }[15], [x9]
2485 ; CHECK-NEXT: add x9, sp, #128
2486 ; CHECK-NEXT: ldr b6, [sp, #1104]
2487 ; CHECK-NEXT: add x10, sp, #1480
2488 ; CHECK-NEXT: ld1 { v3.b }[6], [x9]
2489 ; CHECK-NEXT: add x8, sp, #1096
2490 ; CHECK-NEXT: add x9, sp, #1112
2491 ; CHECK-NEXT: ld1 { v7.b }[15], [x10]
2492 ; CHECK-NEXT: ld1 { v16.b }[15], [x8]
2493 ; CHECK-NEXT: ld1 { v6.b }[1], [x9]
2494 ; CHECK-NEXT: add x8, sp, #728
2495 ; CHECK-NEXT: add x9, sp, #576
2496 ; CHECK-NEXT: add x10, sp, #136
2497 ; CHECK-NEXT: ld1 { v17.b }[1], [x8]
2498 ; CHECK-NEXT: add x8, sp, #1120
2499 ; CHECK-NEXT: ld1 { v2.b }[14], [x9]
2500 ; CHECK-NEXT: sdot v4.4s, v16.16b, v7.16b
2501 ; CHECK-NEXT: ld1 { v6.b }[2], [x8]
2502 ; CHECK-NEXT: add x8, sp, #736
2503 ; CHECK-NEXT: ldr b7, [sp, #1232]
2504 ; CHECK-NEXT: ldr b16, [sp, #848]
2505 ; CHECK-NEXT: ld1 { v3.b }[7], [x10]
2506 ; CHECK-NEXT: ld1 { v17.b }[2], [x8]
2507 ; CHECK-NEXT: add x9, sp, #1240
2508 ; CHECK-NEXT: add x10, sp, #856
2509 ; CHECK-NEXT: ld1 { v7.b }[1], [x9]
2510 ; CHECK-NEXT: ld1 { v16.b }[1], [x10]
2511 ; CHECK-NEXT: add x8, sp, #1128
2512 ; CHECK-NEXT: add x11, sp, #744
2513 ; CHECK-NEXT: ld1 { v6.b }[3], [x8]
2514 ; CHECK-NEXT: add x10, sp, #1248
2515 ; CHECK-NEXT: ld1 { v17.b }[3], [x11]
2516 ; CHECK-NEXT: add x11, sp, #864
2517 ; CHECK-NEXT: add x9, sp, #144
2518 ; CHECK-NEXT: ld1 { v7.b }[2], [x10]
2519 ; CHECK-NEXT: ld1 { v16.b }[2], [x11]
2520 ; CHECK-NEXT: add x8, sp, #1136
2521 ; CHECK-NEXT: add x12, sp, #752
2522 ; CHECK-NEXT: ld1 { v3.b }[8], [x9]
2523 ; CHECK-NEXT: ld1 { v6.b }[4], [x8]
2524 ; CHECK-NEXT: ld1 { v17.b }[4], [x12]
2525 ; CHECK-NEXT: add x9, sp, #1256
2526 ; CHECK-NEXT: add x10, sp, #872
2527 ; CHECK-NEXT: ld1 { v7.b }[3], [x9]
2528 ; CHECK-NEXT: ld1 { v16.b }[3], [x10]
2529 ; CHECK-NEXT: add x8, sp, #1144
2530 ; CHECK-NEXT: add x11, sp, #760
2531 ; CHECK-NEXT: ld1 { v6.b }[5], [x8]
2532 ; CHECK-NEXT: add x10, sp, #1264
2533 ; CHECK-NEXT: ld1 { v17.b }[5], [x11]
2534 ; CHECK-NEXT: add x11, sp, #880
2535 ; CHECK-NEXT: add x9, sp, #152
2536 ; CHECK-NEXT: ld1 { v7.b }[4], [x10]
2537 ; CHECK-NEXT: ld1 { v16.b }[4], [x11]
2538 ; CHECK-NEXT: add x8, sp, #1152
2539 ; CHECK-NEXT: add x12, sp, #768
2540 ; CHECK-NEXT: ld1 { v3.b }[9], [x9]
2541 ; CHECK-NEXT: ld1 { v6.b }[6], [x8]
2542 ; CHECK-NEXT: ld1 { v17.b }[6], [x12]
2543 ; CHECK-NEXT: add x9, sp, #1272
2544 ; CHECK-NEXT: add x10, sp, #888
2545 ; CHECK-NEXT: ld1 { v7.b }[5], [x9]
2546 ; CHECK-NEXT: ld1 { v16.b }[5], [x10]
2547 ; CHECK-NEXT: add x8, sp, #1160
2548 ; CHECK-NEXT: add x11, sp, #776
2549 ; CHECK-NEXT: ld1 { v6.b }[7], [x8]
2550 ; CHECK-NEXT: add x10, sp, #1280
2551 ; CHECK-NEXT: ld1 { v17.b }[7], [x11]
2552 ; CHECK-NEXT: add x11, sp, #896
2553 ; CHECK-NEXT: add x9, sp, #160
2554 ; CHECK-NEXT: ld1 { v7.b }[6], [x10]
2555 ; CHECK-NEXT: ld1 { v16.b }[6], [x11]
2556 ; CHECK-NEXT: add x8, sp, #1168
2557 ; CHECK-NEXT: add x12, sp, #784
2558 ; CHECK-NEXT: ld1 { v3.b }[10], [x9]
2559 ; CHECK-NEXT: ld1 { v6.b }[8], [x8]
2560 ; CHECK-NEXT: ld1 { v17.b }[8], [x12]
2561 ; CHECK-NEXT: add x9, sp, #1288
2562 ; CHECK-NEXT: add x10, sp, #904
2563 ; CHECK-NEXT: ld1 { v7.b }[7], [x9]
2564 ; CHECK-NEXT: ld1 { v16.b }[7], [x10]
2565 ; CHECK-NEXT: add x8, sp, #1176
2566 ; CHECK-NEXT: add x11, sp, #792
2567 ; CHECK-NEXT: ld1 { v6.b }[9], [x8]
2568 ; CHECK-NEXT: add x10, sp, #1296
2569 ; CHECK-NEXT: ld1 { v17.b }[9], [x11]
2570 ; CHECK-NEXT: add x11, sp, #912
2571 ; CHECK-NEXT: add x9, sp, #168
2572 ; CHECK-NEXT: ld1 { v7.b }[8], [x10]
2573 ; CHECK-NEXT: ld1 { v16.b }[8], [x11]
2574 ; CHECK-NEXT: add x8, sp, #1184
2575 ; CHECK-NEXT: add x12, sp, #800
2576 ; CHECK-NEXT: ld1 { v3.b }[11], [x9]
2577 ; CHECK-NEXT: ld1 { v6.b }[10], [x8]
2578 ; CHECK-NEXT: ld1 { v17.b }[10], [x12]
2579 ; CHECK-NEXT: add x9, sp, #1304
2580 ; CHECK-NEXT: add x10, sp, #920
2581 ; CHECK-NEXT: ld1 { v7.b }[9], [x9]
2582 ; CHECK-NEXT: ld1 { v16.b }[9], [x10]
2583 ; CHECK-NEXT: add x8, sp, #1192
2584 ; CHECK-NEXT: add x11, sp, #808
2585 ; CHECK-NEXT: ld1 { v6.b }[11], [x8]
2586 ; CHECK-NEXT: add x10, sp, #1312
2587 ; CHECK-NEXT: ld1 { v17.b }[11], [x11]
2588 ; CHECK-NEXT: add x11, sp, #928
2589 ; CHECK-NEXT: add x9, sp, #176
2590 ; CHECK-NEXT: ld1 { v7.b }[10], [x10]
2591 ; CHECK-NEXT: ld1 { v16.b }[10], [x11]
2592 ; CHECK-NEXT: add x8, sp, #1200
2593 ; CHECK-NEXT: add x12, sp, #816
2594 ; CHECK-NEXT: ld1 { v3.b }[12], [x9]
2595 ; CHECK-NEXT: ld1 { v6.b }[12], [x8]
2596 ; CHECK-NEXT: ld1 { v17.b }[12], [x12]
2597 ; CHECK-NEXT: add x9, sp, #1320
2598 ; CHECK-NEXT: add x10, sp, #936
2599 ; CHECK-NEXT: ld1 { v7.b }[11], [x9]
2600 ; CHECK-NEXT: ld1 { v16.b }[11], [x10]
2601 ; CHECK-NEXT: add x8, sp, #1208
2602 ; CHECK-NEXT: add x11, sp, #824
2603 ; CHECK-NEXT: ld1 { v6.b }[13], [x8]
2604 ; CHECK-NEXT: add x10, sp, #1328
2605 ; CHECK-NEXT: ld1 { v17.b }[13], [x11]
2606 ; CHECK-NEXT: add x11, sp, #944
2607 ; CHECK-NEXT: add x9, sp, #184
2608 ; CHECK-NEXT: ld1 { v7.b }[12], [x10]
2609 ; CHECK-NEXT: ld1 { v16.b }[12], [x11]
2610 ; CHECK-NEXT: add x8, sp, #1216
2611 ; CHECK-NEXT: add x12, sp, #832
2612 ; CHECK-NEXT: ld1 { v3.b }[13], [x9]
2613 ; CHECK-NEXT: ld1 { v6.b }[14], [x8]
2614 ; CHECK-NEXT: ld1 { v17.b }[14], [x12]
2615 ; CHECK-NEXT: add x9, sp, #1336
2616 ; CHECK-NEXT: add x10, sp, #952
2617 ; CHECK-NEXT: ld1 { v7.b }[13], [x9]
2618 ; CHECK-NEXT: ld1 { v16.b }[13], [x10]
2619 ; CHECK-NEXT: add x8, sp, #1224
2620 ; CHECK-NEXT: add x11, sp, #840
2621 ; CHECK-NEXT: ld1 { v6.b }[15], [x8]
2622 ; CHECK-NEXT: add x8, sp, #192
2623 ; CHECK-NEXT: ld1 { v17.b }[15], [x11]
2624 ; CHECK-NEXT: add x10, sp, #1344
2625 ; CHECK-NEXT: add x11, sp, #960
2626 ; CHECK-NEXT: ld1 { v3.b }[14], [x8]
2627 ; CHECK-NEXT: ld1 { v7.b }[14], [x10]
2628 ; CHECK-NEXT: ld1 { v16.b }[14], [x11]
2629 ; CHECK-NEXT: add x9, sp, #584
2630 ; CHECK-NEXT: sdot v5.4s, v1.16b, v0.16b
2631 ; CHECK-NEXT: add x8, sp, #200
2632 ; CHECK-NEXT: sdot v4.4s, v17.16b, v6.16b
2633 ; CHECK-NEXT: ld1 { v2.b }[15], [x9]
2634 ; CHECK-NEXT: add x9, sp, #1352
2635 ; CHECK-NEXT: add x10, sp, #968
2636 ; CHECK-NEXT: ld1 { v3.b }[15], [x8]
2637 ; CHECK-NEXT: ld1 { v7.b }[15], [x9]
2638 ; CHECK-NEXT: ld1 { v16.b }[15], [x10]
2639 ; CHECK-NEXT: sdot v5.4s, v3.16b, v2.16b
2640 ; CHECK-NEXT: sdot v4.4s, v16.16b, v7.16b
2641 ; CHECK-NEXT: add v0.4s, v5.4s, v4.4s
2642 ; CHECK-NEXT: addv s0, v0.4s
2643 ; CHECK-NEXT: fmov w0, s0
2644 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
2647 %az = sext <48 x i8> %a to <48 x i32>
2648 %bz = sext <48 x i8> %b to <48 x i32>
2649 %m1 = mul nuw nsw <48 x i32> %az, %bz
2650 %r1 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %m1)
2651 %cz = sext <48 x i8> %c to <48 x i32>
2652 %dz = sext <48 x i8> %d to <48 x i32>
2653 %m2 = mul nuw nsw <48 x i32> %cz, %dz
2654 %r2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %m2)
2655 %x = add i32 %r1, %r2
; test_sdot_v48i8_double_nomla: two independent signed add-reductions with no
; multiply. %a and %c are each sign-extended from <48 x i8> to <48 x i32>,
; reduced with llvm.vector.reduce.add.v48i32, and the two scalars are added.
; %b and %d are not referenced by the IR visible here.
; The expected assembly below builds the 16-byte operands lane by lane
; (mov from w0-w7, ld1 from sp-relative addresses) and issues six sdot
; instructions against an all-ones byte splat (movi v1.16b, #1), accumulating
; into v3 and v2 before the final add/addv.
2659 define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 x i8> %d) {
2660 ; CHECK-LABEL: test_sdot_v48i8_double_nomla:
2661 ; CHECK: // %bb.0: // %entry
2662 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
2663 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2664 ; CHECK-NEXT: .cfi_offset w29, -16
2665 ; CHECK-NEXT: ldr b5, [sp, #208]
2666 ; CHECK-NEXT: add x8, sp, #216
2667 ; CHECK-NEXT: fmov s0, w0
2668 ; CHECK-NEXT: ldr b4, [sp, #976]
2669 ; CHECK-NEXT: add x9, sp, #984
2670 ; CHECK-NEXT: add x12, sp, #328
2671 ; CHECK-NEXT: ld1 { v5.b }[1], [x8]
2672 ; CHECK-NEXT: add x8, sp, #224
2673 ; CHECK-NEXT: movi v1.16b, #1
2674 ; CHECK-NEXT: mov v0.b[1], w1
2675 ; CHECK-NEXT: ld1 { v4.b }[1], [x9]
2676 ; CHECK-NEXT: movi v3.2d, #0000000000000000
2677 ; CHECK-NEXT: add x11, sp, #992
2678 ; CHECK-NEXT: ldr b6, [sp, #720]
2679 ; CHECK-NEXT: ldr b7, [sp, #80]
2680 ; CHECK-NEXT: ld1 { v5.b }[2], [x8]
2681 ; CHECK-NEXT: add x8, sp, #232
2682 ; CHECK-NEXT: add x13, sp, #88
2683 ; CHECK-NEXT: ld1 { v4.b }[2], [x11]
2684 ; CHECK-NEXT: ld1 { v7.b }[1], [x13]
2685 ; CHECK-NEXT: add x13, sp, #856
2686 ; CHECK-NEXT: mov v0.b[2], w2
2687 ; CHECK-NEXT: add x14, sp, #1008
2688 ; CHECK-NEXT: add x15, sp, #872
2689 ; CHECK-NEXT: ld1 { v5.b }[3], [x8]
2690 ; CHECK-NEXT: add x8, sp, #240
2691 ; CHECK-NEXT: add x16, sp, #888
2692 ; CHECK-NEXT: add x10, sp, #16
2693 ; CHECK-NEXT: add x9, sp, #24
2694 ; CHECK-NEXT: add x11, sp, #40
2695 ; CHECK-NEXT: movi v2.2d, #0000000000000000
2696 ; CHECK-NEXT: ld1 { v5.b }[4], [x8]
2697 ; CHECK-NEXT: add x8, sp, #248
2698 ; CHECK-NEXT: mov v0.b[3], w3
2699 ; CHECK-NEXT: ld1 { v5.b }[5], [x8]
2700 ; CHECK-NEXT: add x8, sp, #256
2701 ; CHECK-NEXT: mov v0.b[4], w4
2702 ; CHECK-NEXT: ld1 { v5.b }[6], [x8]
2703 ; CHECK-NEXT: add x8, sp, #264
2704 ; CHECK-NEXT: mov v0.b[5], w5
2705 ; CHECK-NEXT: ld1 { v5.b }[7], [x8]
2706 ; CHECK-NEXT: add x8, sp, #272
2707 ; CHECK-NEXT: ld1 { v5.b }[8], [x8]
2708 ; CHECK-NEXT: add x8, sp, #280
2709 ; CHECK-NEXT: mov v0.b[6], w6
2710 ; CHECK-NEXT: ld1 { v5.b }[9], [x8]
2711 ; CHECK-NEXT: add x8, sp, #288
2712 ; CHECK-NEXT: mov v0.b[7], w7
2713 ; CHECK-NEXT: ld1 { v5.b }[10], [x8]
2714 ; CHECK-NEXT: add x8, sp, #296
2715 ; CHECK-NEXT: ld1 { v0.b }[8], [x10]
2716 ; CHECK-NEXT: add x10, sp, #128
2717 ; CHECK-NEXT: ld1 { v5.b }[11], [x8]
2718 ; CHECK-NEXT: add x8, sp, #304
2719 ; CHECK-NEXT: ld1 { v0.b }[9], [x9]
2720 ; CHECK-NEXT: add x9, sp, #136
2721 ; CHECK-NEXT: ld1 { v5.b }[12], [x8]
2722 ; CHECK-NEXT: add x8, sp, #312
2723 ; CHECK-NEXT: ld1 { v5.b }[13], [x8]
2724 ; CHECK-NEXT: add x8, sp, #320
2725 ; CHECK-NEXT: ld1 { v5.b }[14], [x8]
2726 ; CHECK-NEXT: add x8, sp, #32
2727 ; CHECK-NEXT: ld1 { v0.b }[10], [x8]
2728 ; CHECK-NEXT: add x8, sp, #144
2729 ; CHECK-NEXT: ld1 { v5.b }[15], [x12]
2730 ; CHECK-NEXT: add x12, sp, #728
2731 ; CHECK-NEXT: ld1 { v6.b }[1], [x12]
2732 ; CHECK-NEXT: add x12, sp, #1000
2733 ; CHECK-NEXT: ld1 { v0.b }[11], [x11]
2734 ; CHECK-NEXT: ld1 { v4.b }[3], [x12]
2735 ; CHECK-NEXT: add x12, sp, #736
2736 ; CHECK-NEXT: add x11, sp, #920
2737 ; CHECK-NEXT: sdot v3.4s, v5.16b, v1.16b
2738 ; CHECK-NEXT: ldr b5, [sp, #848]
2739 ; CHECK-NEXT: ld1 { v6.b }[2], [x12]
2740 ; CHECK-NEXT: add x12, sp, #48
2741 ; CHECK-NEXT: ld1 { v5.b }[1], [x13]
2742 ; CHECK-NEXT: add x13, sp, #744
2743 ; CHECK-NEXT: ld1 { v4.b }[4], [x14]
2744 ; CHECK-NEXT: add x14, sp, #96
2745 ; CHECK-NEXT: ld1 { v0.b }[12], [x12]
2746 ; CHECK-NEXT: ld1 { v6.b }[3], [x13]
2747 ; CHECK-NEXT: add x13, sp, #864
2748 ; CHECK-NEXT: ld1 { v7.b }[2], [x14]
2749 ; CHECK-NEXT: add x14, sp, #1016
2750 ; CHECK-NEXT: ld1 { v5.b }[2], [x13]
2751 ; CHECK-NEXT: add x13, sp, #752
2752 ; CHECK-NEXT: ld1 { v4.b }[5], [x14]
2753 ; CHECK-NEXT: add x14, sp, #104
2754 ; CHECK-NEXT: ld1 { v6.b }[4], [x13]
2755 ; CHECK-NEXT: add x13, sp, #1024
2756 ; CHECK-NEXT: ld1 { v7.b }[3], [x14]
2757 ; CHECK-NEXT: ld1 { v5.b }[3], [x15]
2758 ; CHECK-NEXT: add x15, sp, #760
2759 ; CHECK-NEXT: add x14, sp, #112
2760 ; CHECK-NEXT: ld1 { v4.b }[6], [x13]
2761 ; CHECK-NEXT: add x13, sp, #880
2762 ; CHECK-NEXT: ld1 { v6.b }[5], [x15]
2763 ; CHECK-NEXT: add x15, sp, #1032
2764 ; CHECK-NEXT: ld1 { v7.b }[4], [x14]
2765 ; CHECK-NEXT: ld1 { v5.b }[4], [x13]
2766 ; CHECK-NEXT: add x14, sp, #768
2767 ; CHECK-NEXT: add x13, sp, #120
2768 ; CHECK-NEXT: ld1 { v4.b }[7], [x15]
2769 ; CHECK-NEXT: add x15, sp, #1040
2770 ; CHECK-NEXT: ld1 { v6.b }[6], [x14]
2771 ; CHECK-NEXT: ld1 { v7.b }[5], [x13]
2772 ; CHECK-NEXT: add x13, sp, #776
2773 ; CHECK-NEXT: ld1 { v5.b }[5], [x16]
2774 ; CHECK-NEXT: add x14, sp, #1048
2775 ; CHECK-NEXT: ld1 { v4.b }[8], [x15]
2776 ; CHECK-NEXT: add x15, sp, #896
2777 ; CHECK-NEXT: ld1 { v6.b }[7], [x13]
2778 ; CHECK-NEXT: ld1 { v7.b }[6], [x10]
2779 ; CHECK-NEXT: add x10, sp, #784
2780 ; CHECK-NEXT: ld1 { v5.b }[6], [x15]
2781 ; CHECK-NEXT: add x13, sp, #1056
2782 ; CHECK-NEXT: ld1 { v4.b }[9], [x14]
2783 ; CHECK-NEXT: add x14, sp, #904
2784 ; CHECK-NEXT: ld1 { v6.b }[8], [x10]
2785 ; CHECK-NEXT: ld1 { v7.b }[7], [x9]
2786 ; CHECK-NEXT: add x9, sp, #792
2787 ; CHECK-NEXT: ld1 { v5.b }[7], [x14]
2788 ; CHECK-NEXT: add x10, sp, #1064
2789 ; CHECK-NEXT: ld1 { v4.b }[10], [x13]
2790 ; CHECK-NEXT: add x13, sp, #912
2791 ; CHECK-NEXT: ld1 { v6.b }[9], [x9]
2792 ; CHECK-NEXT: ld1 { v7.b }[8], [x8]
2793 ; CHECK-NEXT: add x9, sp, #800
2794 ; CHECK-NEXT: ld1 { v5.b }[8], [x13]
2795 ; CHECK-NEXT: add x8, sp, #152
2796 ; CHECK-NEXT: ld1 { v4.b }[11], [x10]
2797 ; CHECK-NEXT: add x10, sp, #1072
2798 ; CHECK-NEXT: ld1 { v6.b }[10], [x9]
2799 ; CHECK-NEXT: ld1 { v7.b }[9], [x8]
2800 ; CHECK-NEXT: add x9, sp, #808
2801 ; CHECK-NEXT: ld1 { v5.b }[9], [x11]
2802 ; CHECK-NEXT: add x8, sp, #56
2803 ; CHECK-NEXT: ld1 { v4.b }[12], [x10]
2804 ; CHECK-NEXT: add x10, sp, #160
2805 ; CHECK-NEXT: ld1 { v0.b }[13], [x8]
2806 ; CHECK-NEXT: ld1 { v6.b }[11], [x9]
2807 ; CHECK-NEXT: add x9, sp, #928
2808 ; CHECK-NEXT: ld1 { v7.b }[10], [x10]
2809 ; CHECK-NEXT: add x10, sp, #1080
2810 ; CHECK-NEXT: ld1 { v5.b }[10], [x9]
2811 ; CHECK-NEXT: add x8, sp, #816
2812 ; CHECK-NEXT: ld1 { v4.b }[13], [x10]
2813 ; CHECK-NEXT: add x9, sp, #168
2814 ; CHECK-NEXT: add x10, sp, #176
2815 ; CHECK-NEXT: ld1 { v6.b }[12], [x8]
2816 ; CHECK-NEXT: add x8, sp, #936
2817 ; CHECK-NEXT: ld1 { v7.b }[11], [x9]
2818 ; CHECK-NEXT: add x9, sp, #1088
2819 ; CHECK-NEXT: ld1 { v5.b }[11], [x8]
2820 ; CHECK-NEXT: add x8, sp, #64
2821 ; CHECK-NEXT: ld1 { v4.b }[14], [x9]
2822 ; CHECK-NEXT: add x9, sp, #824
2823 ; CHECK-NEXT: ld1 { v0.b }[14], [x8]
2824 ; CHECK-NEXT: ld1 { v6.b }[13], [x9]
2825 ; CHECK-NEXT: add x9, sp, #944
2826 ; CHECK-NEXT: ld1 { v7.b }[12], [x10]
2827 ; CHECK-NEXT: add x10, sp, #1096
2828 ; CHECK-NEXT: ld1 { v5.b }[12], [x9]
2829 ; CHECK-NEXT: add x8, sp, #832
2830 ; CHECK-NEXT: ld1 { v4.b }[15], [x10]
2831 ; CHECK-NEXT: add x9, sp, #184
2832 ; CHECK-NEXT: add x10, sp, #72
2833 ; CHECK-NEXT: ld1 { v6.b }[14], [x8]
2834 ; CHECK-NEXT: add x8, sp, #952
2835 ; CHECK-NEXT: ld1 { v7.b }[13], [x9]
2836 ; CHECK-NEXT: ld1 { v5.b }[13], [x8]
2837 ; CHECK-NEXT: add x8, sp, #840
2838 ; CHECK-NEXT: ld1 { v0.b }[15], [x10]
2839 ; CHECK-NEXT: sdot v2.4s, v4.16b, v1.16b
2840 ; CHECK-NEXT: add x9, sp, #192
2841 ; CHECK-NEXT: ld1 { v6.b }[15], [x8]
2842 ; CHECK-NEXT: add x8, sp, #960
2843 ; CHECK-NEXT: ld1 { v7.b }[14], [x9]
2844 ; CHECK-NEXT: ld1 { v5.b }[14], [x8]
2845 ; CHECK-NEXT: sdot v3.4s, v0.16b, v1.16b
2846 ; CHECK-NEXT: add x8, sp, #200
2847 ; CHECK-NEXT: add x9, sp, #968
2848 ; CHECK-NEXT: sdot v2.4s, v6.16b, v1.16b
2849 ; CHECK-NEXT: ld1 { v7.b }[15], [x8]
2850 ; CHECK-NEXT: ld1 { v5.b }[15], [x9]
2851 ; CHECK-NEXT: sdot v3.4s, v7.16b, v1.16b
2852 ; CHECK-NEXT: sdot v2.4s, v5.16b, v1.16b
2853 ; CHECK-NEXT: add v0.4s, v3.4s, v2.4s
2854 ; CHECK-NEXT: addv s0, v0.4s
2855 ; CHECK-NEXT: fmov w0, s0
2856 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; First reduction: widen %a with sign extension and sum all 48 lanes.
2859 %az = sext <48 x i8> %a to <48 x i32>
2860 %r1 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %az)
; Second reduction: same pattern on %c.
2861 %cz = sext <48 x i8> %c to <48 x i32>
2862 %r2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %cz)
; Combine the two scalar sums.
2863 %x = add i32 %r1, %r2
; test_udot_v64i8: unsigned 64-element dot product plus an incoming scalar.
; Loads <64 x i8> from %a and %b, zero-extends both to <64 x i32>, multiplies
; them element-wise, add-reduces the product and adds %sum.
; The expected assembly uses four udot instructions over two zeroed
; accumulators (v0, v1), fed by ldp pairs at offsets #32 and #0 of x0/x1,
; followed by add/addv and a scalar add of w2.
2867 define i32 @test_udot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
2868 ; CHECK-LABEL: test_udot_v64i8:
2869 ; CHECK: // %bb.0: // %entry
2870 ; CHECK-NEXT: movi v0.2d, #0000000000000000
2871 ; CHECK-NEXT: movi v1.2d, #0000000000000000
2872 ; CHECK-NEXT: ldp q2, q3, [x0, #32]
2873 ; CHECK-NEXT: ldp q4, q5, [x1, #32]
2874 ; CHECK-NEXT: udot v1.4s, v5.16b, v3.16b
2875 ; CHECK-NEXT: udot v0.4s, v4.16b, v2.16b
2876 ; CHECK-NEXT: ldp q2, q3, [x0]
2877 ; CHECK-NEXT: ldp q4, q5, [x1]
2878 ; CHECK-NEXT: udot v1.4s, v5.16b, v3.16b
2879 ; CHECK-NEXT: udot v0.4s, v4.16b, v2.16b
2880 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
2881 ; CHECK-NEXT: addv s0, v0.4s
2882 ; CHECK-NEXT: fmov w8, s0
2883 ; CHECK-NEXT: add w0, w8, w2
; Load and zero-extend both 64-byte operands.
2886 %0 = load <64 x i8>, ptr %a
2887 %1 = zext <64 x i8> %0 to <64 x i32>
2888 %2 = load <64 x i8>, ptr %b
2889 %3 = zext <64 x i8> %2 to <64 x i32>
; Element-wise product, horizontal sum, then fold in %sum.
2890 %4 = mul nuw nsw <64 x i32> %3, %1
2891 %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
2892 %op.extra = add i32 %5, %sum
; test_udot_v64i8_nomla: unsigned add-reduction of 64 bytes with no multiply.
; Loads <64 x i8> from %a1, zero-extends to <64 x i32> and add-reduces it.
; The expected assembly still uses udot: each 16-byte chunk is dotted with an
; all-ones byte splat (movi v0.16b, #1) into two zeroed accumulators (v1, v2),
; which are then combined with add/addv.
2896 define i32 @test_udot_v64i8_nomla(ptr nocapture readonly %a1) {
2897 ; CHECK-LABEL: test_udot_v64i8_nomla:
2898 ; CHECK: // %bb.0: // %entry
2899 ; CHECK-NEXT: movi v0.16b, #1
2900 ; CHECK-NEXT: movi v1.2d, #0000000000000000
2901 ; CHECK-NEXT: movi v2.2d, #0000000000000000
2902 ; CHECK-NEXT: ldp q3, q4, [x0, #32]
2903 ; CHECK-NEXT: udot v2.4s, v4.16b, v0.16b
2904 ; CHECK-NEXT: udot v1.4s, v3.16b, v0.16b
2905 ; CHECK-NEXT: ldp q3, q4, [x0]
2906 ; CHECK-NEXT: udot v2.4s, v4.16b, v0.16b
2907 ; CHECK-NEXT: udot v1.4s, v3.16b, v0.16b
2908 ; CHECK-NEXT: add v0.4s, v1.4s, v2.4s
2909 ; CHECK-NEXT: addv s0, v0.4s
2910 ; CHECK-NEXT: fmov w0, s0
; Load, widen without sign, and sum all 64 lanes.
2913 %0 = load <64 x i8>, ptr %a1
2914 %1 = zext <64 x i8> %0 to <64 x i32>
2915 %2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %1)
; test_sdot_v64i8: signed 64-element dot product plus an incoming scalar.
; Loads <64 x i8> from %a and %b, sign-extends both to <64 x i32>, multiplies
; them element-wise, add-reduces the product and adds %sum.
; The expected assembly uses four sdot instructions over two zeroed
; accumulators (v0, v1), fed by ldp pairs at offsets #32 and #0 of x0/x1,
; followed by add/addv and a scalar add of w2.
2918 define i32 @test_sdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
2919 ; CHECK-LABEL: test_sdot_v64i8:
2920 ; CHECK: // %bb.0: // %entry
2921 ; CHECK-NEXT: movi v0.2d, #0000000000000000
2922 ; CHECK-NEXT: movi v1.2d, #0000000000000000
2923 ; CHECK-NEXT: ldp q2, q3, [x0, #32]
2924 ; CHECK-NEXT: ldp q4, q5, [x1, #32]
2925 ; CHECK-NEXT: sdot v1.4s, v5.16b, v3.16b
2926 ; CHECK-NEXT: sdot v0.4s, v4.16b, v2.16b
2927 ; CHECK-NEXT: ldp q2, q3, [x0]
2928 ; CHECK-NEXT: ldp q4, q5, [x1]
2929 ; CHECK-NEXT: sdot v1.4s, v5.16b, v3.16b
2930 ; CHECK-NEXT: sdot v0.4s, v4.16b, v2.16b
2931 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
2932 ; CHECK-NEXT: addv s0, v0.4s
2933 ; CHECK-NEXT: fmov w8, s0
2934 ; CHECK-NEXT: add w0, w8, w2
; Load and sign-extend both 64-byte operands.
2937 %0 = load <64 x i8>, ptr %a
2938 %1 = sext <64 x i8> %0 to <64 x i32>
2939 %2 = load <64 x i8>, ptr %b
2940 %3 = sext <64 x i8> %2 to <64 x i32>
; Element-wise product, horizontal sum, then fold in %sum.
2941 %4 = mul nsw <64 x i32> %3, %1
2942 %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
2943 %op.extra = add nsw i32 %5, %sum
; test_sdot_v64i8_double: sum of two signed 64-element dot products whose
; operands are passed by value. Computes reduce.add(sext(%a) * sext(%b)) and
; reduce.add(sext(%c) * sext(%d)) and adds the two scalars.
; The expected assembly uses eight sdot instructions over four zeroed
; accumulators (v16-v19); operands come from v0-v7 and from ldp loads at
; sp-relative offsets, and the accumulators are merged by a tree of adds
; before addv.
2947 define i32 @test_sdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
2948 ; CHECK-LABEL: test_sdot_v64i8_double:
2949 ; CHECK: // %bb.0: // %entry
2950 ; CHECK-NEXT: movi v16.2d, #0000000000000000
2951 ; CHECK-NEXT: movi v17.2d, #0000000000000000
2952 ; CHECK-NEXT: movi v18.2d, #0000000000000000
2953 ; CHECK-NEXT: movi v19.2d, #0000000000000000
2954 ; CHECK-NEXT: ldp q20, q21, [sp, #96]
2955 ; CHECK-NEXT: ldp q22, q23, [sp, #32]
2956 ; CHECK-NEXT: sdot v16.4s, v3.16b, v7.16b
2957 ; CHECK-NEXT: sdot v18.4s, v2.16b, v6.16b
2958 ; CHECK-NEXT: sdot v19.4s, v23.16b, v21.16b
2959 ; CHECK-NEXT: sdot v17.4s, v22.16b, v20.16b
2960 ; CHECK-NEXT: ldp q2, q3, [sp, #64]
2961 ; CHECK-NEXT: ldp q6, q7, [sp]
2962 ; CHECK-NEXT: sdot v16.4s, v1.16b, v5.16b
2963 ; CHECK-NEXT: sdot v18.4s, v0.16b, v4.16b
2964 ; CHECK-NEXT: sdot v19.4s, v7.16b, v3.16b
2965 ; CHECK-NEXT: sdot v17.4s, v6.16b, v2.16b
2966 ; CHECK-NEXT: add v0.4s, v18.4s, v16.4s
2967 ; CHECK-NEXT: add v1.4s, v17.4s, v19.4s
2968 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
2969 ; CHECK-NEXT: addv s0, v0.4s
2970 ; CHECK-NEXT: fmov w0, s0
; First dot product: sext(%a) * sext(%b), summed across all 64 lanes.
2973 %az = sext <64 x i8> %a to <64 x i32>
2974 %bz = sext <64 x i8> %b to <64 x i32>
2975 %m1 = mul nuw nsw <64 x i32> %az, %bz
2976 %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m1)
; Second dot product: sext(%c) * sext(%d), summed across all 64 lanes.
2977 %cz = sext <64 x i8> %c to <64 x i32>
2978 %dz = sext <64 x i8> %d to <64 x i32>
2979 %m2 = mul nuw nsw <64 x i32> %cz, %dz
2980 %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m2)
; Combine the two scalar results.
2981 %x = add i32 %r1, %r2
; test_sdot_v64i8_double_nomla: two independent signed add-reductions with no
; multiply. %a and %c are each sign-extended from <64 x i8> to <64 x i32>,
; reduced with llvm.vector.reduce.add.v64i32, and the two scalars are added.
; %b and %d are not referenced by the IR visible here.
; The expected assembly uses eight sdot instructions against an all-ones byte
; splat (movi v4.16b, #1) over four zeroed accumulators (v5, v6, v7, v16),
; merged by a tree of adds before addv.
2985 define i32 @test_sdot_v64i8_double_nomla(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
2986 ; CHECK-LABEL: test_sdot_v64i8_double_nomla:
2987 ; CHECK: // %bb.0: // %entry
2988 ; CHECK-NEXT: movi v4.16b, #1
2989 ; CHECK-NEXT: movi v5.2d, #0000000000000000
2990 ; CHECK-NEXT: movi v6.2d, #0000000000000000
2991 ; CHECK-NEXT: movi v7.2d, #0000000000000000
2992 ; CHECK-NEXT: ldp q17, q18, [sp, #32]
2993 ; CHECK-NEXT: movi v16.2d, #0000000000000000
2994 ; CHECK-NEXT: sdot v5.4s, v3.16b, v4.16b
2995 ; CHECK-NEXT: sdot v6.4s, v17.16b, v4.16b
2996 ; CHECK-NEXT: sdot v7.4s, v2.16b, v4.16b
2997 ; CHECK-NEXT: ldp q2, q3, [sp]
2998 ; CHECK-NEXT: sdot v16.4s, v18.16b, v4.16b
2999 ; CHECK-NEXT: sdot v5.4s, v1.16b, v4.16b
3000 ; CHECK-NEXT: sdot v6.4s, v2.16b, v4.16b
3001 ; CHECK-NEXT: sdot v7.4s, v0.16b, v4.16b
3002 ; CHECK-NEXT: sdot v16.4s, v3.16b, v4.16b
3003 ; CHECK-NEXT: add v0.4s, v7.4s, v5.4s
3004 ; CHECK-NEXT: add v1.4s, v6.4s, v16.4s
3005 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
3006 ; CHECK-NEXT: addv s0, v0.4s
3007 ; CHECK-NEXT: fmov w0, s0
; First reduction: widen %a with sign extension and sum all 64 lanes.
3010 %az = sext <64 x i8> %a to <64 x i32>
3011 %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %az)
; Second reduction: same pattern on %c.
3012 %cz = sext <64 x i8> %c to <64 x i32>
3013 %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %cz)
; Combine the two scalar sums.
3014 %x = add i32 %r1, %r2