1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2 ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM
3 ; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT
4 ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM
; udot: zext both <16 x i8> inputs, multiply, then partially reduce the
; <16 x i32> product into the <4 x i32> accumulator. With +dotprod this
; selects a single udot (CHECK-DOT); otherwise a umull/uaddw widening
; chain (CHECK-NODOT).
define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-DOT-LABEL: udot:
; CHECK-DOT-NEXT: udot v0.4s, v2.16b, v1.16b
; CHECK-NODOT-LABEL: udot:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
  %u.wide = zext <16 x i8> %u to <16 x i32>
  %s.wide = zext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
; udot_narrow: 64-bit-vector form — zext <8 x i8> inputs, multiply, partially
; reduce the <8 x i32> product into the <2 x i32> accumulator
; (single udot v0.2s with +dotprod).
; Fix: intrinsic mangling suffix corrected to match the actual overload
; (<2 x i32> result, <8 x i32> operand) — .v2i32.v8i32, not .v4i32.v16i32.
define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
; CHECK-DOT-LABEL: udot_narrow:
; CHECK-DOT: // %bb.0:
; CHECK-DOT-NEXT: udot v0.2s, v2.8b, v1.8b
; CHECK-NODOT-LABEL: udot_narrow:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT: ret
  %u.wide = zext <8 x i8> %u to <8 x i32>
  %s.wide = zext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
; sdot: signed variant of @udot — sext both inputs; selects sdot with
; +dotprod, otherwise an smull/saddw widening chain.
define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-DOT-LABEL: sdot:
; CHECK-DOT: // %bb.0:
; CHECK-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b
; CHECK-NODOT-LABEL: sdot:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
  %u.wide = sext <16 x i8> %u to <16 x i32>
  %s.wide = sext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
; sdot_narrow: signed 64-bit-vector variant — sext <8 x i8> inputs, multiply,
; partially reduce <8 x i32> into the <2 x i32> accumulator (sdot v0.2s
; with +dotprod).
; Fix: intrinsic mangling suffix corrected to match the actual overload
; (<2 x i32> result, <8 x i32> operand) — .v2i32.v8i32, not .v4i32.v16i32.
define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
; CHECK-DOT-LABEL: sdot_narrow:
; CHECK-DOT: // %bb.0:
; CHECK-DOT-NEXT: sdot v0.2s, v2.8b, v1.8b
; CHECK-NODOT-LABEL: sdot_narrow:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT: ret
  %u.wide = sext <8 x i8> %u to <8 x i32>
  %s.wide = sext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
; usdot: mixed-sign dot product — zext %u, sext %s. Requires +i8mm for the
; usdot instruction (CHECK-I8MM); without it, widening smull/smlal chains
; (CHECK-NOI8MM).
define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-NOI8MM-LABEL: usdot:
; CHECK-NOI8MM: // %bb.0:
; CHECK-NOI8MM-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: smlal v0.4s, v4.4h, v3.4h
; CHECK-NOI8MM-NEXT: smull v5.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT: smlal2 v5.4s, v4.8h, v3.8h
; CHECK-NOI8MM-NEXT: add v0.4s, v5.4s, v0.4s
; CHECK-NOI8MM-NEXT: ret
; CHECK-I8MM-LABEL: usdot:
; CHECK-I8MM: // %bb.0:
; CHECK-I8MM-NEXT: usdot v0.4s, v1.16b, v2.16b
; CHECK-I8MM-NEXT: ret
  %u.wide = zext <16 x i8> %u to <16 x i32>
  %s.wide = sext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
; usdot_narrow: mixed-sign 64-bit-vector variant (zext %u, sext %s);
; usdot v0.2s with +i8mm, widening smull/smlal chain otherwise.
; Fix: intrinsic mangling suffix corrected to match the actual overload
; (<2 x i32> result, <8 x i32> operand) — .v2i32.v8i32, not .v4i32.v16i32.
define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
; CHECK-NOI8MM-LABEL: usdot_narrow:
; CHECK-NOI8MM: // %bb.0:
; CHECK-NOI8MM-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NOI8MM-NEXT: smull v3.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT: smull2 v4.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT: ext v5.16b, v1.16b, v1.16b, #8
; CHECK-NOI8MM-NEXT: ext v6.16b, v2.16b, v2.16b, #8
; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NOI8MM-NEXT: ext v1.16b, v4.16b, v4.16b, #8
; CHECK-NOI8MM-NEXT: smlal v3.4s, v6.4h, v5.4h
; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NOI8MM-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NOI8MM-NEXT: ret
; CHECK-I8MM-LABEL: usdot_narrow:
; CHECK-I8MM: // %bb.0:
; CHECK-I8MM-NEXT: usdot v0.2s, v1.8b, v2.8b
; CHECK-I8MM-NEXT: ret
  %u.wide = zext <8 x i8> %u to <8 x i32>
  %s.wide = sext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
; sudot: mixed-sign with roles swapped relative to @usdot (sext %u, zext %s);
; with +i8mm this still lowers to usdot, with the operand order swapped
; (v2 before v1) to put the unsigned operand in the usdot unsigned slot.
define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
; CHECK-NOI8MM-LABEL: sudot:
; CHECK-NOI8MM: // %bb.0:
; CHECK-NOI8MM-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: smlal v0.4s, v4.4h, v3.4h
; CHECK-NOI8MM-NEXT: smull v5.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT: smlal2 v5.4s, v4.8h, v3.8h
; CHECK-NOI8MM-NEXT: add v0.4s, v5.4s, v0.4s
; CHECK-NOI8MM-NEXT: ret
; CHECK-I8MM-LABEL: sudot:
; CHECK-I8MM: // %bb.0:
; CHECK-I8MM-NEXT: usdot v0.4s, v2.16b, v1.16b
; CHECK-I8MM-NEXT: ret
  %u.wide = sext <16 x i8> %u to <16 x i32>
  %s.wide = zext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
; sudot_narrow: 64-bit-vector variant of @sudot (sext %u, zext %s); lowers to
; usdot v0.2s with swapped operands under +i8mm.
; Fix: intrinsic mangling suffix corrected to match the actual overload
; (<2 x i32> result, <8 x i32> operand) — .v2i32.v8i32, not .v4i32.v16i32.
define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
; CHECK-NOI8MM-LABEL: sudot_narrow:
; CHECK-NOI8MM: // %bb.0:
; CHECK-NOI8MM-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NOI8MM-NEXT: smull v3.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT: smull2 v4.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT: ext v5.16b, v1.16b, v1.16b, #8
; CHECK-NOI8MM-NEXT: ext v6.16b, v2.16b, v2.16b, #8
; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NOI8MM-NEXT: ext v1.16b, v4.16b, v4.16b, #8
; CHECK-NOI8MM-NEXT: smlal v3.4s, v6.4h, v5.4h
; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NOI8MM-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NOI8MM-NEXT: ret
; CHECK-I8MM-LABEL: sudot_narrow:
; CHECK-I8MM: // %bb.0:
; CHECK-I8MM-NEXT: usdot v0.2s, v2.8b, v1.8b
; CHECK-I8MM-NEXT: ret
  %u.wide = sext <8 x i8> %u to <8 x i32>
  %s.wide = zext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
; udot_8to64: i8 -> i64 partial reduction. With +dotprod the i32 dot product
; (udot into a zeroed vector) is followed by saddw/saddw2 to widen the
; partial sums into the <4 x i64> accumulator.
define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-DOT-LABEL: udot_8to64:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT: udot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT: ret
; CHECK-NODOT-LABEL: udot_8to64:
; CHECK-NODOT: // %bb.0: // %entry
; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0
; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s
; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s
; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s
; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
; CHECK-NODOT-NEXT: ret
  %a.wide = zext <16 x i8> %a to <16 x i64>
  %b.wide = zext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
; sdot_8to64: signed variant of @udot_8to64 — sdot into a zeroed vector, then
; saddw/saddw2 widening into the <4 x i64> accumulator.
define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
; CHECK-DOT-LABEL: sdot_8to64:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT: sdot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT: ret
; CHECK-NODOT-LABEL: sdot_8to64:
; CHECK-NODOT: // %bb.0: // %entry
; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b
; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b
; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0
; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s
; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s
; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s
; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
; CHECK-NODOT-NEXT: ret
  %a.wide = sext <16 x i8> %a to <16 x i64>
  %b.wide = sext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
; usdot_8to64: mixed-sign i8 -> i64 partial reduction (zext %a, sext %b);
; usdot + saddw widening with +i8mm, fully-widened smull/smlal chains without.
define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0
; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT: ushll v6.4s, v4.4h, #0
; CHECK-NOI8MM-NEXT: sshll v7.4s, v5.4h, #0
; CHECK-NOI8MM-NEXT: ushll2 v4.4s, v4.8h, #0
; CHECK-NOI8MM-NEXT: sshll2 v5.4s, v5.8h, #0
; CHECK-NOI8MM-NEXT: ushll2 v16.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT: sshll2 v17.4s, v3.8h, #0
; CHECK-NOI8MM-NEXT: ushll v2.4s, v2.4h, #0
; CHECK-NOI8MM-NEXT: sshll v3.4s, v3.4h, #0
; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v6.4s, v7.4s
; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s
; CHECK-NOI8MM-NEXT: smull v18.2d, v4.2s, v5.2s
; CHECK-NOI8MM-NEXT: smull2 v4.2d, v4.4s, v5.4s
; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v16.4s, v17.4s
; CHECK-NOI8MM-NEXT: smlal v0.2d, v16.2s, v17.2s
; CHECK-NOI8MM-NEXT: smlal2 v4.2d, v2.4s, v3.4s
; CHECK-NOI8MM-NEXT: smlal v18.2d, v2.2s, v3.2s
; CHECK-NOI8MM-NEXT: add v1.2d, v4.2d, v1.2d
; CHECK-NOI8MM-NEXT: add v0.2d, v18.2d, v0.2d
; CHECK-NOI8MM-NEXT: ret
; CHECK-I8MM-LABEL: usdot_8to64:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000
; CHECK-I8MM-NEXT: usdot v4.4s, v2.16b, v3.16b
; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s
; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s
; CHECK-I8MM-NEXT: ret
  %a.wide = zext <16 x i8> %a to <16 x i64>
  %b.wide = sext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
; sudot_8to64: mixed-sign with swapped roles (sext %a, zext %b); +i8mm lowers
; to usdot with operands swapped (v3 before v2) so the zext input lands in
; the unsigned slot.
define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0
; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: ushll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT: sshll v6.4s, v4.4h, #0
; CHECK-NOI8MM-NEXT: ushll v7.4s, v5.4h, #0
; CHECK-NOI8MM-NEXT: sshll2 v4.4s, v4.8h, #0
; CHECK-NOI8MM-NEXT: ushll2 v5.4s, v5.8h, #0
; CHECK-NOI8MM-NEXT: sshll2 v16.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT: ushll2 v17.4s, v3.8h, #0
; CHECK-NOI8MM-NEXT: sshll v2.4s, v2.4h, #0
; CHECK-NOI8MM-NEXT: ushll v3.4s, v3.4h, #0
; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v6.4s, v7.4s
; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s
; CHECK-NOI8MM-NEXT: smull v18.2d, v4.2s, v5.2s
; CHECK-NOI8MM-NEXT: smull2 v4.2d, v4.4s, v5.4s
; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v16.4s, v17.4s
; CHECK-NOI8MM-NEXT: smlal v0.2d, v16.2s, v17.2s
; CHECK-NOI8MM-NEXT: smlal2 v4.2d, v2.4s, v3.4s
; CHECK-NOI8MM-NEXT: smlal v18.2d, v2.2s, v3.2s
; CHECK-NOI8MM-NEXT: add v1.2d, v4.2d, v1.2d
; CHECK-NOI8MM-NEXT: add v0.2d, v18.2d, v0.2d
; CHECK-NOI8MM-NEXT: ret
; CHECK-I8MM-LABEL: sudot_8to64:
; CHECK-I8MM: // %bb.0: // %entry
; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000
; CHECK-I8MM-NEXT: usdot v4.4s, v3.16b, v2.16b
; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s
; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s
; CHECK-I8MM-NEXT: ret
  %a.wide = sext <16 x i8> %a to <16 x i64>
  %b.wide = zext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
; udot_no_bin_op: partial reduction of a plain zext with no multiply; with
; +dotprod this is a udot against a splat-of-1 vector (movi v2.16b, #1).
define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
; CHECK-DOT-LABEL: udot_no_bin_op:
; CHECK-DOT: // %bb.0:
; CHECK-DOT-NEXT: movi v2.16b, #1
; CHECK-DOT-NEXT: udot v0.4s, v1.16b, v2.16b
; CHECK-DOT-NEXT: ret
; CHECK-NODOT-LABEL: udot_no_bin_op:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: ushll v2.8h, v1.8b, #0
; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-NODOT-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v2.4h
; CHECK-NODOT-NEXT: uaddw2 v2.4s, v3.4s, v2.8h
; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
  %a.wide = zext <16 x i8> %a to <16 x i32>
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
  ret <4 x i32> %partial.reduce
; sdot_no_bin_op: signed variant of @udot_no_bin_op — sdot against a
; splat-of-1 vector when +dotprod is available.
define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
; CHECK-DOT-LABEL: sdot_no_bin_op:
; CHECK-DOT: // %bb.0:
; CHECK-DOT-NEXT: movi v2.16b, #1
; CHECK-DOT-NEXT: sdot v0.4s, v1.16b, v2.16b
; CHECK-DOT-NEXT: ret
; CHECK-NODOT-LABEL: sdot_no_bin_op:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: sshll v2.8h, v1.8b, #0
; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-NODOT-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v2.4h
; CHECK-NODOT-NEXT: saddw2 v2.4s, v3.4s, v2.8h
; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
  %a.wide = sext <16 x i8> %a to <16 x i32>
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
  ret <4 x i32> %partial.reduce
; udot_no_bin_op_narrow: 64-bit-vector form of @udot_no_bin_op — udot v0.2s
; against a splat-of-1 <8 x i8> when +dotprod is available.
define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
; CHECK-DOT: // %bb.0:
; CHECK-DOT-NEXT: movi v2.8b, #1
; CHECK-DOT-NEXT: udot v0.2s, v1.8b, v2.8b
; CHECK-DOT-NEXT: ret
; CHECK-NODOT-LABEL: udot_no_bin_op_narrow:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT: ret
  %a.wide = zext <8 x i8> %a to <8 x i32>
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
  ret <2 x i32> %partial.reduce
; sdot_no_bin_op_narrow: signed 64-bit-vector form — sdot v0.2s against a
; splat-of-1 <8 x i8> when +dotprod is available.
define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
; CHECK-DOT: // %bb.0:
; CHECK-DOT-NEXT: movi v2.8b, #1
; CHECK-DOT-NEXT: sdot v0.2s, v1.8b, v2.8b
; CHECK-DOT-NEXT: ret
; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT: ret
  %a.wide = sext <8 x i8> %a to <8 x i32>
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
  ret <2 x i32> %partial.reduce
; udot_no_bin_op_8to64: no-multiply i8 -> i64 reduction; with +dotprod, udot
; against splat-of-1 into a zeroed vector, then saddw/saddw2 widening into
; the <4 x i64> accumulator.
define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
; CHECK-DOT-LABEL: udot_no_bin_op_8to64:
; CHECK-DOT: // %bb.0:
; CHECK-DOT-NEXT: movi v3.16b, #1
; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT: udot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT: ret
; CHECK-NODOT-LABEL: udot_no_bin_op_8to64:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0
; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
; CHECK-NODOT-NEXT: ushll v4.4s, v3.4h, #0
; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v4.4s
; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s
; CHECK-NODOT-NEXT: uaddl2 v4.2d, v3.4s, v5.4s
; CHECK-NODOT-NEXT: uaddl v3.2d, v3.2s, v5.2s
; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT: add v1.2d, v4.2d, v1.2d
; CHECK-NODOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-NODOT-NEXT: ret
  %a.wide = zext <16 x i8> %a to <16 x i64>
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
  ret <4 x i64> %partial.reduce
; sdot_no_bin_op_8to64: signed variant of @udot_no_bin_op_8to64 — sdot
; against splat-of-1 into a zeroed vector, then saddw/saddw2 widening.
define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
; CHECK-DOT-LABEL: sdot_no_bin_op_8to64:
; CHECK-DOT: // %bb.0:
; CHECK-DOT-NEXT: movi v3.16b, #1
; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT: sdot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT: ret
; CHECK-NODOT-LABEL: sdot_no_bin_op_8to64:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: sshll v3.8h, v2.8b, #0
; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
; CHECK-NODOT-NEXT: sshll v4.4s, v3.4h, #0
; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: sshll2 v3.4s, v3.8h, #0
; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s
; CHECK-NODOT-NEXT: saddl2 v4.2d, v3.4s, v5.4s
; CHECK-NODOT-NEXT: saddl v3.2d, v3.2s, v5.2s
; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT: add v1.2d, v4.2d, v1.2d
; CHECK-NODOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-NODOT-NEXT: ret
  %a.wide = sext <16 x i8> %a to <16 x i64>
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
  ret <4 x i64> %partial.reduce
; not_udot: negative test — an <8 x i32> -> <4 x i32> reduction (factor 2,
; not 4) cannot use udot, so only the widening umull/uaddw chain is checked.
; Fix: intrinsic mangling suffix corrected to match the actual overload
; (<4 x i32> result, <8 x i32> operand) — .v4i32.v8i32, not .v4i32.v16i32.
define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
; CHECK-LABEL: not_udot:
; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
  %u.wide = zext <8 x i8> %u to <8 x i32>
  %s.wide = zext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> %acc, <8 x i32> %mult)
  ret <4 x i32> %partial.reduce
; not_udot_narrow: negative test — <4 x i8> inputs (masked to 8 bits via bic)
; with a factor-2 reduction; no dot-product form applies.
; Fix: intrinsic mangling suffix corrected to match the actual overload
; (<2 x i32> result, <4 x i32> operand) — .v2i32.v4i32, not .v4i32.v16i32.
define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
; CHECK-LABEL: not_udot_narrow:
; CHECK-NEXT: bic v1.4h, #255, lsl #8
; CHECK-NEXT: bic v2.4h, #255, lsl #8
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: umull v3.4s, v2.4h, v1.4h
; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
; CHECK-NEXT: ext v1.16b, v3.16b, v3.16b, #8
; CHECK-NEXT: add v0.2s, v1.2s, v0.2s
  %u.wide = zext <4 x i8> %u to <4 x i32>
  %s.wide = zext <4 x i8> %s to <4 x i32>
  %mult = mul nuw nsw <4 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v4i32(<2 x i32> %acc, <4 x i32> %mult)
  ret <2 x i32> %partial.reduce
; udot_different_types: mixed element widths (i16 x i8) — no dot instruction
; applies, so all run lines share a single widening umull/umlal sequence.
define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
; CHECK-LABEL: udot_different_types:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-NEXT: umull v5.2d, v1.2s, v2.2s
; CHECK-NEXT: umlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT: umlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
  %a.wide = zext <8 x i16> %a to <8 x i64>
  %b.wide = zext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce
; sdot_different_types: signed variant of @udot_different_types — mixed
; element widths keep this on the widening smull/smlal path.
define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
; CHECK-LABEL: sdot_different_types:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
  %a.wide = sext <8 x i16> %a to <8 x i64>
  %b.wide = sext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce
; usdot_different_types: mixed sign AND mixed widths (zext i16, sext i8) —
; usdot does not apply, so the widening chain is used on every run line.
define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
; CHECK-LABEL: usdot_different_types:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
  %a.wide = zext <8 x i16> %a to <8 x i64>
  %b.wide = sext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce
; sudot_different_types: swapped-sign counterpart of @usdot_different_types
; (sext i16, zext i8) — likewise stays on the widening chain.
define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
; CHECK-LABEL: sudot_different_types:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
  %a.wide = sext <8 x i16> %a to <8 x i64>
  %b.wide = zext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce