1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+dotprod < %s | FileCheck %s
3 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65 < %s | FileCheck %s
4 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65ae < %s | FileCheck %s
5 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-e1 < %s | FileCheck %s
6 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n1 < %s | FileCheck %s
7 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2 < %s | FileCheck %s
8 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1 < %s | FileCheck %s
9 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1a < %s | FileCheck %s
10 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1b < %s | FileCheck %s
; Dot-product intrinsics exercised by the tests in this file: an unsigned
; (udot) and a signed (sdot) flavour, each declared in a <2 x i32>/<8 x i8>
; form and a <4 x i32>/<16 x i8> form. The first operand has the same type as
; the result; the remaining two operands are the i8 vectors.
12 declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
13 declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
14 declare <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
15 declare <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
; Plain vector form, unsigned, <2 x i32> result: the three function arguments
; are passed straight to the udot intrinsic and its result is returned. The
; expected output begins with a single vector-by-vector udot on v0/v1/v2.
17 define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
18 ; CHECK-LABEL: test_vdot_u32:
19 ; CHECK: // %bb.0: // %entry
20 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.8b
23 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
24 ret <2 x i32> %vdot1.i
; Plain vector form, unsigned, <4 x i32> result: the three function arguments
; are passed straight to the udot intrinsic and its result is returned. The
; expected output begins with a single udot using the .4s/.16b arrangements.
27 define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
28 ; CHECK-LABEL: test_vdotq_u32:
29 ; CHECK: // %bb.0: // %entry
30 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.16b
33 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
34 ret <4 x i32> %vdot1.i
; Plain vector form, signed, <2 x i32> result: the three function arguments
; are passed straight to the sdot intrinsic and its result is returned. The
; expected output begins with a single vector-by-vector sdot on v0/v1/v2.
37 define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
38 ; CHECK-LABEL: test_vdot_s32:
39 ; CHECK: // %bb.0: // %entry
40 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.8b
43 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
44 ret <2 x i32> %vdot1.i
; Plain vector form, signed, <4 x i32> result: the three function arguments
; are passed straight to the sdot intrinsic and its result is returned. The
; expected output begins with a single sdot using the .4s/.16b arrangements.
47 define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
48 ; CHECK-LABEL: test_vdotq_s32:
49 ; CHECK: // %bb.0: // %entry
50 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.16b
53 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
54 ret <4 x i32> %vdot1.i
; Zero-accumulator variant, unsigned, <2 x i32>: the udot intrinsic is called
; with a zeroinitializer first operand and %a is added to its result
; afterwards. The expected output still begins directly with a udot whose
; destination is v0; no instruction is expected before it.
58 define <2 x i32> @test_vdot_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
59 ; CHECK-LABEL: test_vdot_u32_zero:
60 ; CHECK: // %bb.0: // %entry
61 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.8b
64 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
65 %ret = add <2 x i32> %vdot1.i, %a
; Zero-accumulator variant, unsigned, <4 x i32>: the udot intrinsic is called
; with a zeroinitializer first operand and %a is added to its result
; afterwards. The expected output still begins directly with a udot whose
; destination is v0; no instruction is expected before it.
69 define <4 x i32> @test_vdotq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
70 ; CHECK-LABEL: test_vdotq_u32_zero:
71 ; CHECK: // %bb.0: // %entry
72 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.16b
75 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
76 %ret = add <4 x i32> %vdot1.i, %a
; Zero-accumulator variant, signed, <2 x i32>: the sdot intrinsic is called
; with a zeroinitializer first operand and %a is added to its result
; afterwards. The expected output still begins directly with an sdot whose
; destination is v0; no instruction is expected before it.
80 define <2 x i32> @test_vdot_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
81 ; CHECK-LABEL: test_vdot_s32_zero:
82 ; CHECK: // %bb.0: // %entry
83 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.8b
86 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
87 %ret = add <2 x i32> %vdot1.i, %a
; Zero-accumulator variant, signed, <4 x i32>: the sdot intrinsic is called
; with a zeroinitializer first operand and %a is added to its result
; afterwards. The expected output still begins directly with an sdot whose
; destination is v0; no instruction is expected before it.
91 define <4 x i32> @test_vdotq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
92 ; CHECK-LABEL: test_vdotq_s32_zero:
93 ; CHECK: // %bb.0: // %entry
94 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.16b
97 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
98 %ret = add <4 x i32> %vdot1.i, %a
; By-element form, unsigned, <2 x i32> result, <8 x i8> %c: %c is reinterpreted
; as <2 x i32>, element 1 is splatted by the shufflevector, and the splat is
; cast back to <8 x i8> for the third udot operand. The expected output uses
; the indexed operand v2.4b[1], preceded by the register-kill annotation that
; redefines $d2 as $q2.
103 define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
104 ; CHECK-LABEL: test_vdot_lane_u32:
105 ; CHECK: // %bb.0: // %entry
106 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
107 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
110 %.cast = bitcast <8 x i8> %c to <2 x i32>
111 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
112 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
113 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
114 ret <2 x i32> %vdot1.i
; By-element form, unsigned, <4 x i32> result, <8 x i8> %c: %c is reinterpreted
; as <2 x i32>, element 1 is splatted to four lanes by the shufflevector, and
; the splat is cast to <16 x i8> for the third udot operand. The expected
; output uses the indexed operand v2.4b[1], preceded by the register-kill
; annotation that redefines $d2 as $q2.
117 define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
118 ; CHECK-LABEL: test_vdotq_lane_u32:
119 ; CHECK: // %bb.0: // %entry
120 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
121 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
124 %.cast = bitcast <8 x i8> %c to <2 x i32>
125 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
126 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
127 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
128 ret <4 x i32> %vdot1.i
; By-element form, unsigned, <2 x i32> result, <16 x i8> %c: %c is
; reinterpreted as <4 x i32>, element 1 is splatted to two lanes by the
; shufflevector, and the splat is cast to <8 x i8> for the third udot operand.
; The expected output begins with a udot using the indexed operand v2.4b[1].
131 define <2 x i32> @test_vdot_laneq_u32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
132 ; CHECK-LABEL: test_vdot_laneq_u32:
133 ; CHECK: // %bb.0: // %entry
134 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
137 %.cast = bitcast <16 x i8> %c to <4 x i32>
138 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
139 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
140 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
141 ret <2 x i32> %vdot1.i
; By-element form, unsigned, <4 x i32> result, <16 x i8> %c: %c is
; reinterpreted as <4 x i32>, element 1 is splatted by the shufflevector, and
; the splat is cast back to <16 x i8> for the third udot operand. The expected
; output begins with a udot using the indexed operand v2.4b[1].
144 define <4 x i32> @test_vdotq_laneq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
145 ; CHECK-LABEL: test_vdotq_laneq_u32:
146 ; CHECK: // %bb.0: // %entry
147 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
150 %.cast = bitcast <16 x i8> %c to <4 x i32>
151 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
152 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
153 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
154 ret <4 x i32> %vdot1.i
; By-element, zero-accumulator variant, unsigned, <2 x i32>, <8 x i8> %c:
; element 1 of %c (viewed as <2 x i32>) is splatted and fed to udot with a
; zeroinitializer first operand; %a is added to the result afterwards. The
; expected output has the $d2/$q2 register-kill annotation followed directly
; by an indexed udot (v2.4b[1]) whose destination is v0.
158 define <2 x i32> @test_vdot_lane_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
159 ; CHECK-LABEL: test_vdot_lane_u32_zero:
160 ; CHECK: // %bb.0: // %entry
161 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
162 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
165 %.cast = bitcast <8 x i8> %c to <2 x i32>
166 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
167 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
168 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
169 %ret = add <2 x i32> %vdot1.i, %a
; By-element, zero-accumulator variant, unsigned, <4 x i32>, <8 x i8> %c:
; element 1 of %c (viewed as <2 x i32>) is splatted to four lanes and fed to
; udot with a zeroinitializer first operand; %a is added to the result
; afterwards. The expected output has the $d2/$q2 register-kill annotation
; followed directly by an indexed udot (v2.4b[1]) whose destination is v0.
173 define <4 x i32> @test_vdotq_lane_u32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
174 ; CHECK-LABEL: test_vdotq_lane_u32_zero:
175 ; CHECK: // %bb.0: // %entry
176 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
177 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
180 %.cast = bitcast <8 x i8> %c to <2 x i32>
181 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
182 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
183 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
184 %ret = add <4 x i32> %vdot1.i, %a
; By-element, zero-accumulator variant, unsigned, <2 x i32>, <16 x i8> %c:
; element 1 of %c (viewed as <4 x i32>) is splatted to two lanes and fed to
; udot with a zeroinitializer first operand; %a is added to the result
; afterwards. The expected output begins directly with an indexed udot
; (v2.4b[1]) whose destination is v0.
188 define <2 x i32> @test_vdot_laneq_u32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
189 ; CHECK-LABEL: test_vdot_laneq_u32_zero:
190 ; CHECK: // %bb.0: // %entry
191 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
194 %.cast = bitcast <16 x i8> %c to <4 x i32>
195 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
196 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
197 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
198 %ret = add <2 x i32> %vdot1.i, %a
; By-element, zero-accumulator variant, unsigned, <4 x i32>, <16 x i8> %c:
; element 1 of %c (viewed as <4 x i32>) is splatted and fed to udot with a
; zeroinitializer first operand; %a is added to the result afterwards. The
; expected output begins directly with an indexed udot (v2.4b[1]) whose
; destination is v0.
202 define <4 x i32> @test_vdotq_laneq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
203 ; CHECK-LABEL: test_vdotq_laneq_u32_zero:
204 ; CHECK: // %bb.0: // %entry
205 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
208 %.cast = bitcast <16 x i8> %c to <4 x i32>
209 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
210 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
211 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
212 %ret = add <4 x i32> %vdot1.i, %a
; By-element form, signed, <2 x i32> result, <8 x i8> %c: %c is reinterpreted
; as <2 x i32>, element 1 is splatted by the shufflevector, and the splat is
; cast back to <8 x i8> for the third sdot operand. The expected output uses
; the indexed operand v2.4b[1], preceded by the register-kill annotation that
; redefines $d2 as $q2.
217 define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
218 ; CHECK-LABEL: test_vdot_lane_s32:
219 ; CHECK: // %bb.0: // %entry
220 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
221 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
224 %.cast = bitcast <8 x i8> %c to <2 x i32>
225 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
226 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
227 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
228 ret <2 x i32> %vdot1.i
; By-element form, signed, <4 x i32> result, <8 x i8> %c: %c is reinterpreted
; as <2 x i32>, element 1 is splatted to four lanes by the shufflevector, and
; the splat is cast to <16 x i8> for the third sdot operand. The expected
; output uses the indexed operand v2.4b[1], preceded by the register-kill
; annotation that redefines $d2 as $q2.
231 define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
232 ; CHECK-LABEL: test_vdotq_lane_s32:
233 ; CHECK: // %bb.0: // %entry
234 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
235 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
238 %.cast = bitcast <8 x i8> %c to <2 x i32>
239 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
240 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
241 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
242 ret <4 x i32> %vdot1.i
; By-element form, signed, <2 x i32> result, <16 x i8> %c: %c is reinterpreted
; as <4 x i32>, element 1 is splatted to two lanes by the shufflevector, and
; the splat is cast to <8 x i8> for the third sdot operand. The expected
; output begins with an sdot using the indexed operand v2.4b[1].
245 define <2 x i32> @test_vdot_laneq_s32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
246 ; CHECK-LABEL: test_vdot_laneq_s32:
247 ; CHECK: // %bb.0: // %entry
248 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
251 %.cast = bitcast <16 x i8> %c to <4 x i32>
252 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
253 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
254 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
255 ret <2 x i32> %vdot1.i
; By-element form, signed, <4 x i32> result, <16 x i8> %c: %c is reinterpreted
; as <4 x i32>, element 1 is splatted by the shufflevector, and the splat is
; cast back to <16 x i8> for the third sdot operand. The expected output
; begins with an sdot using the indexed operand v2.4b[1].
258 define <4 x i32> @test_vdotq_laneq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
259 ; CHECK-LABEL: test_vdotq_laneq_s32:
260 ; CHECK: // %bb.0: // %entry
261 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
264 %.cast = bitcast <16 x i8> %c to <4 x i32>
265 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
266 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
267 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
268 ret <4 x i32> %vdot1.i
; By-element, zero-accumulator variant, signed, <2 x i32>, <8 x i8> %c:
; element 1 of %c (viewed as <2 x i32>) is splatted and fed to sdot with a
; zeroinitializer first operand; %a is added to the result afterwards. The
; expected output has the $d2/$q2 register-kill annotation followed directly
; by an indexed sdot (v2.4b[1]) whose destination is v0.
272 define <2 x i32> @test_vdot_lane_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
273 ; CHECK-LABEL: test_vdot_lane_s32_zero:
274 ; CHECK: // %bb.0: // %entry
275 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
276 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
279 %.cast = bitcast <8 x i8> %c to <2 x i32>
280 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
281 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
282 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
283 %ret = add <2 x i32> %vdot1.i, %a
; By-element, zero-accumulator variant, signed, <4 x i32>, <8 x i8> %c:
; element 1 of %c (viewed as <2 x i32>) is splatted to four lanes and fed to
; sdot with a zeroinitializer first operand; %a is added to the result
; afterwards. The expected output has the $d2/$q2 register-kill annotation
; followed directly by an indexed sdot (v2.4b[1]) whose destination is v0.
287 define <4 x i32> @test_vdotq_lane_s32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
288 ; CHECK-LABEL: test_vdotq_lane_s32_zero:
289 ; CHECK: // %bb.0: // %entry
290 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
291 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
294 %.cast = bitcast <8 x i8> %c to <2 x i32>
295 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
296 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
297 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
298 %ret = add <4 x i32> %vdot1.i, %a
; By-element, zero-accumulator variant, signed, <2 x i32>, <16 x i8> %c:
; element 1 of %c (viewed as <4 x i32>) is splatted to two lanes and fed to
; sdot with a zeroinitializer first operand; %a is added to the result
; afterwards. The expected output begins directly with an indexed sdot
; (v2.4b[1]) whose destination is v0.
302 define <2 x i32> @test_vdot_laneq_s32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
303 ; CHECK-LABEL: test_vdot_laneq_s32_zero:
304 ; CHECK: // %bb.0: // %entry
305 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
308 %.cast = bitcast <16 x i8> %c to <4 x i32>
309 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
310 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
311 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
312 %ret = add <2 x i32> %vdot1.i, %a
; By-element, zero-accumulator variant, signed, <4 x i32>, <16 x i8> %c:
; element 1 of %c (viewed as <4 x i32>) is splatted and fed to sdot with a
; zeroinitializer first operand; %a is added to the result afterwards. The
; expected output begins directly with an indexed sdot (v2.4b[1]) whose
; destination is v0.
316 define <4 x i32> @test_vdotq_laneq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
317 ; CHECK-LABEL: test_vdotq_laneq_s32_zero:
318 ; CHECK: // %bb.0: // %entry
319 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
322 %.cast = bitcast <16 x i8> %c to <4 x i32>
323 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
324 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
325 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
326 %ret = add <4 x i32> %vdot1.i, %a