1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+dotprod < %s | FileCheck %s
3 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65 < %s | FileCheck %s
4 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65ae < %s | FileCheck %s
5 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-e1 < %s | FileCheck %s
6 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n1 < %s | FileCheck %s
7 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2 < %s | FileCheck %s
8 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1 < %s | FileCheck %s
9 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1a < %s | FileCheck %s
11 declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
12 declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
13 declare <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
14 declare <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
; 64-bit unsigned dot product: a direct call to the udot intrinsic with
; accumulator %a must be selected as a single UDOT on v0.
define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
; CHECK-LABEL: test_vdot_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot v0.2s, v1.8b, v2.8b
; CHECK-NEXT:    ret
entry:
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
  ret <2 x i32> %vdot1.i
}
; 128-bit unsigned dot product: a direct call to the udot intrinsic with
; accumulator %a must be selected as a single UDOT on v0.
define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
; CHECK-LABEL: test_vdotq_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot v0.4s, v1.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
  ret <4 x i32> %vdot1.i
}
; 64-bit signed dot product: a direct call to the sdot intrinsic with
; accumulator %a must be selected as a single SDOT on v0.
define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
; CHECK-LABEL: test_vdot_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.8b
; CHECK-NEXT:    ret
entry:
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
  ret <2 x i32> %vdot1.i
}
; 128-bit signed dot product: a direct call to the sdot intrinsic with
; accumulator %a must be selected as a single SDOT on v0.
define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
; CHECK-LABEL: test_vdotq_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
  ret <4 x i32> %vdot1.i
}
; Accumulator folding, 64-bit unsigned: the intrinsic is called with a zero
; accumulator and %a is added afterwards. The expected output is one UDOT that
; accumulates straight into v0, with no separate add.
define <2 x i32> @test_vdot_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
; CHECK-LABEL: test_vdot_u32_zero:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot v0.2s, v1.8b, v2.8b
; CHECK-NEXT:    ret
entry:
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}
; Accumulator folding, 128-bit unsigned: the intrinsic is called with a zero
; accumulator and %a is added afterwards. The expected output is one UDOT that
; accumulates straight into v0, with no separate add.
define <4 x i32> @test_vdotq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
; CHECK-LABEL: test_vdotq_u32_zero:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot v0.4s, v1.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
  %ret = add <4 x i32> %vdot1.i, %a
  ret <4 x i32> %ret
}
; Accumulator folding, 64-bit signed: the intrinsic is called with a zero
; accumulator and %a is added afterwards. The expected output is one SDOT that
; accumulates straight into v0, with no separate add.
define <2 x i32> @test_vdot_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
; CHECK-LABEL: test_vdot_s32_zero:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.8b
; CHECK-NEXT:    ret
entry:
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}
; Accumulator folding, 128-bit signed: the intrinsic is called with a zero
; accumulator and %a is added afterwards. The expected output is one SDOT that
; accumulates straight into v0, with no separate add.
define <4 x i32> @test_vdotq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
; CHECK-LABEL: test_vdotq_s32_zero:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
  %ret = add <4 x i32> %vdot1.i, %a
  ret <4 x i32> %ret
}
; Lane-indexed unsigned dot product, 64-bit result, 64-bit lane source.
; %c is viewed as <2 x i32>, element 1 is splatted and cast back to bytes; the
; splat must fold into the by-element UDOT form (v2.4b[1]). The expected output
; also carries a kill annotation promoting $d2 to $q2 for the indexed operand.
define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdot_lane_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    udot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}
; Lane-indexed unsigned dot product, 128-bit result, 64-bit lane source.
; %c is viewed as <2 x i32>, element 1 is splatted to four elements and cast
; back to bytes; the splat must fold into the by-element UDOT form (v2.4b[1]).
; The expected output also carries a kill annotation promoting $d2 to $q2.
define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdotq_lane_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    udot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}
; Lane-indexed unsigned dot product, 64-bit result, 128-bit lane source.
; %c is viewed as <4 x i32>, element 1 is splatted to two elements and cast
; back to bytes; the splat must fold into the by-element UDOT form (v2.4b[1]).
define <2 x i32> @test_vdot_laneq_u32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdot_laneq_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}
; Lane-indexed unsigned dot product, 128-bit result, 128-bit lane source.
; %c is viewed as <4 x i32>, element 1 is splatted and cast back to bytes; the
; splat must fold into the by-element UDOT form (v2.4b[1]).
define <4 x i32> @test_vdotq_laneq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdotq_laneq_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}
; Lane-indexed unsigned dot product with accumulator folding, 64-bit result,
; 64-bit lane source. The intrinsic gets a zero accumulator and %a is added
; afterwards; the expected output is one by-element UDOT accumulating into v0.
define <2 x i32> @test_vdot_lane_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdot_lane_u32_zero:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    udot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}
; Lane-indexed unsigned dot product with accumulator folding, 128-bit result,
; 64-bit lane source. The intrinsic gets a zero accumulator and %a is added
; afterwards; the expected output is one by-element UDOT accumulating into v0.
define <4 x i32> @test_vdotq_lane_u32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdotq_lane_u32_zero:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    udot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
  %ret = add <4 x i32> %vdot1.i, %a
  ret <4 x i32> %ret
}
; Lane-indexed unsigned dot product with accumulator folding, 64-bit result,
; 128-bit lane source. The intrinsic gets a zero accumulator and %a is added
; afterwards; the expected output is one by-element UDOT accumulating into v0.
define <2 x i32> @test_vdot_laneq_u32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdot_laneq_u32_zero:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}
; Lane-indexed unsigned dot product with accumulator folding, 128-bit result,
; 128-bit lane source. The intrinsic gets a zero accumulator and %a is added
; afterwards; the expected output is one by-element UDOT accumulating into v0.
define <4 x i32> @test_vdotq_laneq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdotq_laneq_u32_zero:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
  %ret = add <4 x i32> %vdot1.i, %a
  ret <4 x i32> %ret
}
; Lane-indexed signed dot product, 64-bit result, 64-bit lane source.
; %c is viewed as <2 x i32>, element 1 is splatted and cast back to bytes; the
; splat must fold into the by-element SDOT form (v2.4b[1]). The expected output
; also carries a kill annotation promoting $d2 to $q2 for the indexed operand.
define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdot_lane_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}
; Lane-indexed signed dot product, 128-bit result, 64-bit lane source.
; %c is viewed as <2 x i32>, element 1 is splatted to four elements and cast
; back to bytes; the splat must fold into the by-element SDOT form (v2.4b[1]).
; The expected output also carries a kill annotation promoting $d2 to $q2.
define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdotq_lane_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}
; Lane-indexed signed dot product, 64-bit result, 128-bit lane source.
; %c is viewed as <4 x i32>, element 1 is splatted to two elements and cast
; back to bytes; the splat must fold into the by-element SDOT form (v2.4b[1]).
define <2 x i32> @test_vdot_laneq_s32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdot_laneq_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
  ret <2 x i32> %vdot1.i
}
; Lane-indexed signed dot product, 128-bit result, 128-bit lane source.
; %c is viewed as <4 x i32>, element 1 is splatted and cast back to bytes; the
; splat must fold into the by-element SDOT form (v2.4b[1]).
define <4 x i32> @test_vdotq_laneq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdotq_laneq_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
  ret <4 x i32> %vdot1.i
}
; Lane-indexed signed dot product with accumulator folding, 64-bit result,
; 64-bit lane source. The intrinsic gets a zero accumulator and %a is added
; afterwards; the expected output is one by-element SDOT accumulating into v0.
define <2 x i32> @test_vdot_lane_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdot_lane_s32_zero:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}
; Lane-indexed signed dot product with accumulator folding, 128-bit result,
; 64-bit lane source. The intrinsic gets a zero accumulator and %a is added
; afterwards; the expected output is one by-element SDOT accumulating into v0.
define <4 x i32> @test_vdotq_lane_s32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vdotq_lane_s32_zero:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <8 x i8> %c to <2 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
  %ret = add <4 x i32> %vdot1.i, %a
  ret <4 x i32> %ret
}
; Lane-indexed signed dot product with accumulator folding, 64-bit result,
; 128-bit lane source. The intrinsic gets a zero accumulator and %a is added
; afterwards; the expected output is one by-element SDOT accumulating into v0.
define <2 x i32> @test_vdot_laneq_s32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vdot_laneq_s32_zero:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.4b[1]
; CHECK-NEXT:    ret
entry:
  %.cast = bitcast <16 x i8> %c to <4 x i32>
  ; The mask only selects from the first operand, so the second is a poison placeholder.
  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
  %ret = add <2 x i32> %vdot1.i, %a
  ret <2 x i32> %ret
}
315 define <4 x i32> @test_vdotq_laneq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
316 ; CHECK-LABEL: test_vdotq_laneq_s32_zero:
317 ; CHECK: // %bb.0: // %entry
318 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
321 %.cast = bitcast <16 x i8> %c to <4 x i32>
322 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
323 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
324 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
325 %ret = add <4 x i32> %vdot1.i, %a