1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+dotprod < %s | FileCheck %s
3 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65 < %s | FileCheck %s
4 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65ae < %s | FileCheck %s
5 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-e1 < %s | FileCheck %s
6 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n1 < %s | FileCheck %s
7 ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2 < %s | FileCheck %s
; Declarations of the NEON dot-product intrinsics called by the tests in this
; file: unsigned (udot) and signed (sdot), each in a form with a <2 x i32>
; accumulator and <8 x i8> inputs, and a form with a <4 x i32> accumulator and
; <16 x i8> inputs.
9 declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
10 declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
11 declare <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
12 declare <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
; Passes %a, %b and %c unchanged to the <2 x i32> udot intrinsic and returns
; the result. Expected assembly: a single "udot v0.2s, v1.8b, v2.8b".
14 define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
15 ; CHECK-LABEL: test_vdot_u32:
16 ; CHECK: // %bb.0: // %entry
17 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.8b
20 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
21 ret <2 x i32> %vdot1.i
; Passes %a, %b and %c unchanged to the <4 x i32> udot intrinsic and returns
; the result. Expected assembly: a single "udot v0.4s, v1.16b, v2.16b".
24 define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
25 ; CHECK-LABEL: test_vdotq_u32:
26 ; CHECK: // %bb.0: // %entry
27 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.16b
30 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
31 ret <4 x i32> %vdot1.i
; Passes %a, %b and %c unchanged to the <2 x i32> sdot intrinsic and returns
; the result. Expected assembly: a single "sdot v0.2s, v1.8b, v2.8b".
34 define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
35 ; CHECK-LABEL: test_vdot_s32:
36 ; CHECK: // %bb.0: // %entry
37 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.8b
40 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
41 ret <2 x i32> %vdot1.i
; Passes %a, %b and %c unchanged to the <4 x i32> sdot intrinsic and returns
; the result. Expected assembly: a single "sdot v0.4s, v1.16b, v2.16b".
44 define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
45 ; CHECK-LABEL: test_vdotq_s32:
46 ; CHECK: // %bb.0: // %entry
47 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.16b
50 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
51 ret <4 x i32> %vdot1.i
; Calls the <2 x i32> udot intrinsic with a zeroinitializer accumulator, then
; adds %a to the result with a separate IR add. The visible expected assembly
; is one "udot v0.2s, v1.8b, v2.8b" directly after the block label, so the add
; is evidently expected to be folded into the udot accumulator.
55 define <2 x i32> @test_vdot_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
56 ; CHECK-LABEL: test_vdot_u32_zero:
57 ; CHECK: // %bb.0: // %entry
58 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.8b
61 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
62 %ret = add <2 x i32> %vdot1.i, %a
; Calls the <4 x i32> udot intrinsic with a zeroinitializer accumulator, then
; adds %a to the result with a separate IR add. The visible expected assembly
; is one "udot v0.4s, v1.16b, v2.16b" directly after the block label, so the
; add is evidently expected to be folded into the udot accumulator.
66 define <4 x i32> @test_vdotq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
67 ; CHECK-LABEL: test_vdotq_u32_zero:
68 ; CHECK: // %bb.0: // %entry
69 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.16b
72 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
73 %ret = add <4 x i32> %vdot1.i, %a
; Calls the <2 x i32> sdot intrinsic with a zeroinitializer accumulator, then
; adds %a to the result with a separate IR add. The visible expected assembly
; is one "sdot v0.2s, v1.8b, v2.8b" directly after the block label, so the add
; is evidently expected to be folded into the sdot accumulator.
77 define <2 x i32> @test_vdot_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
78 ; CHECK-LABEL: test_vdot_s32_zero:
79 ; CHECK: // %bb.0: // %entry
80 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.8b
83 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
84 %ret = add <2 x i32> %vdot1.i, %a
; Calls the <4 x i32> sdot intrinsic with a zeroinitializer accumulator, then
; adds %a to the result with a separate IR add. The visible expected assembly
; is one "sdot v0.4s, v1.16b, v2.16b" directly after the block label, so the
; add is evidently expected to be folded into the sdot accumulator.
88 define <4 x i32> @test_vdotq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
89 ; CHECK-LABEL: test_vdotq_s32_zero:
90 ; CHECK: // %bb.0: // %entry
91 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.16b
94 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
95 %ret = add <4 x i32> %vdot1.i, %a
; Reinterprets the <8 x i8> %c as <2 x i32>, broadcasts element 1 to both
; lanes with a shufflevector, casts back to <8 x i8> and uses that as the last
; operand of the <2 x i32> udot intrinsic. Expected assembly: the by-element
; form "udot v0.2s, v1.8b, v2.4b[1]", preceded by a kill annotation line for
; $d2/$q2.
100 define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
101 ; CHECK-LABEL: test_vdot_lane_u32:
102 ; CHECK: // %bb.0: // %entry
103 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
104 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
107 %.cast = bitcast <8 x i8> %c to <2 x i32>
108 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
109 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
110 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
111 ret <2 x i32> %vdot1.i
; Reinterprets the <8 x i8> %c as <2 x i32>, broadcasts element 1 into all
; four lanes of a <4 x i32> with a shufflevector, casts that to <16 x i8> and
; uses it as the last operand of the <4 x i32> udot intrinsic. Expected
; assembly: the by-element form "udot v0.4s, v1.16b, v2.4b[1]", preceded by a
; kill annotation line for $d2/$q2.
114 define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
115 ; CHECK-LABEL: test_vdotq_lane_u32:
116 ; CHECK: // %bb.0: // %entry
117 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
118 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
121 %.cast = bitcast <8 x i8> %c to <2 x i32>
122 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
123 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
124 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
125 ret <4 x i32> %vdot1.i
; Reinterprets the <16 x i8> %c as <4 x i32>, selects element 1 into both
; lanes of a <2 x i32> with a shufflevector, casts that to <8 x i8> and uses
; it as the last operand of the <2 x i32> udot intrinsic. Expected assembly:
; the by-element form "udot v0.2s, v1.8b, v2.4b[1]".
128 define <2 x i32> @test_vdot_laneq_u32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
129 ; CHECK-LABEL: test_vdot_laneq_u32:
130 ; CHECK: // %bb.0: // %entry
131 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
134 %.cast = bitcast <16 x i8> %c to <4 x i32>
135 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
136 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
137 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
138 ret <2 x i32> %vdot1.i
; Reinterprets the <16 x i8> %c as <4 x i32>, broadcasts element 1 to all four
; lanes with a shufflevector, casts back to <16 x i8> and uses that as the
; last operand of the <4 x i32> udot intrinsic. Expected assembly: the
; by-element form "udot v0.4s, v1.16b, v2.4b[1]".
141 define <4 x i32> @test_vdotq_laneq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
142 ; CHECK-LABEL: test_vdotq_laneq_u32:
143 ; CHECK: // %bb.0: // %entry
144 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
147 %.cast = bitcast <16 x i8> %c to <4 x i32>
148 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
149 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
150 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
151 ret <4 x i32> %vdot1.i
; Broadcasts 32-bit element 1 of the <8 x i8> %c (via bitcast, shufflevector,
; bitcast), calls the <2 x i32> udot intrinsic with a zeroinitializer
; accumulator and that broadcast as the last operand, then adds %a with a
; separate IR add. The visible expected assembly is a kill annotation line for
; $d2/$q2 followed by one "udot v0.2s, v1.8b, v2.4b[1]", so the add is
; evidently expected to be folded into the udot accumulator.
155 define <2 x i32> @test_vdot_lane_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
156 ; CHECK-LABEL: test_vdot_lane_u32_zero:
157 ; CHECK: // %bb.0: // %entry
158 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
159 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
162 %.cast = bitcast <8 x i8> %c to <2 x i32>
163 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
164 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
165 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
166 %ret = add <2 x i32> %vdot1.i, %a
; Broadcasts 32-bit element 1 of the <8 x i8> %c into a <4 x i32> (via
; bitcast, shufflevector, bitcast to <16 x i8>), calls the <4 x i32> udot
; intrinsic with a zeroinitializer accumulator and that broadcast as the last
; operand, then adds %a with a separate IR add. The visible expected assembly
; is a kill annotation line for $d2/$q2 followed by one
; "udot v0.4s, v1.16b, v2.4b[1]", so the add is evidently expected to be
; folded into the udot accumulator.
170 define <4 x i32> @test_vdotq_lane_u32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
171 ; CHECK-LABEL: test_vdotq_lane_u32_zero:
172 ; CHECK: // %bb.0: // %entry
173 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
174 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
177 %.cast = bitcast <8 x i8> %c to <2 x i32>
178 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
179 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
180 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
181 %ret = add <4 x i32> %vdot1.i, %a
; Selects 32-bit element 1 of the <16 x i8> %c into both lanes of a <2 x i32>
; (via bitcast, shufflevector, bitcast to <8 x i8>), calls the <2 x i32> udot
; intrinsic with a zeroinitializer accumulator and that value as the last
; operand, then adds %a with a separate IR add. The visible expected assembly
; is one "udot v0.2s, v1.8b, v2.4b[1]", so the add is evidently expected to
; be folded into the udot accumulator.
185 define <2 x i32> @test_vdot_laneq_u32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
186 ; CHECK-LABEL: test_vdot_laneq_u32_zero:
187 ; CHECK: // %bb.0: // %entry
188 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.4b[1]
191 %.cast = bitcast <16 x i8> %c to <4 x i32>
192 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
193 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
194 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
195 %ret = add <2 x i32> %vdot1.i, %a
; Broadcasts 32-bit element 1 of the <16 x i8> %c to all four lanes (via
; bitcast, shufflevector, bitcast), calls the <4 x i32> udot intrinsic with a
; zeroinitializer accumulator and that broadcast as the last operand, then
; adds %a with a separate IR add. The visible expected assembly is one
; "udot v0.4s, v1.16b, v2.4b[1]", so the add is evidently expected to be
; folded into the udot accumulator.
199 define <4 x i32> @test_vdotq_laneq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
200 ; CHECK-LABEL: test_vdotq_laneq_u32_zero:
201 ; CHECK: // %bb.0: // %entry
202 ; CHECK-NEXT: udot v0.4s, v1.16b, v2.4b[1]
205 %.cast = bitcast <16 x i8> %c to <4 x i32>
206 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
207 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
208 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
209 %ret = add <4 x i32> %vdot1.i, %a
; Reinterprets the <8 x i8> %c as <2 x i32>, broadcasts element 1 to both
; lanes with a shufflevector, casts back to <8 x i8> and uses that as the last
; operand of the <2 x i32> sdot intrinsic. Expected assembly: the by-element
; form "sdot v0.2s, v1.8b, v2.4b[1]", preceded by a kill annotation line for
; $d2/$q2.
214 define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
215 ; CHECK-LABEL: test_vdot_lane_s32:
216 ; CHECK: // %bb.0: // %entry
217 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
218 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
221 %.cast = bitcast <8 x i8> %c to <2 x i32>
222 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
223 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
224 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
225 ret <2 x i32> %vdot1.i
; Reinterprets the <8 x i8> %c as <2 x i32>, broadcasts element 1 into all
; four lanes of a <4 x i32> with a shufflevector, casts that to <16 x i8> and
; uses it as the last operand of the <4 x i32> sdot intrinsic. Expected
; assembly: the by-element form "sdot v0.4s, v1.16b, v2.4b[1]", preceded by a
; kill annotation line for $d2/$q2.
228 define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
229 ; CHECK-LABEL: test_vdotq_lane_s32:
230 ; CHECK: // %bb.0: // %entry
231 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
232 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
235 %.cast = bitcast <8 x i8> %c to <2 x i32>
236 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
237 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
238 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
239 ret <4 x i32> %vdot1.i
; Reinterprets the <16 x i8> %c as <4 x i32>, selects element 1 into both
; lanes of a <2 x i32> with a shufflevector, casts that to <8 x i8> and uses
; it as the last operand of the <2 x i32> sdot intrinsic. Expected assembly:
; the by-element form "sdot v0.2s, v1.8b, v2.4b[1]".
242 define <2 x i32> @test_vdot_laneq_s32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
243 ; CHECK-LABEL: test_vdot_laneq_s32:
244 ; CHECK: // %bb.0: // %entry
245 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
248 %.cast = bitcast <16 x i8> %c to <4 x i32>
249 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
250 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
251 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
252 ret <2 x i32> %vdot1.i
; Reinterprets the <16 x i8> %c as <4 x i32>, broadcasts element 1 to all four
; lanes with a shufflevector, casts back to <16 x i8> and uses that as the
; last operand of the <4 x i32> sdot intrinsic. Expected assembly: the
; by-element form "sdot v0.4s, v1.16b, v2.4b[1]".
255 define <4 x i32> @test_vdotq_laneq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
256 ; CHECK-LABEL: test_vdotq_laneq_s32:
257 ; CHECK: // %bb.0: // %entry
258 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
261 %.cast = bitcast <16 x i8> %c to <4 x i32>
262 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
263 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
264 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
265 ret <4 x i32> %vdot1.i
; Broadcasts 32-bit element 1 of the <8 x i8> %c (via bitcast, shufflevector,
; bitcast), calls the <2 x i32> sdot intrinsic with a zeroinitializer
; accumulator and that broadcast as the last operand, then adds %a with a
; separate IR add. The visible expected assembly is a kill annotation line for
; $d2/$q2 followed by one "sdot v0.2s, v1.8b, v2.4b[1]", so the add is
; evidently expected to be folded into the sdot accumulator.
269 define <2 x i32> @test_vdot_lane_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
270 ; CHECK-LABEL: test_vdot_lane_s32_zero:
271 ; CHECK: // %bb.0: // %entry
272 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
273 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
276 %.cast = bitcast <8 x i8> %c to <2 x i32>
277 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
278 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
279 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
280 %ret = add <2 x i32> %vdot1.i, %a
; Broadcasts 32-bit element 1 of the <8 x i8> %c into a <4 x i32> (via
; bitcast, shufflevector, bitcast to <16 x i8>), calls the <4 x i32> sdot
; intrinsic with a zeroinitializer accumulator and that broadcast as the last
; operand, then adds %a with a separate IR add. The visible expected assembly
; is a kill annotation line for $d2/$q2 followed by one
; "sdot v0.4s, v1.16b, v2.4b[1]", so the add is evidently expected to be
; folded into the sdot accumulator.
284 define <4 x i32> @test_vdotq_lane_s32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
285 ; CHECK-LABEL: test_vdotq_lane_s32_zero:
286 ; CHECK: // %bb.0: // %entry
287 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
288 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
291 %.cast = bitcast <8 x i8> %c to <2 x i32>
292 %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
293 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
294 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
295 %ret = add <4 x i32> %vdot1.i, %a
; Selects 32-bit element 1 of the <16 x i8> %c into both lanes of a <2 x i32>
; (via bitcast, shufflevector, bitcast to <8 x i8>), calls the <2 x i32> sdot
; intrinsic with a zeroinitializer accumulator and that value as the last
; operand, then adds %a with a separate IR add. The visible expected assembly
; is one "sdot v0.2s, v1.8b, v2.4b[1]", so the add is evidently expected to
; be folded into the sdot accumulator.
299 define <2 x i32> @test_vdot_laneq_s32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
300 ; CHECK-LABEL: test_vdot_laneq_s32_zero:
301 ; CHECK: // %bb.0: // %entry
302 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.4b[1]
305 %.cast = bitcast <16 x i8> %c to <4 x i32>
306 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
307 %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
308 %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
309 %ret = add <2 x i32> %vdot1.i, %a
; Broadcasts 32-bit element 1 of the <16 x i8> %c to all four lanes (via
; bitcast, shufflevector, bitcast), calls the <4 x i32> sdot intrinsic with a
; zeroinitializer accumulator and that broadcast as the last operand, then
; adds %a with a separate IR add. The visible expected assembly is one
; "sdot v0.4s, v1.16b, v2.4b[1]", so the add is evidently expected to be
; folded into the sdot accumulator.
313 define <4 x i32> @test_vdotq_laneq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
314 ; CHECK-LABEL: test_vdotq_laneq_s32_zero:
315 ; CHECK: // %bb.0: // %entry
316 ; CHECK-NEXT: sdot v0.4s, v1.16b, v2.4b[1]
319 %.cast = bitcast <16 x i8> %c to <4 x i32>
320 %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
321 %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
322 %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
323 %ret = add <4 x i32> %vdot1.i, %a