; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 -asm-verbose=0 < %s | FileCheck %s
; BFDOT (vectors): the unindexed bfdot intrinsic is expected to select one
; BFDOT with a .s accumulator and .h sources, immediately followed by the
; return (no extra moves or copies).
define <vscale x 4 x float> @bfdot_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_f32:
; CHECK-NEXT: bfdot z0.s, z1.h, z2.h
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}
; BFDOT (indexed): an immediate lane operand of 0 is expected to be printed
; as the index on the last source register, z2.h[0].
define <vscale x 4 x float> @bfdot_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_0_f32:
; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[0]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}
; BFDOT (indexed): an immediate lane operand of 1 is expected to be printed
; as the index on the last source register, z2.h[1].
define <vscale x 4 x float> @bfdot_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_1_f32:
; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[1]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}
; BFDOT (indexed): an immediate lane operand of 2 is expected to be printed
; as the index on the last source register, z2.h[2].
define <vscale x 4 x float> @bfdot_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_2_f32:
; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[2]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}
; BFDOT (indexed): an immediate lane operand of 3 is expected to be printed
; as the index on the last source register, z2.h[3]. This is the largest
; BFDOT lane index exercised by the tests visible here.
define <vscale x 4 x float> @bfdot_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_3_f32:
; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[3]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}
; BFMLALB (vectors): the unindexed bfmlalb intrinsic is expected to select
; one BFMLALB with a .s accumulator and .h sources, immediately followed by
; the return.
define <vscale x 4 x float> @bfmlalb_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}
; BFMLALB (indexed): an immediate lane operand of 0 is expected to be printed
; as the index on the last source register, z2.h[0].
define <vscale x 4 x float> @bfmlalb_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_0_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[0]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}
; BFMLALB (indexed): an immediate lane operand of 1 is expected to be printed
; as the index on the last source register, z2.h[1].
define <vscale x 4 x float> @bfmlalb_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_1_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[1]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}
; BFMLALB (indexed): an immediate lane operand of 2 is expected to be printed
; as the index on the last source register, z2.h[2].
define <vscale x 4 x float> @bfmlalb_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_2_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[2]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}
; BFMLALB (indexed): an immediate lane operand of 3 is expected to be printed
; as the index on the last source register, z2.h[3].
define <vscale x 4 x float> @bfmlalb_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_3_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[3]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}
; BFMLALB (indexed): an immediate lane operand of 4 is expected to be printed
; as the index on the last source register, z2.h[4].
define <vscale x 4 x float> @bfmlalb_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_4_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[4]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
  ret <vscale x 4 x float> %out
}
; BFMLALB (indexed): an immediate lane operand of 5 is expected to be printed
; as the index on the last source register, z2.h[5].
define <vscale x 4 x float> @bfmlalb_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_5_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[5]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
  ret <vscale x 4 x float> %out
}
; BFMLALB (indexed): an immediate lane operand of 6 is expected to be printed
; as the index on the last source register, z2.h[6].
define <vscale x 4 x float> @bfmlalb_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_6_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[6]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
  ret <vscale x 4 x float> %out
}
; BFMLALB (indexed): an immediate lane operand of 7 is expected to be printed
; as the index on the last source register, z2.h[7]. This is the largest
; BFMLALB lane index exercised by the tests visible here.
define <vscale x 4 x float> @bfmlalb_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_7_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[7]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
  ret <vscale x 4 x float> %out
}
; BFMLALT (vectors): the unindexed bfmlalt intrinsic is expected to select
; one BFMLALT with a .s accumulator and .h sources, immediately followed by
; the return.
define <vscale x 4 x float> @bfmlalt_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}
; BFMLALT (indexed): an immediate lane operand of 0 is expected to be printed
; as the index on the last source register, z2.h[0].
define <vscale x 4 x float> @bfmlalt_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_0_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[0]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}
; BFMLALT (indexed): an immediate lane operand of 1 is expected to be printed
; as the index on the last source register, z2.h[1].
define <vscale x 4 x float> @bfmlalt_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_1_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[1]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}
; BFMLALT (indexed): an immediate lane operand of 2 is expected to be printed
; as the index on the last source register, z2.h[2].
define <vscale x 4 x float> @bfmlalt_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_2_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[2]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}
; BFMLALT (indexed): an immediate lane operand of 3 is expected to be printed
; as the index on the last source register, z2.h[3].
define <vscale x 4 x float> @bfmlalt_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_3_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[3]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}
; BFMLALT (indexed): an immediate lane operand of 4 is expected to be printed
; as the index on the last source register, z2.h[4].
define <vscale x 4 x float> @bfmlalt_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_4_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[4]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
  ret <vscale x 4 x float> %out
}
; BFMLALT (indexed): an immediate lane operand of 5 is expected to be printed
; as the index on the last source register, z2.h[5].
define <vscale x 4 x float> @bfmlalt_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_5_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[5]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
  ret <vscale x 4 x float> %out
}
; BFMLALT (indexed): an immediate lane operand of 6 is expected to be printed
; as the index on the last source register, z2.h[6].
define <vscale x 4 x float> @bfmlalt_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_6_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[6]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
  ret <vscale x 4 x float> %out
}
; BFMLALT (indexed): an immediate lane operand of 7 is expected to be printed
; as the index on the last source register, z2.h[7]. This is the largest
; BFMLALT lane index exercised by the tests visible here.
define <vscale x 4 x float> @bfmlalt_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_7_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[7]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
  ret <vscale x 4 x float> %out
}
; BFMMLA: the bfmmla intrinsic is expected to select one BFMMLA with a .s
; accumulator and .h sources, immediately followed by the return.
define <vscale x 4 x float> @bfmmla_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmmla_f32:
; CHECK-NEXT: bfmmla z0.s, z1.h, z2.h
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}
; BFCVT: the fcvt.bf16f32 intrinsic (bfloat passthru %a, <vscale x 8 x i1>
; predicate %pg, float source %b) is expected to select one predicated BFCVT
; writing z0.h under p0/m from z1.s, immediately followed by the return.
define <vscale x 8 x bfloat> @fcvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
; CHECK-LABEL: fcvt_bf16_f32:
; CHECK-NEXT: bfcvt z0.h, p0/m, z1.s
; CHECK-NEXT: ret
  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
  ret <vscale x 8 x bfloat> %out
}
; BFCVTNT: the fcvtnt.bf16f32 intrinsic (bfloat passthru %a,
; <vscale x 8 x i1> predicate %pg, float source %b) is expected to select one
; predicated BFCVTNT writing z0.h under p0/m from z1.s, immediately followed
; by the return.
define <vscale x 8 x bfloat> @fcvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
; CHECK-LABEL: fcvtnt_bf16_f32:
; CHECK-NEXT: bfcvtnt z0.h, p0/m, z1.s
; CHECK-NEXT: ret
  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
  ret <vscale x 8 x bfloat> %out
}
; Declarations of the SVE BFloat16 intrinsics called by the test functions in
; this file. The dot-product / multiply-accumulate intrinsics take a float
; accumulator and two bfloat vectors; their .lane variants take the lane
; index as a trailing i64 operand. The two conversion intrinsics take a
; bfloat passthru, an i1 predicate vector and a float source.
declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)