1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2 ; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16
3 ; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=CHECK,B16B16
4 ; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16
5 ; RUN: llc -mattr=+sme,+sve-b16b16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,B16B16
7 target triple = "aarch64-unknown-linux-gnu"
; llvm.fabs on <vscale x 2 x bfloat>. The checks use the prefix shared by all
; RUN lines and expect a single AND of the .h lanes with #0x7fff.
13 define <vscale x 2 x bfloat> @fabs_nxv2bf16(<vscale x 2 x bfloat> %a) {
14 ; CHECK-LABEL: fabs_nxv2bf16:
16 ; CHECK-NEXT: and z0.h, z0.h, #0x7fff
18 %res = call <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat> %a)
19 ret <vscale x 2 x bfloat> %res
; llvm.fabs on <vscale x 4 x bfloat>. Same expectation as the other element
; counts: one AND of the .h lanes with #0x7fff, in every RUN configuration.
22 define <vscale x 4 x bfloat> @fabs_nxv4bf16(<vscale x 4 x bfloat> %a) {
23 ; CHECK-LABEL: fabs_nxv4bf16:
25 ; CHECK-NEXT: and z0.h, z0.h, #0x7fff
27 %res = call <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat> %a)
28 ret <vscale x 4 x bfloat> %res
; llvm.fabs on <vscale x 8 x bfloat>. Expected to be one AND of the .h lanes
; with #0x7fff in every RUN configuration.
31 define <vscale x 8 x bfloat> @fabs_nxv8bf16(<vscale x 8 x bfloat> %a) {
32 ; CHECK-LABEL: fabs_nxv8bf16:
34 ; CHECK-NEXT: and z0.h, z0.h, #0x7fff
36 %res = call <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat> %a)
37 ret <vscale x 8 x bfloat> %res
; fadd on <vscale x 2 x bfloat>.
; Runs without +sve-b16b16 (NOB16B16 prefix) expect both operands shifted left
; by 16 in their .s lanes, a FADD predicated on "ptrue p0.d", then a BFCVT.
; Runs with +sve-b16b16 (B16B16 prefix) expect a single unpredicated BFADD.
44 define <vscale x 2 x bfloat> @fadd_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
45 ; NOB16B16-LABEL: fadd_nxv2bf16:
47 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
48 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
49 ; NOB16B16-NEXT: ptrue p0.d
50 ; NOB16B16-NEXT: fadd z0.s, p0/m, z0.s, z1.s
51 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
54 ; B16B16-LABEL: fadd_nxv2bf16:
56 ; B16B16-NEXT: bfadd z0.h, z0.h, z1.h
58 %res = fadd <vscale x 2 x bfloat> %a, %b
59 ret <vscale x 2 x bfloat> %res
; fadd on <vscale x 4 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an unpredicated .s FADD, and a
; BFCVT governed by "ptrue p0.s" (the predicate is only used by the BFCVT).
; B16B16 runs expect a single unpredicated BFADD.
62 define <vscale x 4 x bfloat> @fadd_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
63 ; NOB16B16-LABEL: fadd_nxv4bf16:
65 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
66 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
67 ; NOB16B16-NEXT: ptrue p0.s
68 ; NOB16B16-NEXT: fadd z0.s, z0.s, z1.s
69 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
72 ; B16B16-LABEL: fadd_nxv4bf16:
74 ; B16B16-NEXT: bfadd z0.h, z0.h, z1.h
76 %res = fadd <vscale x 4 x bfloat> %a, %b
77 ret <vscale x 4 x bfloat> %res
; fadd on <vscale x 8 x bfloat>.
; NOB16B16 runs expect each operand split with UUNPKHI/UUNPKLO, every half
; shifted left by 16, two unpredicated .s FADDs, two BFCVTs and a final UZP1
; that recombines the halves.
; B16B16 runs expect a single unpredicated BFADD.
80 define <vscale x 8 x bfloat> @fadd_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
81 ; NOB16B16-LABEL: fadd_nxv8bf16:
83 ; NOB16B16-NEXT: uunpkhi z2.s, z1.h
84 ; NOB16B16-NEXT: uunpkhi z3.s, z0.h
85 ; NOB16B16-NEXT: uunpklo z1.s, z1.h
86 ; NOB16B16-NEXT: uunpklo z0.s, z0.h
87 ; NOB16B16-NEXT: ptrue p0.s
88 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16
89 ; NOB16B16-NEXT: lsl z3.s, z3.s, #16
90 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
91 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
92 ; NOB16B16-NEXT: fadd z2.s, z3.s, z2.s
93 ; NOB16B16-NEXT: fadd z0.s, z0.s, z1.s
94 ; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
95 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
96 ; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
99 ; B16B16-LABEL: fadd_nxv8bf16:
101 ; B16B16-NEXT: bfadd z0.h, z0.h, z1.h
103 %res = fadd <vscale x 8 x bfloat> %a, %b
104 ret <vscale x 8 x bfloat> %res
; fdiv on <vscale x 2 x bfloat>. The checks use the prefix shared by all RUN
; lines, so the same sequence is expected with and without +sve-b16b16:
; LSL #16 on both operands, an FDIV predicated on "ptrue p0.d", then a BFCVT.
111 define <vscale x 2 x bfloat> @fdiv_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
112 ; CHECK-LABEL: fdiv_nxv2bf16:
114 ; CHECK-NEXT: lsl z1.s, z1.s, #16
115 ; CHECK-NEXT: lsl z0.s, z0.s, #16
116 ; CHECK-NEXT: ptrue p0.d
117 ; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
118 ; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
120 %res = fdiv <vscale x 2 x bfloat> %a, %b
121 ret <vscale x 2 x bfloat> %res
; fdiv on <vscale x 4 x bfloat>. Same sequence expected in every RUN
; configuration: LSL #16 on both operands, an FDIV predicated on "ptrue p0.s",
; then a BFCVT.
124 define <vscale x 4 x bfloat> @fdiv_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
125 ; CHECK-LABEL: fdiv_nxv4bf16:
127 ; CHECK-NEXT: lsl z1.s, z1.s, #16
128 ; CHECK-NEXT: lsl z0.s, z0.s, #16
129 ; CHECK-NEXT: ptrue p0.s
130 ; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
131 ; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
133 %res = fdiv <vscale x 4 x bfloat> %a, %b
134 ret <vscale x 4 x bfloat> %res
; fdiv on <vscale x 8 x bfloat>. Same sequence expected in every RUN
; configuration: operands split with UUNPKHI/UUNPKLO and shifted left by 16,
; the high halves divided with FDIVR (destination z2 was unpacked from z1, so
; the reversed form is used) and the low halves with FDIV, then two BFCVTs and
; a UZP1 to recombine.
137 define <vscale x 8 x bfloat> @fdiv_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
138 ; CHECK-LABEL: fdiv_nxv8bf16:
140 ; CHECK-NEXT: uunpkhi z2.s, z1.h
141 ; CHECK-NEXT: uunpkhi z3.s, z0.h
142 ; CHECK-NEXT: uunpklo z1.s, z1.h
143 ; CHECK-NEXT: uunpklo z0.s, z0.h
144 ; CHECK-NEXT: ptrue p0.s
145 ; CHECK-NEXT: lsl z2.s, z2.s, #16
146 ; CHECK-NEXT: lsl z3.s, z3.s, #16
147 ; CHECK-NEXT: lsl z1.s, z1.s, #16
148 ; CHECK-NEXT: lsl z0.s, z0.s, #16
149 ; CHECK-NEXT: fdivr z2.s, p0/m, z2.s, z3.s
150 ; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
151 ; CHECK-NEXT: bfcvt z1.h, p0/m, z2.s
152 ; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
153 ; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
155 %res = fdiv <vscale x 8 x bfloat> %a, %b
156 ret <vscale x 8 x bfloat> %res
; llvm.maximum on <vscale x 2 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an FMAX predicated on
; "ptrue p0.d", then a BFCVT.
; B16B16 runs expect "ptrue p0.d" followed by a predicated BFMAX.
163 define <vscale x 2 x bfloat> @fmax_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
164 ; NOB16B16-LABEL: fmax_nxv2bf16:
165 ; NOB16B16: // %bb.0:
166 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
167 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
168 ; NOB16B16-NEXT: ptrue p0.d
169 ; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s
170 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
173 ; B16B16-LABEL: fmax_nxv2bf16:
175 ; B16B16-NEXT: ptrue p0.d
176 ; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
178 %res = call <vscale x 2 x bfloat> @llvm.maximum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
179 ret <vscale x 2 x bfloat> %res
; llvm.maximum on <vscale x 4 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an FMAX predicated on
; "ptrue p0.s", then a BFCVT.
; B16B16 runs expect "ptrue p0.s" followed by a predicated BFMAX.
182 define <vscale x 4 x bfloat> @fmax_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
183 ; NOB16B16-LABEL: fmax_nxv4bf16:
184 ; NOB16B16: // %bb.0:
185 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
186 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
187 ; NOB16B16-NEXT: ptrue p0.s
188 ; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s
189 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
192 ; B16B16-LABEL: fmax_nxv4bf16:
194 ; B16B16-NEXT: ptrue p0.s
195 ; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
197 %res = call <vscale x 4 x bfloat> @llvm.maximum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
198 ret <vscale x 4 x bfloat> %res
; llvm.maximum on <vscale x 8 x bfloat>.
; NOB16B16 runs expect the operands split with UUNPKHI/UUNPKLO, each half
; shifted left by 16, two predicated .s FMAXs, two BFCVTs and a UZP1 that
; recombines the halves.
; B16B16 runs expect "ptrue p0.h" followed by a predicated BFMAX.
201 define <vscale x 8 x bfloat> @fmax_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
202 ; NOB16B16-LABEL: fmax_nxv8bf16:
203 ; NOB16B16: // %bb.0:
204 ; NOB16B16-NEXT: uunpkhi z2.s, z1.h
205 ; NOB16B16-NEXT: uunpkhi z3.s, z0.h
206 ; NOB16B16-NEXT: uunpklo z1.s, z1.h
207 ; NOB16B16-NEXT: uunpklo z0.s, z0.h
208 ; NOB16B16-NEXT: ptrue p0.s
209 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16
210 ; NOB16B16-NEXT: lsl z3.s, z3.s, #16
211 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
212 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
213 ; NOB16B16-NEXT: fmax z2.s, p0/m, z2.s, z3.s
214 ; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s
215 ; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
216 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
217 ; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
220 ; B16B16-LABEL: fmax_nxv8bf16:
222 ; B16B16-NEXT: ptrue p0.h
223 ; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
225 %res = call <vscale x 8 x bfloat> @llvm.maximum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
226 ret <vscale x 8 x bfloat> %res
; llvm.maxnum on <vscale x 2 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an FMAXNM predicated on
; "ptrue p0.d", then a BFCVT.
; B16B16 runs expect "ptrue p0.d" followed by a predicated BFMAXNM.
233 define <vscale x 2 x bfloat> @fmaxnm_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
234 ; NOB16B16-LABEL: fmaxnm_nxv2bf16:
235 ; NOB16B16: // %bb.0:
236 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
237 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
238 ; NOB16B16-NEXT: ptrue p0.d
239 ; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
240 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
243 ; B16B16-LABEL: fmaxnm_nxv2bf16:
245 ; B16B16-NEXT: ptrue p0.d
246 ; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
248 %res = call <vscale x 2 x bfloat> @llvm.maxnum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
249 ret <vscale x 2 x bfloat> %res
; llvm.maxnum on <vscale x 4 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an FMAXNM predicated on
; "ptrue p0.s", then a BFCVT.
; B16B16 runs expect "ptrue p0.s" followed by a predicated BFMAXNM.
252 define <vscale x 4 x bfloat> @fmaxnm_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
253 ; NOB16B16-LABEL: fmaxnm_nxv4bf16:
254 ; NOB16B16: // %bb.0:
255 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
256 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
257 ; NOB16B16-NEXT: ptrue p0.s
258 ; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
259 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
262 ; B16B16-LABEL: fmaxnm_nxv4bf16:
264 ; B16B16-NEXT: ptrue p0.s
265 ; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
267 %res = call <vscale x 4 x bfloat> @llvm.maxnum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
268 ret <vscale x 4 x bfloat> %res
; llvm.maxnum on <vscale x 8 x bfloat>.
; NOB16B16 runs expect the operands split with UUNPKHI/UUNPKLO, each half
; shifted left by 16, two predicated .s FMAXNMs, two BFCVTs and a UZP1 that
; recombines the halves.
; B16B16 runs expect "ptrue p0.h" followed by a predicated BFMAXNM.
271 define <vscale x 8 x bfloat> @fmaxnm_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
272 ; NOB16B16-LABEL: fmaxnm_nxv8bf16:
273 ; NOB16B16: // %bb.0:
274 ; NOB16B16-NEXT: uunpkhi z2.s, z1.h
275 ; NOB16B16-NEXT: uunpkhi z3.s, z0.h
276 ; NOB16B16-NEXT: uunpklo z1.s, z1.h
277 ; NOB16B16-NEXT: uunpklo z0.s, z0.h
278 ; NOB16B16-NEXT: ptrue p0.s
279 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16
280 ; NOB16B16-NEXT: lsl z3.s, z3.s, #16
281 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
282 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
283 ; NOB16B16-NEXT: fmaxnm z2.s, p0/m, z2.s, z3.s
284 ; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
285 ; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
286 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
287 ; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
290 ; B16B16-LABEL: fmaxnm_nxv8bf16:
292 ; B16B16-NEXT: ptrue p0.h
293 ; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
295 %res = call <vscale x 8 x bfloat> @llvm.maxnum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
296 ret <vscale x 8 x bfloat> %res
; llvm.minimum on <vscale x 2 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an FMIN predicated on
; "ptrue p0.d", then a BFCVT.
; B16B16 runs expect "ptrue p0.d" followed by a predicated BFMIN.
303 define <vscale x 2 x bfloat> @fmin_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
304 ; NOB16B16-LABEL: fmin_nxv2bf16:
305 ; NOB16B16: // %bb.0:
306 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
307 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
308 ; NOB16B16-NEXT: ptrue p0.d
309 ; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s
310 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
313 ; B16B16-LABEL: fmin_nxv2bf16:
315 ; B16B16-NEXT: ptrue p0.d
316 ; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
318 %res = call <vscale x 2 x bfloat> @llvm.minimum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
319 ret <vscale x 2 x bfloat> %res
; llvm.minimum on <vscale x 4 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an FMIN predicated on
; "ptrue p0.s", then a BFCVT.
; B16B16 runs expect "ptrue p0.s" followed by a predicated BFMIN.
322 define <vscale x 4 x bfloat> @fmin_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
323 ; NOB16B16-LABEL: fmin_nxv4bf16:
324 ; NOB16B16: // %bb.0:
325 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
326 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
327 ; NOB16B16-NEXT: ptrue p0.s
328 ; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s
329 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
332 ; B16B16-LABEL: fmin_nxv4bf16:
334 ; B16B16-NEXT: ptrue p0.s
335 ; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
337 %res = call <vscale x 4 x bfloat> @llvm.minimum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
338 ret <vscale x 4 x bfloat> %res
; llvm.minimum on <vscale x 8 x bfloat>.
; NOB16B16 runs expect the operands split with UUNPKHI/UUNPKLO, each half
; shifted left by 16, two predicated .s FMINs, two BFCVTs and a UZP1 that
; recombines the halves.
; B16B16 runs expect "ptrue p0.h" followed by a predicated BFMIN.
341 define <vscale x 8 x bfloat> @fmin_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
342 ; NOB16B16-LABEL: fmin_nxv8bf16:
343 ; NOB16B16: // %bb.0:
344 ; NOB16B16-NEXT: uunpkhi z2.s, z1.h
345 ; NOB16B16-NEXT: uunpkhi z3.s, z0.h
346 ; NOB16B16-NEXT: uunpklo z1.s, z1.h
347 ; NOB16B16-NEXT: uunpklo z0.s, z0.h
348 ; NOB16B16-NEXT: ptrue p0.s
349 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16
350 ; NOB16B16-NEXT: lsl z3.s, z3.s, #16
351 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
352 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
353 ; NOB16B16-NEXT: fmin z2.s, p0/m, z2.s, z3.s
354 ; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s
355 ; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
356 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
357 ; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
360 ; B16B16-LABEL: fmin_nxv8bf16:
362 ; B16B16-NEXT: ptrue p0.h
363 ; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
365 %res = call <vscale x 8 x bfloat> @llvm.minimum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
366 ret <vscale x 8 x bfloat> %res
; llvm.minnum on <vscale x 2 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an FMINNM predicated on
; "ptrue p0.d", then a BFCVT.
; B16B16 runs expect "ptrue p0.d" followed by a predicated BFMINNM.
373 define <vscale x 2 x bfloat> @fminnm_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
374 ; NOB16B16-LABEL: fminnm_nxv2bf16:
375 ; NOB16B16: // %bb.0:
376 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
377 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
378 ; NOB16B16-NEXT: ptrue p0.d
379 ; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
380 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
383 ; B16B16-LABEL: fminnm_nxv2bf16:
385 ; B16B16-NEXT: ptrue p0.d
386 ; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
388 %res = call <vscale x 2 x bfloat> @llvm.minnum.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
389 ret <vscale x 2 x bfloat> %res
; llvm.minnum on <vscale x 4 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an FMINNM predicated on
; "ptrue p0.s", then a BFCVT.
; B16B16 runs expect "ptrue p0.s" followed by a predicated BFMINNM.
392 define <vscale x 4 x bfloat> @fminnm_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
393 ; NOB16B16-LABEL: fminnm_nxv4bf16:
394 ; NOB16B16: // %bb.0:
395 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
396 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
397 ; NOB16B16-NEXT: ptrue p0.s
398 ; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
399 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
402 ; B16B16-LABEL: fminnm_nxv4bf16:
404 ; B16B16-NEXT: ptrue p0.s
405 ; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
407 %res = call <vscale x 4 x bfloat> @llvm.minnum.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
408 ret <vscale x 4 x bfloat> %res
; llvm.minnum on <vscale x 8 x bfloat>.
; NOB16B16 runs expect the operands split with UUNPKHI/UUNPKLO, each half
; shifted left by 16, two predicated .s FMINNMs, two BFCVTs and a UZP1 that
; recombines the halves.
; B16B16 runs expect "ptrue p0.h" followed by a predicated BFMINNM.
411 define <vscale x 8 x bfloat> @fminnm_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
412 ; NOB16B16-LABEL: fminnm_nxv8bf16:
413 ; NOB16B16: // %bb.0:
414 ; NOB16B16-NEXT: uunpkhi z2.s, z1.h
415 ; NOB16B16-NEXT: uunpkhi z3.s, z0.h
416 ; NOB16B16-NEXT: uunpklo z1.s, z1.h
417 ; NOB16B16-NEXT: uunpklo z0.s, z0.h
418 ; NOB16B16-NEXT: ptrue p0.s
419 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16
420 ; NOB16B16-NEXT: lsl z3.s, z3.s, #16
421 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
422 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
423 ; NOB16B16-NEXT: fminnm z2.s, p0/m, z2.s, z3.s
424 ; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
425 ; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
426 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
427 ; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
430 ; B16B16-LABEL: fminnm_nxv8bf16:
432 ; B16B16-NEXT: ptrue p0.h
433 ; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
435 %res = call <vscale x 8 x bfloat> @llvm.minnum.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
436 ret <vscale x 8 x bfloat> %res
; llvm.fma(%a, %b, %c) on <vscale x 2 x bfloat>.
; NOB16B16 runs expect LSL #16 on all three operands, an FMAD predicated on
; "ptrue p0.d", then a BFCVT.
; B16B16 runs expect "ptrue p0.d" followed by a predicated BFMLA.
; NOTE(review): architecturally FMAD computes Za + Zdn * Zm (here z2 + z0 * z1)
; while BFMLA computes Zda + Zn * Zm (here z0 + z1 * z2). Assuming %a/%b/%c
; arrive in z0/z1/z2, the first matches %a * %b + %c but the second would be
; %a + %b * %c. Confirm the B16B16 expectation against current llc output and
; regenerate with update_llc_test_checks.py if it differs.
443 define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %c) {
444 ; NOB16B16-LABEL: fmla_nxv2bf16:
445 ; NOB16B16: // %bb.0:
446 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
447 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
448 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16
449 ; NOB16B16-NEXT: ptrue p0.d
450 ; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
451 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
454 ; B16B16-LABEL: fmla_nxv2bf16:
456 ; B16B16-NEXT: ptrue p0.d
457 ; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
459 %res = call <vscale x 2 x bfloat> @llvm.fma.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %c)
460 ret <vscale x 2 x bfloat> %res
; llvm.fma(%a, %b, %c) on <vscale x 4 x bfloat>.
; NOB16B16 runs expect LSL #16 on all three operands, an FMAD predicated on
; "ptrue p0.s", then a BFCVT.
; B16B16 runs expect "ptrue p0.s" followed by a predicated BFMLA.
; NOTE(review): architecturally FMAD computes Za + Zdn * Zm (here z2 + z0 * z1)
; while BFMLA computes Zda + Zn * Zm (here z0 + z1 * z2). Assuming %a/%b/%c
; arrive in z0/z1/z2, the first matches %a * %b + %c but the second would be
; %a + %b * %c. Confirm the B16B16 expectation against current llc output and
; regenerate with update_llc_test_checks.py if it differs.
463 define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c) {
464 ; NOB16B16-LABEL: fmla_nxv4bf16:
465 ; NOB16B16: // %bb.0:
466 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
467 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
468 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16
469 ; NOB16B16-NEXT: ptrue p0.s
470 ; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
471 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
474 ; B16B16-LABEL: fmla_nxv4bf16:
476 ; B16B16-NEXT: ptrue p0.s
477 ; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
479 %res = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c)
480 ret <vscale x 4 x bfloat> %res
; llvm.fma(%a, %b, %c) on <vscale x 8 x bfloat>.
; NOB16B16 runs expect all three operands split with UUNPKHI/UUNPKLO, each half
; shifted left by 16, two predicated .s FMADs, two BFCVTs and a UZP1 that
; recombines the halves.
; B16B16 runs expect "ptrue p0.h" followed by a predicated BFMLA.
; NOTE(review): architecturally FMAD computes Za + Zdn * Zm while BFMLA
; computes Zda + Zn * Zm (here z0 + z1 * z2). Assuming %a/%b/%c arrive in
; z0/z1/z2, the FMADs match %a * %b + %c but the BFMLA would be %a + %b * %c.
; Confirm the B16B16 expectation against current llc output and regenerate
; with update_llc_test_checks.py if it differs.
483 define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
484 ; NOB16B16-LABEL: fmla_nxv8bf16:
485 ; NOB16B16: // %bb.0:
486 ; NOB16B16-NEXT: uunpkhi z3.s, z1.h
487 ; NOB16B16-NEXT: uunpkhi z4.s, z0.h
488 ; NOB16B16-NEXT: uunpkhi z5.s, z2.h
489 ; NOB16B16-NEXT: uunpklo z1.s, z1.h
490 ; NOB16B16-NEXT: uunpklo z0.s, z0.h
491 ; NOB16B16-NEXT: uunpklo z2.s, z2.h
492 ; NOB16B16-NEXT: ptrue p0.s
493 ; NOB16B16-NEXT: lsl z3.s, z3.s, #16
494 ; NOB16B16-NEXT: lsl z4.s, z4.s, #16
495 ; NOB16B16-NEXT: lsl z5.s, z5.s, #16
496 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
497 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
498 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16
499 ; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s
500 ; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
501 ; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s
502 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
503 ; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
506 ; B16B16-LABEL: fmla_nxv8bf16:
508 ; B16B16-NEXT: ptrue p0.h
509 ; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
511 %res = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
512 ret <vscale x 8 x bfloat> %res
; fmul on <vscale x 2 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an FMUL predicated on
; "ptrue p0.d", then a BFCVT.
; B16B16 runs expect a single unpredicated BFMUL.
519 define <vscale x 2 x bfloat> @fmul_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
520 ; NOB16B16-LABEL: fmul_nxv2bf16:
521 ; NOB16B16: // %bb.0:
522 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
523 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
524 ; NOB16B16-NEXT: ptrue p0.d
525 ; NOB16B16-NEXT: fmul z0.s, p0/m, z0.s, z1.s
526 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
529 ; B16B16-LABEL: fmul_nxv2bf16:
531 ; B16B16-NEXT: bfmul z0.h, z0.h, z1.h
533 %res = fmul <vscale x 2 x bfloat> %a, %b
534 ret <vscale x 2 x bfloat> %res
; fmul on <vscale x 4 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an unpredicated .s FMUL, and a
; BFCVT governed by "ptrue p0.s" (the predicate is only used by the BFCVT).
; B16B16 runs expect a single unpredicated BFMUL.
537 define <vscale x 4 x bfloat> @fmul_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
538 ; NOB16B16-LABEL: fmul_nxv4bf16:
539 ; NOB16B16: // %bb.0:
540 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
541 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
542 ; NOB16B16-NEXT: ptrue p0.s
543 ; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s
544 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
547 ; B16B16-LABEL: fmul_nxv4bf16:
549 ; B16B16-NEXT: bfmul z0.h, z0.h, z1.h
551 %res = fmul <vscale x 4 x bfloat> %a, %b
552 ret <vscale x 4 x bfloat> %res
; fmul on <vscale x 8 x bfloat>.
; NOB16B16 runs expect the operands split with UUNPKHI/UUNPKLO, each half
; shifted left by 16, two unpredicated .s FMULs, two BFCVTs and a UZP1 that
; recombines the halves.
; B16B16 runs expect a single unpredicated BFMUL.
555 define <vscale x 8 x bfloat> @fmul_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
556 ; NOB16B16-LABEL: fmul_nxv8bf16:
557 ; NOB16B16: // %bb.0:
558 ; NOB16B16-NEXT: uunpkhi z2.s, z1.h
559 ; NOB16B16-NEXT: uunpkhi z3.s, z0.h
560 ; NOB16B16-NEXT: uunpklo z1.s, z1.h
561 ; NOB16B16-NEXT: uunpklo z0.s, z0.h
562 ; NOB16B16-NEXT: ptrue p0.s
563 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16
564 ; NOB16B16-NEXT: lsl z3.s, z3.s, #16
565 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
566 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
567 ; NOB16B16-NEXT: fmul z2.s, z3.s, z2.s
568 ; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s
569 ; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
570 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
571 ; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
574 ; B16B16-LABEL: fmul_nxv8bf16:
576 ; B16B16-NEXT: bfmul z0.h, z0.h, z1.h
578 %res = fmul <vscale x 8 x bfloat> %a, %b
579 ret <vscale x 8 x bfloat> %res
; fneg on <vscale x 2 x bfloat>. The checks use the prefix shared by all RUN
; lines and expect a single EOR of the .h lanes with #0x8000.
586 define <vscale x 2 x bfloat> @fneg_nxv2bf16(<vscale x 2 x bfloat> %a) {
587 ; CHECK-LABEL: fneg_nxv2bf16:
589 ; CHECK-NEXT: eor z0.h, z0.h, #0x8000
591 %res = fneg <vscale x 2 x bfloat> %a
592 ret <vscale x 2 x bfloat> %res
; fneg on <vscale x 4 x bfloat>. Expected to be one EOR of the .h lanes with
; #0x8000 in every RUN configuration.
595 define <vscale x 4 x bfloat> @fneg_nxv4bf16(<vscale x 4 x bfloat> %a) {
596 ; CHECK-LABEL: fneg_nxv4bf16:
598 ; CHECK-NEXT: eor z0.h, z0.h, #0x8000
600 %res = fneg <vscale x 4 x bfloat> %a
601 ret <vscale x 4 x bfloat> %res
; fneg on <vscale x 8 x bfloat>. Expected to be one EOR of the .h lanes with
; #0x8000 in every RUN configuration.
604 define <vscale x 8 x bfloat> @fneg_nxv8bf16(<vscale x 8 x bfloat> %a) {
605 ; CHECK-LABEL: fneg_nxv8bf16:
607 ; CHECK-NEXT: eor z0.h, z0.h, #0x8000
609 %res = fneg <vscale x 8 x bfloat> %a
610 ret <vscale x 8 x bfloat> %res
; llvm.sqrt on <vscale x 2 x bfloat>. The checks use the prefix shared by all
; RUN lines, so the same sequence is expected with and without +sve-b16b16:
; LSL #16, an FSQRT predicated on "ptrue p0.d", then a BFCVT.
617 define <vscale x 2 x bfloat> @fsqrt_nxv2bf16(<vscale x 2 x bfloat> %a) {
618 ; CHECK-LABEL: fsqrt_nxv2bf16:
620 ; CHECK-NEXT: lsl z0.s, z0.s, #16
621 ; CHECK-NEXT: ptrue p0.d
622 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
623 ; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
625 %res = call <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat> %a)
626 ret <vscale x 2 x bfloat> %res
; llvm.sqrt on <vscale x 4 x bfloat>. Same sequence expected in every RUN
; configuration: LSL #16, an FSQRT predicated on "ptrue p0.s", then a BFCVT.
629 define <vscale x 4 x bfloat> @fsqrt_nxv4bf16(<vscale x 4 x bfloat> %a) {
630 ; CHECK-LABEL: fsqrt_nxv4bf16:
632 ; CHECK-NEXT: lsl z0.s, z0.s, #16
633 ; CHECK-NEXT: ptrue p0.s
634 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
635 ; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
637 %res = call <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat> %a)
638 ret <vscale x 4 x bfloat> %res
; llvm.sqrt on <vscale x 8 x bfloat>. Same sequence expected in every RUN
; configuration: the operand split with UUNPKHI/UUNPKLO, each half shifted
; left by 16, two predicated .s FSQRTs, two BFCVTs and a UZP1 to recombine.
641 define <vscale x 8 x bfloat> @fsqrt_nxv8bf16(<vscale x 8 x bfloat> %a) {
642 ; CHECK-LABEL: fsqrt_nxv8bf16:
644 ; CHECK-NEXT: uunpkhi z1.s, z0.h
645 ; CHECK-NEXT: uunpklo z0.s, z0.h
646 ; CHECK-NEXT: ptrue p0.s
647 ; CHECK-NEXT: lsl z1.s, z1.s, #16
648 ; CHECK-NEXT: lsl z0.s, z0.s, #16
649 ; CHECK-NEXT: fsqrt z1.s, p0/m, z1.s
650 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
651 ; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
652 ; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
653 ; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
655 %res = call <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat> %a)
656 ret <vscale x 8 x bfloat> %res
; fsub on <vscale x 2 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an FSUB predicated on
; "ptrue p0.d", then a BFCVT.
; B16B16 runs expect a single unpredicated BFSUB.
663 define <vscale x 2 x bfloat> @fsub_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
664 ; NOB16B16-LABEL: fsub_nxv2bf16:
665 ; NOB16B16: // %bb.0:
666 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
667 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
668 ; NOB16B16-NEXT: ptrue p0.d
669 ; NOB16B16-NEXT: fsub z0.s, p0/m, z0.s, z1.s
670 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
673 ; B16B16-LABEL: fsub_nxv2bf16:
675 ; B16B16-NEXT: bfsub z0.h, z0.h, z1.h
677 %res = fsub <vscale x 2 x bfloat> %a, %b
678 ret <vscale x 2 x bfloat> %res
; fsub on <vscale x 4 x bfloat>.
; NOB16B16 runs expect LSL #16 on both operands, an unpredicated .s FSUB, and a
; BFCVT governed by "ptrue p0.s" (the predicate is only used by the BFCVT).
; B16B16 runs expect a single unpredicated BFSUB.
681 define <vscale x 4 x bfloat> @fsub_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
682 ; NOB16B16-LABEL: fsub_nxv4bf16:
683 ; NOB16B16: // %bb.0:
684 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
685 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
686 ; NOB16B16-NEXT: ptrue p0.s
687 ; NOB16B16-NEXT: fsub z0.s, z0.s, z1.s
688 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
691 ; B16B16-LABEL: fsub_nxv4bf16:
693 ; B16B16-NEXT: bfsub z0.h, z0.h, z1.h
695 %res = fsub <vscale x 4 x bfloat> %a, %b
696 ret <vscale x 4 x bfloat> %res
; fsub on <vscale x 8 x bfloat>.
; NOB16B16 runs expect the operands split with UUNPKHI/UUNPKLO, each half
; shifted left by 16, two unpredicated .s FSUBs (minuend halves come from z0),
; two BFCVTs and a UZP1 that recombines the halves.
; B16B16 runs expect a single unpredicated BFSUB.
699 define <vscale x 8 x bfloat> @fsub_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
700 ; NOB16B16-LABEL: fsub_nxv8bf16:
701 ; NOB16B16: // %bb.0:
702 ; NOB16B16-NEXT: uunpkhi z2.s, z1.h
703 ; NOB16B16-NEXT: uunpkhi z3.s, z0.h
704 ; NOB16B16-NEXT: uunpklo z1.s, z1.h
705 ; NOB16B16-NEXT: uunpklo z0.s, z0.h
706 ; NOB16B16-NEXT: ptrue p0.s
707 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16
708 ; NOB16B16-NEXT: lsl z3.s, z3.s, #16
709 ; NOB16B16-NEXT: lsl z1.s, z1.s, #16
710 ; NOB16B16-NEXT: lsl z0.s, z0.s, #16
711 ; NOB16B16-NEXT: fsub z2.s, z3.s, z2.s
712 ; NOB16B16-NEXT: fsub z0.s, z0.s, z1.s
713 ; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
714 ; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
715 ; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
718 ; B16B16-LABEL: fsub_nxv8bf16:
720 ; B16B16-NEXT: bfsub z0.h, z0.h, z1.h
722 %res = fsub <vscale x 8 x bfloat> %a, %b
723 ret <vscale x 8 x bfloat> %res
726 declare <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat>)
727 declare <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat>)
728 declare <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat>)
730 declare <vscale x 2 x bfloat> @llvm.fma.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
731 declare <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
732 declare <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
734 declare <vscale x 2 x bfloat> @llvm.maximum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
735 declare <vscale x 4 x bfloat> @llvm.maximum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
736 declare <vscale x 8 x bfloat> @llvm.maximum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
738 declare <vscale x 2 x bfloat> @llvm.maxnum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
739 declare <vscale x 4 x bfloat> @llvm.maxnum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
740 declare <vscale x 8 x bfloat> @llvm.maxnum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
742 declare <vscale x 2 x bfloat> @llvm.minimum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
743 declare <vscale x 4 x bfloat> @llvm.minimum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
744 declare <vscale x 8 x bfloat> @llvm.minimum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
746 declare <vscale x 2 x bfloat> @llvm.minnum.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>)
747 declare <vscale x 4 x bfloat> @llvm.minnum.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>)
748 declare <vscale x 8 x bfloat> @llvm.minnum.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
750 declare <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat>)
751 declare <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat>)
752 declare <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat>)