1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2 ; RUN: llc -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,NOBF16
3 ; RUN: llc -mattr=+sve --enable-no-nans-fp-math < %s | FileCheck %s --check-prefixes=CHECK,NOBF16NNAN
4 ; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,BF16
5 ; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,BF16
7 target triple = "aarch64-unknown-linux-gnu"
9 ; NOTE: "fptrunc <# x double> to <# x bfloat>" is not supported because SVE
10 ; lacks a down convert that rounds to odd. Such IR will trigger the usual
11 ; failure (crash) when attempting to unroll a scalable vector.
; bf16 -> f32 widening needs no FP instruction: a bfloat16 value is the high
; 16 bits of the equivalent float32, so a 16-bit left shift within each
; 32-bit lane produces the extended value directly.
13 define <vscale x 2 x float> @fpext_nxv2bf16_to_nxv2f32(<vscale x 2 x bfloat> %a) {
14 ; CHECK-LABEL: fpext_nxv2bf16_to_nxv2f32:
16 ; CHECK-NEXT: lsl z0.s, z0.s, #16
18 %res = fpext <vscale x 2 x bfloat> %a to <vscale x 2 x float>
19 ret <vscale x 2 x float> %res
; Same shift-by-16 lowering for a fully packed nxv4 vector (one Z register of
; 32-bit lanes), so again a single shift suffices.
22 define <vscale x 4 x float> @fpext_nxv4bf16_to_nxv4f32(<vscale x 4 x bfloat> %a) {
23 ; CHECK-LABEL: fpext_nxv4bf16_to_nxv4f32:
25 ; CHECK-NEXT: lsl z0.s, z0.s, #16
27 %res = fpext <vscale x 4 x bfloat> %a to <vscale x 4 x float>
28 ret <vscale x 4 x float> %res
; The nxv8 result needs two Z registers: the packed bf16 input is first
; unpacked into low/high halves (uunpklo/uunpkhi h -> s), then each half is
; shifted into f32 position.
31 define <vscale x 8 x float> @fpext_nxv8bf16_to_nxv8f32(<vscale x 8 x bfloat> %a) {
32 ; CHECK-LABEL: fpext_nxv8bf16_to_nxv8f32:
34 ; CHECK-NEXT: uunpklo z1.s, z0.h
35 ; CHECK-NEXT: uunpkhi z2.s, z0.h
36 ; CHECK-NEXT: lsl z0.s, z1.s, #16
37 ; CHECK-NEXT: lsl z1.s, z2.s, #16
39 %res = fpext <vscale x 8 x bfloat> %a to <vscale x 8 x float>
40 ret <vscale x 8 x float> %res
; bf16 -> f64 goes via f32: shift the bf16 payload into f32 position, then a
; predicated fcvt widens f32 -> f64.
43 define <vscale x 2 x double> @fpext_nxv2bf16_to_nxv2f64(<vscale x 2 x bfloat> %a) {
44 ; CHECK-LABEL: fpext_nxv2bf16_to_nxv2f64:
46 ; CHECK-NEXT: lsl z0.s, z0.s, #16
47 ; CHECK-NEXT: ptrue p0.d
48 ; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
50 %res = fpext <vscale x 2 x bfloat> %a to <vscale x 2 x double>
51 ret <vscale x 2 x double> %res
; Two-register f64 result: unpack the nxv4 input into 64-bit containers,
; shift each half into f32 position, then widen with predicated fcvt.
; The movprfx instructions pair with the following fcvt so the (destructive)
; conversion can target a register other than its source.
54 define <vscale x 4 x double> @fpext_nxv4bf16_to_nxv4f64(<vscale x 4 x bfloat> %a) {
55 ; CHECK-LABEL: fpext_nxv4bf16_to_nxv4f64:
57 ; CHECK-NEXT: uunpklo z1.d, z0.s
58 ; CHECK-NEXT: uunpkhi z0.d, z0.s
59 ; CHECK-NEXT: ptrue p0.d
60 ; CHECK-NEXT: lsl z1.s, z1.s, #16
61 ; CHECK-NEXT: lsl z2.s, z0.s, #16
62 ; CHECK-NEXT: movprfx z0, z1
63 ; CHECK-NEXT: fcvt z0.d, p0/m, z1.s
64 ; CHECK-NEXT: movprfx z1, z2
65 ; CHECK-NEXT: fcvt z1.d, p0/m, z2.s
67 %res = fpext <vscale x 4 x bfloat> %a to <vscale x 4 x double>
68 ret <vscale x 4 x double> %res
; Four-register f64 result: two levels of unpacking (h -> s, then s -> d)
; split the input into quarters; each quarter is shifted into f32 position
; and widened with a predicated fcvt (movprfx feeding the destructive fcvt
; where source and destination differ).
71 define <vscale x 8 x double> @fpext_nxv8bf16_to_nxv8f64(<vscale x 8 x bfloat> %a) {
72 ; CHECK-LABEL: fpext_nxv8bf16_to_nxv8f64:
74 ; CHECK-NEXT: uunpklo z1.s, z0.h
75 ; CHECK-NEXT: uunpkhi z0.s, z0.h
76 ; CHECK-NEXT: ptrue p0.d
77 ; CHECK-NEXT: uunpklo z2.d, z1.s
78 ; CHECK-NEXT: uunpkhi z1.d, z1.s
79 ; CHECK-NEXT: uunpklo z3.d, z0.s
80 ; CHECK-NEXT: uunpkhi z0.d, z0.s
81 ; CHECK-NEXT: lsl z1.s, z1.s, #16
82 ; CHECK-NEXT: lsl z2.s, z2.s, #16
83 ; CHECK-NEXT: lsl z3.s, z3.s, #16
84 ; CHECK-NEXT: lsl z4.s, z0.s, #16
85 ; CHECK-NEXT: fcvt z1.d, p0/m, z1.s
86 ; CHECK-NEXT: movprfx z0, z2
87 ; CHECK-NEXT: fcvt z0.d, p0/m, z2.s
88 ; CHECK-NEXT: movprfx z2, z3
89 ; CHECK-NEXT: fcvt z2.d, p0/m, z3.s
90 ; CHECK-NEXT: movprfx z3, z4
91 ; CHECK-NEXT: fcvt z3.d, p0/m, z4.s
93 %res = fpext <vscale x 8 x bfloat> %a to <vscale x 8 x double>
94 ret <vscale x 8 x double> %res
; f32 -> bf16 truncation, three lowerings by feature set:
; - NOBF16: round-to-nearest-even done in integer arithmetic — add 0x7FFF
;   plus the LSB of the would-be bf16 mantissa (ties-to-even), with
;   fcmuo + sel substituting a quieted NaN (orr of 0x400000) for unordered
;   (NaN) inputs before the final shift down by 16.
; - NOBF16NNAN (--enable-no-nans-fp-math): same rounding, but the
;   NaN-quieting compare/select is dropped.
; - BF16 (+bf16 or streaming SME): a single predicated bfcvt.
97 define <vscale x 2 x bfloat> @fptrunc_nxv2f32_to_nxv2bf16(<vscale x 2 x float> %a) {
98 ; NOBF16-LABEL: fptrunc_nxv2f32_to_nxv2bf16:
100 ; NOBF16-NEXT: mov z1.s, #32767 // =0x7fff
101 ; NOBF16-NEXT: lsr z2.s, z0.s, #16
102 ; NOBF16-NEXT: ptrue p0.d
103 ; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
104 ; NOBF16-NEXT: and z2.s, z2.s, #0x1
105 ; NOBF16-NEXT: add z1.s, z0.s, z1.s
106 ; NOBF16-NEXT: orr z0.s, z0.s, #0x400000
107 ; NOBF16-NEXT: add z1.s, z2.s, z1.s
108 ; NOBF16-NEXT: sel z0.s, p0, z0.s, z1.s
109 ; NOBF16-NEXT: lsr z0.s, z0.s, #16
112 ; NOBF16NNAN-LABEL: fptrunc_nxv2f32_to_nxv2bf16:
113 ; NOBF16NNAN: // %bb.0:
114 ; NOBF16NNAN-NEXT: mov z1.s, #32767 // =0x7fff
115 ; NOBF16NNAN-NEXT: lsr z2.s, z0.s, #16
116 ; NOBF16NNAN-NEXT: and z2.s, z2.s, #0x1
117 ; NOBF16NNAN-NEXT: add z0.s, z0.s, z1.s
118 ; NOBF16NNAN-NEXT: add z0.s, z2.s, z0.s
119 ; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16
120 ; NOBF16NNAN-NEXT: ret
122 ; BF16-LABEL: fptrunc_nxv2f32_to_nxv2bf16:
124 ; BF16-NEXT: ptrue p0.d
125 ; BF16-NEXT: bfcvt z0.h, p0/m, z0.s
127 %res = fptrunc <vscale x 2 x float> %a to <vscale x 2 x bfloat>
128 ret <vscale x 2 x bfloat> %res
; Same f32 -> bf16 lowerings as the nxv2 case, but for a fully packed nxv4
; vector the predicate is built over 32-bit elements (p0.s rather than p0.d).
131 define <vscale x 4 x bfloat> @fptrunc_nxv4f32_to_nxv4bf16(<vscale x 4 x float> %a) {
132 ; NOBF16-LABEL: fptrunc_nxv4f32_to_nxv4bf16:
134 ; NOBF16-NEXT: mov z1.s, #32767 // =0x7fff
135 ; NOBF16-NEXT: lsr z2.s, z0.s, #16
136 ; NOBF16-NEXT: ptrue p0.s
137 ; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
138 ; NOBF16-NEXT: and z2.s, z2.s, #0x1
139 ; NOBF16-NEXT: add z1.s, z0.s, z1.s
140 ; NOBF16-NEXT: orr z0.s, z0.s, #0x400000
141 ; NOBF16-NEXT: add z1.s, z2.s, z1.s
142 ; NOBF16-NEXT: sel z0.s, p0, z0.s, z1.s
143 ; NOBF16-NEXT: lsr z0.s, z0.s, #16
146 ; NOBF16NNAN-LABEL: fptrunc_nxv4f32_to_nxv4bf16:
147 ; NOBF16NNAN: // %bb.0:
148 ; NOBF16NNAN-NEXT: mov z1.s, #32767 // =0x7fff
149 ; NOBF16NNAN-NEXT: lsr z2.s, z0.s, #16
150 ; NOBF16NNAN-NEXT: and z2.s, z2.s, #0x1
151 ; NOBF16NNAN-NEXT: add z0.s, z0.s, z1.s
152 ; NOBF16NNAN-NEXT: add z0.s, z2.s, z0.s
153 ; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16
154 ; NOBF16NNAN-NEXT: ret
156 ; BF16-LABEL: fptrunc_nxv4f32_to_nxv4bf16:
158 ; BF16-NEXT: ptrue p0.s
159 ; BF16-NEXT: bfcvt z0.h, p0/m, z0.s
161 %res = fptrunc <vscale x 4 x float> %a to <vscale x 4 x bfloat>
162 ret <vscale x 4 x bfloat> %res
; Two-register nxv8 input: the same per-lane f32 -> bf16 rounding (or bfcvt
; when available) is applied to each source register independently, and the
; two 16-bit results are recombined into one packed register with uzp1.
165 define <vscale x 8 x bfloat> @fptrunc_nxv8f32_to_nxv8bf16(<vscale x 8 x float> %a) {
166 ; NOBF16-LABEL: fptrunc_nxv8f32_to_nxv8bf16:
168 ; NOBF16-NEXT: mov z2.s, #32767 // =0x7fff
169 ; NOBF16-NEXT: lsr z3.s, z1.s, #16
170 ; NOBF16-NEXT: lsr z4.s, z0.s, #16
171 ; NOBF16-NEXT: ptrue p0.s
172 ; NOBF16-NEXT: and z3.s, z3.s, #0x1
173 ; NOBF16-NEXT: and z4.s, z4.s, #0x1
174 ; NOBF16-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s
175 ; NOBF16-NEXT: add z5.s, z1.s, z2.s
176 ; NOBF16-NEXT: add z2.s, z0.s, z2.s
177 ; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
178 ; NOBF16-NEXT: orr z1.s, z1.s, #0x400000
179 ; NOBF16-NEXT: orr z0.s, z0.s, #0x400000
180 ; NOBF16-NEXT: add z3.s, z3.s, z5.s
181 ; NOBF16-NEXT: add z2.s, z4.s, z2.s
182 ; NOBF16-NEXT: sel z1.s, p1, z1.s, z3.s
183 ; NOBF16-NEXT: sel z0.s, p0, z0.s, z2.s
184 ; NOBF16-NEXT: lsr z1.s, z1.s, #16
185 ; NOBF16-NEXT: lsr z0.s, z0.s, #16
186 ; NOBF16-NEXT: uzp1 z0.h, z0.h, z1.h
189 ; NOBF16NNAN-LABEL: fptrunc_nxv8f32_to_nxv8bf16:
190 ; NOBF16NNAN: // %bb.0:
191 ; NOBF16NNAN-NEXT: mov z2.s, #32767 // =0x7fff
192 ; NOBF16NNAN-NEXT: lsr z3.s, z1.s, #16
193 ; NOBF16NNAN-NEXT: lsr z4.s, z0.s, #16
194 ; NOBF16NNAN-NEXT: and z3.s, z3.s, #0x1
195 ; NOBF16NNAN-NEXT: and z4.s, z4.s, #0x1
196 ; NOBF16NNAN-NEXT: add z1.s, z1.s, z2.s
197 ; NOBF16NNAN-NEXT: add z0.s, z0.s, z2.s
198 ; NOBF16NNAN-NEXT: add z1.s, z3.s, z1.s
199 ; NOBF16NNAN-NEXT: add z0.s, z4.s, z0.s
200 ; NOBF16NNAN-NEXT: lsr z1.s, z1.s, #16
201 ; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16
202 ; NOBF16NNAN-NEXT: uzp1 z0.h, z0.h, z1.h
203 ; NOBF16NNAN-NEXT: ret
205 ; BF16-LABEL: fptrunc_nxv8f32_to_nxv8bf16:
207 ; BF16-NEXT: ptrue p0.s
208 ; BF16-NEXT: bfcvt z1.h, p0/m, z1.s
209 ; BF16-NEXT: bfcvt z0.h, p0/m, z0.s
210 ; BF16-NEXT: uzp1 z0.h, z0.h, z1.h
212 %res = fptrunc <vscale x 8 x float> %a to <vscale x 8 x bfloat>
213 ret <vscale x 8 x bfloat> %res