; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN %s

; TODO: Add global-isel when it can support bf16
; bf16 -> f32 extend for a VGPR input: a bfloat is the high 16 bits of an
; f32, so a 16-bit left shift is the whole conversion.
define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
; GCN-LABEL: v_test_cvt_bf16_f32_v:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT:    ; return to shader part epilog
  %cvt = fpext bfloat %v to float
  ret float %cvt
}
; Same bf16 -> f32 extend but with an SGPR (inreg) input: scalar shift,
; then a copy to v0 for the shader return value.
define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
; GCN-LABEL: v_test_cvt_bf16_f32_s:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_lshl_b32 s0, s0, 16
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ; return to shader part epilog
  %cvt = fpext bfloat %v to float
  ret float %cvt
}
; <2 x f32> -> <2 x bf16> truncate (VGPR inputs), using the round-to-nearest-even
; sequence (bfe/add3 of 0x7fff plus the rounding bit) with a NaN quieting path
; selected by v_cmp_u (unordered). Results are packed with v_perm.
; NOTE(review): the s_nop lines are the gfx940 VALU->VCC-read hazard waits
; emitted by llc; regenerate with update_llc_test_checks.py to confirm counts.
define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
; GCN-LABEL: v_test_cvt_v2f32_v2bf16_v:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_bfe_u32 v2, v0, 16, 1
; GCN-NEXT:    s_movk_i32 s0, 0x7fff
; GCN-NEXT:    v_add3_u32 v2, v2, v0, s0
; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v0
; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GCN-NEXT:    v_bfe_u32 v2, v1, 16, 1
; GCN-NEXT:    v_add3_u32 v2, v2, v1, s0
; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v1
; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
; GCN-NEXT:    s_mov_b32 s0, 0x7060302
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
; GCN-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x float> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}
; Scalar (inreg) version of the v2f32 -> v2bf16 truncate: the same
; round-to-nearest-even + NaN-quiet select, done with SALU ops and
; s_cselect on the v_cmp_u result, then packed with s_pack_ll_b32_b16.
define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
; GCN-LABEL: v_test_cvt_v2f32_v2bf16_s:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_bfe_u32 s2, s1, 0x10010
; GCN-NEXT:    s_add_i32 s2, s2, s1
; GCN-NEXT:    s_or_b32 s4, s1, 0x400000
; GCN-NEXT:    s_add_i32 s5, s2, 0x7fff
; GCN-NEXT:    v_cmp_u_f32_e64 s[2:3], s1, s1
; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], exec
; GCN-NEXT:    s_cselect_b32 s1, s4, s5
; GCN-NEXT:    s_lshr_b32 s2, s1, 16
; GCN-NEXT:    s_bfe_u32 s1, s0, 0x10010
; GCN-NEXT:    s_add_i32 s1, s1, s0
; GCN-NEXT:    s_or_b32 s3, s0, 0x400000
; GCN-NEXT:    s_add_i32 s4, s1, 0x7fff
; GCN-NEXT:    v_cmp_u_f32_e64 s[0:1], s0, s0
; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; GCN-NEXT:    s_cselect_b32 s0, s3, s4
; GCN-NEXT:    s_lshr_b32 s0, s0, 16
; GCN-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x float> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}
; f32 -> bf16 -> f32 round trip: the truncate rounds/quiets into the high
; half, and the extend is folded into a mask of the low 16 bits.
define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
; GCN-LABEL: v_test_cvt_f32_bf16_v:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_bfe_u32 v1, v0, 16, 1
; GCN-NEXT:    s_movk_i32 s0, 0x7fff
; GCN-NEXT:    v_add3_u32 v1, v1, v0, s0
; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v0
; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT:    ; return to shader part epilog
  %trunc = fptrunc float %src to bfloat
  %ext = fpext bfloat %trunc to float
  ret float %ext
}
; <2 x f64> -> <2 x bf16>: each double goes through a round-to-odd f64->f32
; step (cvt of |x|, re-expand, compare, adjust the LSB) with the sign
; reapplied via v_and_or_b32, then the common f32->bf16 rounding sequence.
define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
; GCN-LABEL: v_test_cvt_v2f64_v2bf16_v:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
; GCN-NEXT:    v_and_b32_e32 v7, 1, v6
; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GCN-NEXT:    v_add_u32_e32 v4, v6, v4
; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
; GCN-NEXT:    s_brev_b32 s4, 1
; GCN-NEXT:    v_and_or_b32 v5, v1, s4, v4
; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GCN-NEXT:    s_movk_i32 s5, 0x7fff
; GCN-NEXT:    v_add3_u32 v4, v4, v5, s5
; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; GCN-NEXT:    v_cvt_f32_f64_e64 v5, |v[2:3]|
; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v5
; GCN-NEXT:    v_and_b32_e32 v6, 1, v5
; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1]
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
; GCN-NEXT:    v_add_u32_e32 v0, v5, v0
; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN-NEXT:    v_and_or_b32 v1, v3, s4, v0
; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 1
; GCN-NEXT:    v_add3_u32 v0, v0, v1, s5
; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
; GCN-NEXT:    s_mov_b32 s0, 0x7060302
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_perm_b32 v0, v0, v4, s0
; GCN-NEXT:    ; return to shader part epilog
  %res = fptrunc <2 x double> %src to <2 x bfloat>
  %cast = bitcast <2 x bfloat> %res to float
  ret float %cast
}
; Two scalar f32 -> bf16 truncates built into a <2 x bfloat> and returned
; as the packed 32-bit bit pattern.
define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
; GCN-LABEL: fptrunc_f32_f32_to_v2bf16:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    v_bfe_u32 v2, v0, 16, 1
; GCN-NEXT:    s_movk_i32 s0, 0x7fff
; GCN-NEXT:    v_add3_u32 v2, v2, v0, s0
; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v0
; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GCN-NEXT:    v_bfe_u32 v2, v1, 16, 1
; GCN-NEXT:    v_add3_u32 v2, v2, v1, s0
; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v1
; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
; GCN-NEXT:    s_mov_b32 s0, 0x7060302
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
; GCN-NEXT:    ; return to shader part epilog
entry:
  %a.cvt = fptrunc float %a to bfloat
  %b.cvt = fptrunc float %b to bfloat
  %v2.1 = insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0
  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
  %ret = bitcast <2 x bfloat> %v2.2 to float
  ret float %ret
}
; Same as fptrunc_f32_f32_to_v2bf16 but with fneg/fabs source modifiers:
; the neg/abs is materialized for the integer rounding path while the
; float compares use the -v0 / |v1| source modifiers directly.
define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
; GCN-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
; GCN-NEXT:    v_bfe_u32 v3, v2, 16, 1
; GCN-NEXT:    s_movk_i32 s0, 0x7fff
; GCN-NEXT:    v_add3_u32 v3, v3, v2, s0
; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v2
; GCN-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; GCN-NEXT:    v_bfe_u32 v3, v2, 16, 1
; GCN-NEXT:    v_add3_u32 v3, v3, v2, s0
; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v2
; GCN-NEXT:    v_cmp_u_f32_e64 vcc, |v1|, |v1|
; GCN-NEXT:    s_mov_b32 s0, 0x7060302
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
; GCN-NEXT:    ; return to shader part epilog
entry:
  %a.neg = fneg float %a
  %a.cvt = fptrunc float %a.neg to bfloat
  %b.abs = call float @llvm.fabs.f32(float %b)
  %b.cvt = fptrunc float %b.abs to bfloat
  %v2.1 = insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0
  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
  %ret = bitcast <2 x bfloat> %v2.2 to float
  ret float %ret
}
; f32 -> bf16 truncate stored to memory: the result lives in the high half
; of the VGPR, so a d16_hi flat store writes it without an extra shift.
define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
; GCN-LABEL: fptrunc_f32_to_bf16:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    v_mov_b32_e32 v3, v2
; GCN-NEXT:    v_mov_b32_e32 v2, v1
; GCN-NEXT:    v_bfe_u32 v1, v0, 16, 1
; GCN-NEXT:    s_movk_i32 s0, 0x7fff
; GCN-NEXT:    v_add3_u32 v1, v1, v0, s0
; GCN-NEXT:    v_or_b32_e32 v4, 0x400000, v0
; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GCN-NEXT:    s_endpgm
entry:
  %a.cvt = fptrunc float %a to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}
; fabs(f32) -> bf16 store: |v0| feeds the rounding path and the unordered
; compare uses the |v0| source modifier.
define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
; GCN-LABEL: fptrunc_f32_to_bf16_abs:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    v_mov_b32_e32 v3, v2
; GCN-NEXT:    v_mov_b32_e32 v2, v1
; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
; GCN-NEXT:    v_bfe_u32 v4, v1, 16, 1
; GCN-NEXT:    s_movk_i32 s0, 0x7fff
; GCN-NEXT:    v_add3_u32 v4, v4, v1, s0
; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GCN-NEXT:    v_cmp_u_f32_e64 vcc, |v0|, |v0|
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GCN-NEXT:    s_endpgm
entry:
  %a.abs = call float @llvm.fabs.f32(float %a)
  %a.cvt = fptrunc float %a.abs to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}
; fneg(f32) -> bf16 store: sign bit flipped with xor for the rounding path;
; the unordered compare uses the -v0 source modifier.
define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
; GCN-LABEL: fptrunc_f32_to_bf16_neg:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    v_mov_b32_e32 v3, v2
; GCN-NEXT:    v_mov_b32_e32 v2, v1
; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
; GCN-NEXT:    v_bfe_u32 v4, v1, 16, 1
; GCN-NEXT:    s_movk_i32 s0, 0x7fff
; GCN-NEXT:    v_add3_u32 v4, v4, v1, s0
; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
; GCN-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GCN-NEXT:    s_endpgm
entry:
  %a.neg = fneg float %a
  %a.cvt = fptrunc float %a.neg to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}
; f64 -> bf16 store: round-to-odd f64->f32 narrowing (so double rounding
; through f32 stays correct), sign reattached, then the f32->bf16 rounding
; sequence and a d16_hi store.
define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
; GCN-LABEL: fptrunc_f64_to_bf16:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
; GCN-NEXT:    v_and_b32_e32 v7, 1, v6
; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GCN-NEXT:    v_add_u32_e32 v4, v6, v4
; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
; GCN-NEXT:    s_brev_b32 s0, 1
; GCN-NEXT:    v_and_or_b32 v5, v1, s0, v4
; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GCN-NEXT:    s_movk_i32 s0, 0x7fff
; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GCN-NEXT:    s_endpgm
entry:
  %a.cvt = fptrunc double %a to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}
; fneg(f64) -> bf16 store: identical round-to-odd narrowing of |x|; only
; the sign word is xor'ed and the final unordered compare uses -v[0:1].
define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
; GCN-LABEL: fptrunc_f64_to_bf16_neg:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
; GCN-NEXT:    v_and_b32_e32 v8, 1, v7
; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GCN-NEXT:    v_add_u32_e32 v4, v7, v4
; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GCN-NEXT:    s_brev_b32 s4, 1
; GCN-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; GCN-NEXT:    v_and_or_b32 v5, v6, s4, v4
; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GCN-NEXT:    s_movk_i32 s0, 0x7fff
; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GCN-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GCN-NEXT:    s_endpgm
entry:
  %a.neg = fneg double %a
  %a.cvt = fptrunc double %a.neg to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}
; fabs(f64) -> bf16 store: round-to-odd narrowing of |x| with the sign word
; masked to 0x7fffffff; the unordered compare uses the |v[0:1]| modifier.
define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GCN-LABEL: fptrunc_f64_to_bf16_abs:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
; GCN-NEXT:    v_and_b32_e32 v8, 1, v7
; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
; GCN-NEXT:    v_add_u32_e32 v4, v7, v4
; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
; GCN-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; GCN-NEXT:    s_brev_b32 s0, 1
; GCN-NEXT:    v_and_or_b32 v5, v6, s0, v4
; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
; GCN-NEXT:    s_movk_i32 s0, 0x7fff
; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
; GCN-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
; GCN-NEXT:    s_endpgm
entry:
  %a.abs = call double @llvm.fabs.f64(double %a)
  %a.cvt = fptrunc double %a.abs to bfloat
  store bfloat %a.cvt, ptr %out
  ret void
}
358 declare float @llvm.fabs.f32(float)
359 declare double @llvm.fabs.f64(double)