1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
3 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950
5 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL
; Intrinsic under test. Per this declaration it takes two <2 x bfloat> vectors
; (%a, %b), a float (%c) and an i1 flag (%clamp), and returns a float.
; NOTE(review): presumably a 2-element bf16 dot product added to %c, going by
; the name -- the exact semantics are not shown in this file; confirm.
7 declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp)
; Kernel test: calls @llvm.amdgcn.fdot2.f32.bf16 with the clamp operand set to
; the constant i1 1. The IR body loads two <2 x bfloat> values and one float
; from addrspace(1) pointers, passes them to the intrinsic, and stores the
; float result through %r.
; All three check prefixes (GFX11, GFX950, GFX950-ISEL) expect a
; v_dot2_f32_bf16 instruction carrying the "clamp" modifier.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script instead of editing by hand.
; NOTE(review): %r, %a and %b are used below but not declared on any line
; visible here; presumably they are kernel parameters -- confirm.
9 define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
10 ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
11 ; GFX11: ; %bb.0: ; %entry
12 ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
15 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
16 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
17 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
18 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
19 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
20 ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
21 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
22 ; GFX11-NEXT: s_endpgm
24 ; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
25 ; GFX950: ; %bb.0: ; %entry
26 ; GFX950-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
27 ; GFX950-NEXT: v_mov_b32_e32 v0, 0
28 ; GFX950-NEXT: s_waitcnt lgkmcnt(0)
29 ; GFX950-NEXT: s_load_dword s0, s[12:13], 0x0
30 ; GFX950-NEXT: s_load_dword s1, s[14:15], 0x0
31 ; GFX950-NEXT: s_load_dword s2, s[10:11], 0x0
32 ; GFX950-NEXT: s_waitcnt lgkmcnt(0)
33 ; GFX950-NEXT: v_mov_b32_e32 v1, s0
34 ; GFX950-NEXT: v_mov_b32_e32 v2, s1
35 ; GFX950-NEXT: v_dot2_f32_bf16 v1, s2, v1, v2 clamp
36 ; GFX950-NEXT: s_nop 2
37 ; GFX950-NEXT: global_store_dword v0, v1, s[8:9]
38 ; GFX950-NEXT: s_endpgm
40 ; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
41 ; GFX950-ISEL: ; %bb.0: ; %entry
42 ; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
43 ; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
44 ; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0
45 ; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0
46 ; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0
47 ; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
48 ; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s0
49 ; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s1
50 ; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v0, s2, v0, v1 clamp
51 ; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, 0
52 ; GFX950-ISEL-NEXT: s_nop 1
53 ; GFX950-ISEL-NEXT: global_store_dword v1, v0, s[8:9]
54 ; GFX950-ISEL-NEXT: s_endpgm
58 ptr addrspace(1) %c) {
60 %a.val = load <2 x bfloat>, ptr addrspace(1) %a
61 %b.val = load <2 x bfloat>, ptr addrspace(1) %b
62 %c.val = load float, ptr addrspace(1) %c
63 %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a.val, <2 x bfloat> %b.val, float %c.val, i1 1)
64 store float %r.val, ptr addrspace(1) %r
; Kernel test: calls @llvm.amdgcn.fdot2.f32.bf16 with the clamp operand set to
; the constant i1 0. The IR body loads two <2 x bfloat> values and one float
; from addrspace(1) pointers, passes them to the intrinsic, and stores the
; float result through %r.
; Expected selection differs by prefix: GFX11 checks v_dot2_f32_bf16 with no
; "clamp" modifier, while GFX950 and GFX950-ISEL check the two-source
; v_dot2c_f32_bf16_e32 form.
; NOTE(review): in the dot2c form the destination register presumably doubles
; as the accumulator input -- confirm against the ISA documentation.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script instead of editing by hand.
; NOTE(review): %r, %a and %b are used below but not declared on any line
; visible here; presumably they are kernel parameters -- confirm.
69 define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
70 ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
71 ; GFX11: ; %bb.0: ; %entry
72 ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
73 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
74 ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
75 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
76 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
77 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
78 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
79 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
80 ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
81 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
82 ; GFX11-NEXT: s_endpgm
84 ; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
85 ; GFX950: ; %bb.0: ; %entry
86 ; GFX950-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
87 ; GFX950-NEXT: v_mov_b32_e32 v0, 0
88 ; GFX950-NEXT: s_waitcnt lgkmcnt(0)
89 ; GFX950-NEXT: s_load_dword s0, s[12:13], 0x0
90 ; GFX950-NEXT: s_load_dword s1, s[14:15], 0x0
91 ; GFX950-NEXT: s_load_dword s2, s[10:11], 0x0
92 ; GFX950-NEXT: s_waitcnt lgkmcnt(0)
93 ; GFX950-NEXT: v_mov_b32_e32 v1, s0
94 ; GFX950-NEXT: v_mov_b32_e32 v2, s1
95 ; GFX950-NEXT: v_dot2c_f32_bf16_e32 v2, s2, v1
96 ; GFX950-NEXT: s_nop 2
97 ; GFX950-NEXT: global_store_dword v0, v2, s[8:9]
98 ; GFX950-NEXT: s_endpgm
100 ; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
101 ; GFX950-ISEL: ; %bb.0: ; %entry
102 ; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
103 ; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
104 ; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0
105 ; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0
106 ; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0
107 ; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
108 ; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s0
109 ; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s1
110 ; GFX950-ISEL-NEXT: v_dot2c_f32_bf16_e32 v1, s2, v0
111 ; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, 0
112 ; GFX950-ISEL-NEXT: s_nop 1
113 ; GFX950-ISEL-NEXT: global_store_dword v0, v1, s[8:9]
114 ; GFX950-ISEL-NEXT: s_endpgm
118 ptr addrspace(1) %c) {
120 %a.val = load <2 x bfloat>, ptr addrspace(1) %a
121 %b.val = load <2 x bfloat>, ptr addrspace(1) %b
122 %c.val = load float, ptr addrspace(1) %c
123 %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a.val, <2 x bfloat> %b.val, float %c.val, i1 0)
124 store float %r.val, ptr addrspace(1) %r