1 ; RUN: llc -march=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940,VGPRCD %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx942 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,VGPRCD %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx942 -stress-regalloc=10 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940,AGPRCD %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s
6 declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32)
7 declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
8 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
9 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
10 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64, i64, <4 x float>, i32, i32, i32)
11 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64, i64, <4 x float>, i32, i32, i32)
12 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64, i64, <4 x float>, i32, i32, i32)
13 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64, i64, <4 x float>, i32, i32, i32)
14 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64, i64, <16 x float>, i32, i32, i32)
15 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64, i64, <16 x float>, i32, i32, i32)
16 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64, i64, <16 x float>, i32, i32, i32)
17 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32, i32, i32)
18 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half>, <8 x half>, <4 x float>, i32, i32, i32)
19 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half>, <8 x half>, <16 x float>, i32, i32, i32)
20 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16>, <8 x i16>, <4 x float>, i32, i32, i32)
21 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16>, <8 x i16>, <16 x float>, i32, i32, i32)
22 declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32>, <4 x i32>, <4 x i32>, i32, i32, i32)
23 declare <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32>, <4 x i32>, <16 x i32>, i32, i32, i32)
24 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
25 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
26 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
27 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32>, <4 x i32>, <4 x float>, i32, i32, i32)
28 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
29 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
30 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
31 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
33 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8:
34 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
35 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
36 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
37 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
38 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
39 ; GFX940: v_mfma_i32_16x16x32_i8 a[{{[0-9]+:[0-9]+}}], v[[[TWO]]:[[ONE]]], v[[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
40 ; GISEL: v_mfma_i32_16x16x32_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
41 ; GCN-NOT: v_accvgpr_read_b32
42 ; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
43 define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
45 %in.1 = load <4 x i32>, ptr addrspace(1) %arg
46 %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 1, i32 2, i32 3)
47 store <4 x i32> %mai.1, ptr addrspace(1) %arg
51 ; GCN-LABEL: {{^}}test_mfma_i32_32x32x16i8:
52 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
53 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
54 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
55 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
56 ; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
57 ; GFX940: v_mfma_i32_32x32x16_i8 a[{{[0-9]+:[0-9]+}}], v[[[TWO]]:[[ONE]]], v[[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
58 ; GISEL: v_mfma_i32_32x32x16_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
59 ; GCN-NOT: v_accvgpr_read_b32
60 ; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
61 define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
63 %in.1 = load <16 x i32>, ptr addrspace(1) %arg
64 %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 1, i32 2, i32 3)
65 store <16 x i32> %mai.1, ptr addrspace(1) %arg
69 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32:
70 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
71 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
72 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
73 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
74 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
75 ; GFX940: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
76 ; GISEL: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
77 ; GCN-NOT: v_accvgpr_read_b32
78 ; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
79 define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
81 %in.1 = load <4 x float>, ptr addrspace(1) %arg
82 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
83 store <4 x float> %mai.1, ptr addrspace(1) %arg
87 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32:
88 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
89 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
90 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
91 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
92 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
93 ; GFX940: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
94 ; GISEL: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
95 ; GCN-NOT: v_accvgpr_read_b32
96 ; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
97 define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
99 %in.1 = load <16 x float>, ptr addrspace(1) %arg
100 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
101 store <16 x float> %mai.1, ptr addrspace(1) %arg
105 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_bf8:
106 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
107 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
108 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
109 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
110 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
111 ; GFX940: v_mfma_f32_16x16x32_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
112 ; GISEL: v_mfma_f32_16x16x32_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
113 ; GCN-NOT: v_accvgpr_read_b32
114 ; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
115 define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 {
117 %in.1 = load <4 x float>, ptr addrspace(1) %arg
118 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
119 store <4 x float> %mai.1, ptr addrspace(1) %arg
123 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_fp8:
124 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
125 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
126 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
127 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
128 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
129 ; GFX940: v_mfma_f32_16x16x32_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
130 ; GISEL: v_mfma_f32_16x16x32_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
131 ; GCN-NOT: v_accvgpr_read_b32
132 ; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
133 define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 {
135 %in.1 = load <4 x float>, ptr addrspace(1) %arg
136 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
137 store <4 x float> %mai.1, ptr addrspace(1) %arg
141 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_bf8:
142 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
143 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
144 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
145 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
146 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
147 ; GFX940: v_mfma_f32_16x16x32_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
148 ; GISEL: v_mfma_f32_16x16x32_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
149 ; GCN-NOT: v_accvgpr_read_b32
150 ; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
151 define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 {
153 %in.1 = load <4 x float>, ptr addrspace(1) %arg
154 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
155 store <4 x float> %mai.1, ptr addrspace(1) %arg
159 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_fp8:
160 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
161 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
162 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
163 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
164 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
165 ; GFX940: v_mfma_f32_16x16x32_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
166 ; GISEL: v_mfma_f32_16x16x32_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
167 ; GCN-NOT: v_accvgpr_read_b32
168 ; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
169 define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 {
171 %in.1 = load <4 x float>, ptr addrspace(1) %arg
172 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
173 store <4 x float> %mai.1, ptr addrspace(1) %arg
177 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_bf8:
178 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
179 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
180 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
181 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
182 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
183 ; GFX940: v_mfma_f32_32x32x16_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
184 ; GISEL: v_mfma_f32_32x32x16_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
185 ; GCN-NOT: v_accvgpr_read_b32
186 ; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
187 define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) #0 {
189 %in.1 = load <16 x float>, ptr addrspace(1) %arg
190 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
191 store <16 x float> %mai.1, ptr addrspace(1) %arg
195 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_fp8:
196 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
197 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
198 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
199 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
200 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
201 ; GFX940: v_mfma_f32_32x32x16_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
202 ; GISEL: v_mfma_f32_32x32x16_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
203 ; GCN-NOT: v_accvgpr_read_b32
204 ; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
205 define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) #0 {
207 %in.1 = load <16 x float>, ptr addrspace(1) %arg
208 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
209 store <16 x float> %mai.1, ptr addrspace(1) %arg
213 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_bf8:
214 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
215 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
216 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
217 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
218 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
219 ; GFX940: v_mfma_f32_32x32x16_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
220 ; GISEL: v_mfma_f32_32x32x16_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
221 ; GCN-NOT: v_accvgpr_read_b32
222 ; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
223 define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) #0 {
225 %in.1 = load <16 x float>, ptr addrspace(1) %arg
226 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
227 store <16 x float> %mai.1, ptr addrspace(1) %arg
231 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_fp8:
232 ; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
233 ; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
234 ; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
235 ; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
236 ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
237 ; GFX940: v_mfma_f32_32x32x16_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
238 ; GISEL: v_mfma_f32_32x32x16_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
239 ; GCN-NOT: v_accvgpr_read_b32
240 ; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
241 define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) #0 {
243 %in.1 = load <16 x float>, ptr addrspace(1) %arg
244 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
245 store <16 x float> %mai.1, ptr addrspace(1) %arg
249 ; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_f16:
250 ; GCN: s_load_dwordx4 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
251 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
252 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
253 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
254 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
255 ; GCN: v_smfmac_f32_16x16x32_f16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
256 ; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
257 define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
259 %in.1 = load <4 x float>, ptr addrspace(1) %arg
260 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
261 store <4 x float> %mai.1, ptr addrspace(1) %arg
265 ; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_f16:
266 ; GCN: s_load_dwordx16 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
267 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
268 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
269 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
270 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
271 ; GCN: v_smfmac_f32_32x32x16_f16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
272 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
273 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
274 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
275 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
276 define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
278 %in.1 = load <16 x float>, ptr addrspace(1) %arg
279 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
280 store <16 x float> %mai.1, ptr addrspace(1) %arg
284 ; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_bf16:
285 ; GCN: s_load_dwordx4 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
286 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
287 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
288 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
289 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
290 ; GCN: v_smfmac_f32_16x16x32_bf16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
291 ; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
292 define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
294 %in.1 = load <4 x float>, ptr addrspace(1) %arg
295 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
296 store <4 x float> %mai.1, ptr addrspace(1) %arg
300 ; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_bf16:
301 ; GCN: s_load_dwordx16 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
302 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
303 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
304 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
305 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
306 ; GCN: v_smfmac_f32_32x32x16_bf16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
307 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
308 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
309 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
310 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
311 define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
313 %in.1 = load <16 x float>, ptr addrspace(1) %arg
314 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
315 store <16 x float> %mai.1, ptr addrspace(1) %arg
319 ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_i8:
320 ; GCN: s_load_dwordx4 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
321 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
322 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
323 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
324 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
325 ; GCN: v_smfmac_i32_16x16x64_i8 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
326 ; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
327 define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
329 %in.1 = load <4 x i32>, ptr addrspace(1) %arg
330 %mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 1, i32 2)
331 store <4 x i32> %mai.1, ptr addrspace(1) %arg
335 ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_i8:
336 ; GCN: s_load_dwordx16 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
337 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
338 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
339 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
340 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
341 ; GCN: v_smfmac_i32_32x32x32_i8 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
342 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
343 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
344 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
345 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
346 define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
348 %in.1 = load <16 x i32>, ptr addrspace(1) %arg
349 %mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 1, i32 2)
350 store <16 x i32> %mai.1, ptr addrspace(1) %arg
354 ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_bf8:
355 ; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
356 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
357 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
358 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
359 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
360 ; GCN: v_smfmac_f32_16x16x64_bf8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
361 ; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
362 define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
364 %in.1 = load <4 x float>, ptr addrspace(1) %arg
365 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
366 store <4 x float> %mai.1, ptr addrspace(1) %arg
370 ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_fp8:
371 ; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
372 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
373 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
374 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
375 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
376 ; GCN: v_smfmac_f32_16x16x64_bf8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
377 ; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
378 define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
380 %in.1 = load <4 x float>, ptr addrspace(1) %arg
381 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
382 store <4 x float> %mai.1, ptr addrspace(1) %arg
386 ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_bf8:
387 ; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
388 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
389 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
390 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
391 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
392 ; GCN: v_smfmac_f32_16x16x64_fp8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
393 ; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
394 define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
396 %in.1 = load <4 x float>, ptr addrspace(1) %arg
397 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
398 store <4 x float> %mai.1, ptr addrspace(1) %arg
402 ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_fp8:
403 ; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
404 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
405 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
406 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
407 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
408 ; GCN: v_smfmac_f32_16x16x64_fp8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
409 ; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
410 define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
412 %in.1 = load <4 x float>, ptr addrspace(1) %arg
413 %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
414 store <4 x float> %mai.1, ptr addrspace(1) %arg
418 ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_bf8:
419 ; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
420 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
421 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
422 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
423 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
424 ; GCN: v_smfmac_f32_32x32x32_bf8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
425 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
426 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
427 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
428 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
429 define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
431 %in.1 = load <16 x float>, ptr addrspace(1) %arg
432 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
433 store <16 x float> %mai.1, ptr addrspace(1) %arg
437 ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_fp8:
438 ; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
439 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
440 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
441 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
442 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
443 ; GCN: v_smfmac_f32_32x32x32_bf8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
444 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
445 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
446 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
447 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
448 define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
450 %in.1 = load <16 x float>, ptr addrspace(1) %arg
451 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
452 store <16 x float> %mai.1, ptr addrspace(1) %arg
456 ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_bf8:
457 ; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
458 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
459 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
460 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
461 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
462 ; GCN: v_smfmac_f32_32x32x32_fp8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
463 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
464 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
465 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
466 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
467 define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
469 %in.1 = load <16 x float>, ptr addrspace(1) %arg
470 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
471 store <16 x float> %mai.1, ptr addrspace(1) %arg
475 ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_fp8:
476 ; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
477 ; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
478 ; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
479 ; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
480 ; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
481 ; GCN: v_smfmac_f32_32x32x32_fp8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
482 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
483 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
484 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
485 ; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
486 define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
488 %in.1 = load <16 x float>, ptr addrspace(1) %arg
489 %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
490 store <16 x float> %mai.1, ptr addrspace(1) %arg
494 attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }