1 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_40 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940,GFX90A_40 %s
; Declarations of the AMDGPU MFMA intrinsics called by the test kernels below.
; Each MFMA intrinsic takes two source operands, an accumulator vector whose
; type matches the result, and three trailing i32 operands. The kernels pass
; constants for those three, and the expected assembly shows them as the
; cbsz, abid and blgp modifiers.
6 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
7 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
8 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
9 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32)
10 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32)
11 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32)
12 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
13 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
14 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
15 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
16 declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32, i32, i32)
17 declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i32, i32)
18 declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
; NOTE(review): no call to the work-item id intrinsic appears in the visible
; part of this file; confirm it is used further down before keeping it.
19 declare i32 @llvm.amdgcn.workitem.id.x()
21 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32:
22 ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
23 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
24 ; GCN-DAG: s_load_dwordx16
25 ; GCN-DAG: s_load_dwordx16
26 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
27 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
28 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
29 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
30 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
31 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
32 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
33 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
34 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
35 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
36 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
37 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
38 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
39 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
40 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
41 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
42 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
43 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
44 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
45 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
46 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
47 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
48 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
49 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
50 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
51 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
52 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
53 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
54 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
55 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
56 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
57 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
58 ; GFX90A_40-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
59 ; GFX908_A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
60 ; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
61 ; GFX908-COUNT-4: v_accvgpr_read_b32
62 ; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
63 ; GFX908-COUNT-4: v_accvgpr_read_b32
64 ; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
65 ; GFX908-COUNT-4: v_accvgpr_read_b32
66 ; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
67 ; GFX908-COUNT-4: v_accvgpr_read_b32
68 ; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
69 ; GFX90A-NOT: v_accvgpr_read_b32
70 ; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
71 define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
73 %in.1 = load <32 x float>, ptr addrspace(1) %arg
74 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
75 store <32 x float> %mai.1, ptr addrspace(1) %arg
79 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32:
80 ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
81 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
82 ; GCN-DAG: s_load_dwordx16
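83 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}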
84 ; GFX90A_40-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
85 ; GFX908_A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
86 ; GFX940: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
87 ; GFX908-COUNT-16: v_accvgpr_read_b32
88 ; GFX908-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
89 ; GFX90A-NOT: v_accvgpr_read_b32
90 ; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
91 define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
93 %in.1 = load <16 x float>, ptr addrspace(1) %arg
94 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
95 store <16 x float> %mai.1, ptr addrspace(1) %arg
99 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32:
100 ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
101 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
102 ; GCN: s_load_dwordx4
103 ; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
104 ; GFX90A_40-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
105 ; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
106 ; GFX940: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
107 ; GFX908-COUNT-4: v_accvgpr_read_b32
108 ; GFX908: global_store_dwordx4
109 ; GFX90A-NOT: v_accvgpr_read_b32
110 ; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]]
111 define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
113 %in.1 = load <4 x float>, ptr addrspace(1) %arg
114 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
115 store <4 x float> %mai.1, ptr addrspace(1) %arg
119 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32:
120 ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
121 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
122 ; GCN-DAG: s_load_dwordx16
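123 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}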
124 ; GFX90A_40-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
125 ; GFX908_A: v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
126 ; GFX940: v_mfma_f32_32x32x2_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
127 ; GFX908-COUNT-16: v_accvgpr_read_b32
128 ; GFX908-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
129 ; GFX90A-NOT: v_accvgpr_read_b32
130 ; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
131 define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
133 %in.1 = load <16 x float>, ptr addrspace(1) %arg
134 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
135 store <16 x float> %mai.1, ptr addrspace(1) %arg
139 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32:
140 ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
141 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
142 ; GCN: s_load_dwordx4
143 ; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
144 ; GFX90A_40-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
145 ; GFX908_A: v_mfma_f32_16x16x4f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
146 ; GFX940: v_mfma_f32_16x16x4_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
147 ; GFX908-COUNT-4: v_accvgpr_read_b32
148 ; GFX908: global_store_dwordx4
149 ; GFX90A-NOT: v_accvgpr_read_b32
150 ; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
151 define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
153 %in.1 = load <4 x float>, ptr addrspace(1) %arg
154 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
155 store <4 x float> %mai.1, ptr addrspace(1) %arg
159 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16:
160 ; GCN-DAG: s_load_dwordx16
161 ; GCN-DAG: s_load_dwordx16
162 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
163 ; GFX90A_40-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
164 ; GFX908_A: v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
165 ; GFX940: v_mfma_f32_32x32x4_2b_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
166 ; GFX908-COUNT-32: v_accvgpr_read_b32
167 ; GFX908: global_store_dwordx4
168 ; GFX90A-NOT: v_accvgpr_read_b32
169 ; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
170 define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
172 %in.1 = load <32 x float>, ptr addrspace(1) %arg
173 %c.1 = load <4 x half>, ptr addrspace(1) %c
174 %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
175 %c.2 = load <4 x half>, ptr addrspace(1) %c2p
176 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x float> %in.1, i32 1, i32 2, i32 3)
177 store <32 x float> %mai.1, ptr addrspace(1) %arg
181 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16:
182 ; GCN: s_load_dwordx16
183 ; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
184 ; GFX90A_40-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
185 ; GFX908_A: v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
186 ; GFX940: v_mfma_f32_16x16x4_4b_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
187 ; GFX908-COUNT-16: v_accvgpr_read_b32
188 ; GFX908: global_store_dwordx4
189 ; GFX90A-NOT: v_accvgpr_read_b32
190 ; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
191 define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
193 %in.1 = load <16 x float>, ptr addrspace(1) %arg
194 %c.1 = load <4 x half>, ptr addrspace(1) %c
195 %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
196 %c.2 = load <4 x half>, ptr addrspace(1) %c2p
197 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3)
198 store <16 x float> %mai.1, ptr addrspace(1) %arg
202 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16:
203 ; GCN: s_load_dwordx4
204 ; GCN: s_load_dwordx4
205 ; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
206 ; GFX90A_40-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
207 ; GFX908_A: v_mfma_f32_4x4x4f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
208 ; GFX940: v_mfma_f32_4x4x4_16b_f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
209 ; GFX908-COUNT-4: v_accvgpr_read_b32
210 ; GFX908: global_store_dwordx4
211 ; GFX90A-NOT: v_accvgpr_read_b32
212 ; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
213 define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
215 %in.1 = load <4 x float>, ptr addrspace(1) %arg
216 %c.1 = load <4 x half>, ptr addrspace(1) %c
217 %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
218 %c.2 = load <4 x half>, ptr addrspace(1) %c2p
219 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3)
220 store <4 x float> %mai.1, ptr addrspace(1) %arg
224 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16:
225 ; GCN: s_load_dwordx16
226 ; GCN: s_waitcnt lgkmcnt(0)
227 ; GFX908_A: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
228 ; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
229 ; GFX90A_40-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
230 ; GFX908_A: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
231 ; GFX940: v_mfma_f32_32x32x8_f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
232 ; GFX908-COUNT-16: v_accvgpr_read_b32
233 ; GFX908: global_store_dwordx4
234 ; GFX90A-NOT: v_accvgpr_read_b32
235 ; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
236 define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
238 %in.1 = load <16 x float>, ptr addrspace(1) %arg
239 %c.1 = load <4 x half>, ptr addrspace(1) %c
240 %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
241 %c.2 = load <4 x half>, ptr addrspace(1) %c2p
242 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3)
243 store <16 x float> %mai.1, ptr addrspace(1) %arg
247 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16:
248 ; GCN: s_load_dwordx4
249 ; GCN: s_load_dwordx4
250 ; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
251 ; GFX90A_40-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
252 ; GFX908_A: v_mfma_f32_16x16x16f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
253 ; GFX940: v_mfma_f32_16x16x16_f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
254 ; GFX908-COUNT-4: v_accvgpr_read_b32
255 ; GFX908: global_store_dwordx4
256 ; GFX90A-NOT: v_accvgpr_read_b32
257 ; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
258 define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 {
260 %in.1 = load <4 x float>, ptr addrspace(1) %arg
261 %c.1 = load <4 x half>, ptr addrspace(1) %c
262 %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1
263 %c.2 = load <4 x half>, ptr addrspace(1) %c2p
264 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3)
265 store <4 x float> %mai.1, ptr addrspace(1) %arg
269 ; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8:
270 ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
271 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
272 ; GCN-DAG: s_load_dwordx16
273 ; GCN-DAG: s_load_dwordx16
274 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
275 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
276 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
277 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
278 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
279 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
280 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
281 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
282 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
283 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
284 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
285 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
286 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
287 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
288 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
289 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
290 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
291 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
292 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
293 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
294 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
295 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
296 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
297 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
298 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
299 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
300 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
301 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
302 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
303 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
304 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
305 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
306 ; GFX90A_40-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
307 ; GFX908_A: v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
308 ; GFX940: v_mfma_i32_32x32x4_2b_i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
309 ; GFX908-COUNT-32: v_accvgpr_read_b32
310 ; GFX908: global_store_dwordx4
311 ; GFX90A-NOT: v_accvgpr_read_b32
312 ; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
313 define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
315 %in.1 = load <32 x i32>, ptr addrspace(1) %arg
316 %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3)
317 store <32 x i32> %mai.1, ptr addrspace(1) %arg
321 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8:
322 ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
323 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
324 ; GCN-DAG: s_load_dwordx16
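325 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}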
326 ; GFX90A_40-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
327 ; GFX908_A: v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
328 ; GFX940: v_mfma_i32_16x16x4_4b_i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
329 ; GFX908-COUNT-16: v_accvgpr_read_b32
330 ; GFX908: global_store_dwordx4
331 ; GFX90A-NOT: v_accvgpr_read_b32
332 ; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
333 define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
335 %in.1 = load <16 x i32>, ptr addrspace(1) %arg
336 %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
337 store <16 x i32> %mai.1, ptr addrspace(1) %arg
341 ; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8:
342 ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
343 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
344 ; GCN: s_load_dwordx4
345 ; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
346 ; GFX90A_40-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
347 ; GFX908_A: v_mfma_i32_4x4x4i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
348 ; GFX940: v_mfma_i32_4x4x4_16b_i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
349 ; GFX908-COUNT-4: v_accvgpr_read_b32
350 ; GFX908: global_store_dwordx4
351 ; GFX90A-NOT: v_accvgpr_read_b32
352 ; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
353 define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
355 %in.1 = load <4 x i32>, ptr addrspace(1) %arg
356 %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
357 store <4 x i32> %mai.1, ptr addrspace(1) %arg
361 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc:
362 ; GFX908_A: v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
363 ; GFX908_A-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
364 ; GFX940: v_mfma_f32_32x32x1_2b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
365 ; GFX940-NEXT: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
366 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
368 %in.1 = load <32 x float>, ptr addrspace(1) %arg
369 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
370 %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
371 store <32 x float> %mai.2, ptr addrspace(1) %arg
375 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_forward_acc:
376 ; GFX908_A: v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
377 ; GFX908_A-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
378 ; GFX940: v_mfma_f32_16x16x1_4b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
379 ; GFX940-NEXT: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
380 define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
382 %in.1 = load <16 x float>, ptr addrspace(1) %arg
383 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
384 %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0)
385 store <16 x float> %mai.2, ptr addrspace(1) %arg
389 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_forward_acc:
390 ; GFX908_A: v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
391 ; GFX908_A-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
392 ; GFX940: v_mfma_f32_4x4x1_16b_f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
393 ; GFX940-NEXT: s_nop 1
394 ; GFX940-NEXT: v_mfma_f32_4x4x1_16b_f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
395 define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
397 %in.1 = load <4 x float>, ptr addrspace(1) %arg
398 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
399 %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
400 store <4 x float> %mai.2, ptr addrspace(1) %arg
404 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm_splat:
405 ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
406 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
407 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
408 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
409 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
410 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
411 ; NOLIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
412 ; LIT-SRCC: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0
413 ; GFX90A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0
414 ; GFX940: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0
415 ; GFX908-COUNT-4: v_accvgpr_read_b32
416 ; GFX908: global_store_dwordx4
417 ; GFX90A-NOT: v_accvgpr_read_b32
418 ; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
419 define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %arg) #0 {
421 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
422 store <4 x float> %mai.1, ptr addrspace(1) %arg
426 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm_splat:
427 ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
428 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
429 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
430 ; NOLIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
431 ; LIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
432 ; GFX90A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
433 ; GFX940: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
434 ; GFX908-COUNT-16: v_accvgpr_read_b32
435 ; GFX908: global_store_dwordx4
436 ; GFX90A-NOT: v_accvgpr_read_b32
437 ; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
438 define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %arg) #0 {
440 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
441 store <16 x float> %mai.1, ptr addrspace(1) %arg
445 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16_imm_splat:
446 ; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000
447 ; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00
448 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
449 ; NOLIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}]
450 ; LIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0
451 ; GFX90A: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0
452 ; GFX940: v_mfma_f32_32x32x8_f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0
453 ; GFX908-COUNT-16: v_accvgpr_read_b32
454 ; GFX908: global_store_dwordx4
455 ; GFX90A-NOT: v_accvgpr_read_b32
456 ; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
457 define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %arg) #0 {
459 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
460 store <16 x float> %mai.1, ptr addrspace(1) %arg
464 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat:
465 ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
466 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
467 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
468 ; NOLIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
469 ; LIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
470 ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
471 ; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
472 ; GFX908-COUNT-32: v_accvgpr_read_b32
473 ; GFX908: global_store_dwordx4
474 ; GFX90A-NOT: v_accvgpr_read_b32
475 ; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
476 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %arg) #0 {
478 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
479 store <32 x float> %mai.1, ptr addrspace(1) %arg
483 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm:
484 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
485 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0
486 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
487 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
488 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
489 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
490 ; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
491 ; GFX940: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
492 ; GFX908-COUNT-4: v_accvgpr_read_b32
493 ; GFX908: global_store_dwordx4
494 ; GFX90A-NOT: v_accvgpr_read_b32
495 ; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]],
496 define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 {
498 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
499 store <4 x float> %mai.1, ptr addrspace(1) %arg
503 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm:
504 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
505 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0
506 ; GFX908-COUNT-14: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
507 ; GFX90A-COUNT-14: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
508 ; GFX908_A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
509 ; GFX940: v_mfma_f32_16x16x1_4b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
510 ; GFX908-COUNT-16: v_accvgpr_read_b32
511 ; GFX908: global_store_dwordx4
512 ; GFX90A-NOT: v_accvgpr_read_b32
513 ; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
514 define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #0 {
516 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0)
517 store <16 x float> %mai.1, ptr addrspace(1) %arg
521 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm:
522 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
523 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
524 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
525 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
526 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
527 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
528 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
529 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
530 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
531 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
532 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
533 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
534 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
535 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
536 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
537 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
538 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
539 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
540 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
541 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
542 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
543 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
544 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
545 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
546 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
547 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
548 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
549 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
550 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
551 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
552 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
553 ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
554 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
555 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
556 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
557 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
558 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
559 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
560 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
561 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
562 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
563 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
564 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
565 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
566 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
567 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
568 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
569 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
570 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
571 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
572 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
573 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
574 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
575 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
576 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
577 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
578 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
579 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
580 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
581 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
582 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
583 ; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
584 ; GFX908_A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
585 ; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
586 ; GFX908-COUNT-32: v_accvgpr_read_b32
587 ; GFX908: global_store_dwordx4
588 ; GFX90A-NOT: v_accvgpr_read_b32
589 ; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}],
590 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #0 {
592 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
593 store <32 x float> %mai.1, ptr addrspace(1) %arg
; Test: 4x4x1 f32 MFMA whose accumulator is a splat of 123.0, which the checks
; expect to appear as the 32-bit literal 0x42f60000 rather than as an inline
; constant.
;  - gfx908 runs: the literal is first moved into a VGPR, then written to four
;    AGPRs from that same VGPR.
;  - gfx90a / gfx940 runs: the literal is first moved into an SGPR and written
;    to one AGPR; on gfx90a that AGPR is then copied three times.
;  - gfx908 runs expect 4 AGPR reads before the store; the gfx90a run expects
;    none and a store whose data operand is the MFMA result range itself.
597 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat:
598 ; GFX908: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
599 ; GFX90A_40: s_mov_b32 [[TMP:s[0-9]+]], 0x42f60000
600 ; GCN: v_accvgpr_write_b32 [[TTMPA:a[0-9]+]], [[TMP]]
601 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
602 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
603 ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
604 ; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
605 ; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
606 ; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]]
607 ; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
608 ; GFX940: v_mfma_f32_4x4x1_16b_f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
609 ; GFX908-COUNT-4: v_accvgpr_read_b32
610 ; GFX908: global_store_dwordx4
611 ; GFX90A-NOT: v_accvgpr_read_b32
612 ; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]]
; Kernel under test. NOTE(review): %idx is not referenced by any of the body
; lines shown here.
613 define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %arg, i64 %idx) #0 {
; Per-workitem destination: %arg indexed by the workitem id in x.
615 %tid = call i32 @llvm.amdgcn.workitem.id.x()
616 %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
617 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
; The disabled store below targets %arg directly; the live store uses the
; per-workitem address %gep instead.
618 ;store <4 x float> %mai.1, ptr addrspace(1) %arg
619 store <4 x float> %mai.1, ptr addrspace(1) %gep
; Test: same 123.0 (0x42f60000) splat accumulator as the lit_splat test, but
; the result is stored to the base pointer %arg instead of a per-workitem
; address.
;  - gfx908 runs: the literal goes through a VGPR; after the first AGPR write
;    the next three output lines must each be another AGPR write from that
;    VGPR (checked with NEXT directives, so nothing may be scheduled between).
;  - gfx90a / gfx940 runs: the literal goes through an SGPR into one AGPR,
;    which is then copied three times; the final store takes its data
;    directly from an AGPR range.
;  - gfx908 runs expect 4 AGPR reads and a store whose data comes from VGPRs.
623 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code:
624 ; GFX908: v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000
625 ; GFX90A_40:s_mov_b32 [[TMP0:s[0-9]+]], 0x42f60000
626 ; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[TMP0]]
627 ; GFX90A_40-COUNT-3: v_accvgpr_mov_b32 a{{[0-9]+}}, [[AGPR]]
628 ; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
629 ; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
630 ; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
632 ; GFX908_A: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
633 ; GFX940: v_mfma_f32_4x4x1_16b_f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
634 ; GFX908-COUNT-4: v_accvgpr_read_b32
635 ; GFX908: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}]
636 ; GFX90A_40: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
637 define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspace(1) %arg) #0 {
; %tid and %gep are computed but not used by the store below, which writes to
; %arg. NOTE(review): presumably intentional for this regression case - confirm
; before removing them.
639 %tid = call i32 @llvm.amdgcn.workitem.id.x()
640 %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
642 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
643 store <4 x float> %mai.1, ptr addrspace(1) %arg
; Test: 32x32x1 f32 MFMA whose accumulator is loaded from memory and whose
; three trailing modifier operands are non-zero (1, 2, 3). The checks expect
; those to be printed as "cbsz:1 abid:2 blgp:3" on every target.
;  - all targets: 8 dwordx4 loads bring in the 32-lane accumulator, and the
;    two scalar sources 1.0 and 2.0 are moved into VGPRs that feed the MFMA.
;  - gfx908 runs: 16 AGPR writes from VGPRs are expected after the loads, then
;    an AGPR read and 8 dwordx4 stores after the MFMA.
;  - gfx90a / gfx940 runs: no AGPR write between the loads and the MFMA, no
;    AGPR read after it, and 5 dwordx4 stores sourced directly from AGPR
;    ranges are required by the count below.
647 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg:
648 ; GFX90A_40-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
649 ; GFX90A_40-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
650 ; GCN-COUNT-8: global_load_dwordx4
651 ; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
652 ; GFX90A_40-NOT: v_accvgpr_write
653 ; GFX908-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
654 ; GFX908-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
655 ; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
656 ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
657 ; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
658 ; GFX908: v_accvgpr_read_b32
659 ; GFX908-COUNT-8: global_store_dwordx4
660 ; GFX90A_40-NOT: v_accvgpr_read_b32
661 ; GFX90A_40-COUNT-5: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
662 define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg) #0 {
; Each workitem loads its own <32 x float> slot of %arg, uses it as the MFMA
; accumulator, and writes the result back to the same slot.
664 %tid = call i32 @llvm.amdgcn.workitem.id.x()
665 %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
666 %in.1 = load <32 x float>, ptr addrspace(1) %gep
667 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
668 store <32 x float> %mai.1, ptr addrspace(1) %gep
; Attribute group #0, referenced by the kernels in this file: sets the
; "amdgpu-flat-work-group-size" string attribute to "1,256".
672 attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }