1 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
; Round-trip through an accumulator register using inline asm:
;   - the first asm defines its result with the "a" constraint and writes the
;     immediate 1 into it;
;   - the second asm takes that value as an "a" input and produces a "v" result;
;   - the "v" result is stored back through %arg.
; The checks tie the register written by the first asm to the one read by the
; second, and the register read into to the one that is stored.
; Note: %in.1 is loaded but never used by the rest of the function.

; GCN-LABEL: {{^}}accvgpr_write_read:
; GFX908: v_accvgpr_write [[AREG:a[0-9]+]], 1
; GFX908: v_accvgpr_read [[VREG:v[0-9]+]], [[AREG]]
; GFX908: global_store_dword v{{[0-9]+}}, [[VREG]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @accvgpr_write_read(float addrspace(1)* %arg) {
bb:
  %in.1 = load float, float addrspace(1)* %arg
  %init = tail call float asm "v_accvgpr_write $0, 1", "=a"()
  %read = tail call float asm "v_accvgpr_read $0, $1", "=v,a"(float %init)
  store float %read, float addrspace(1)* %arg
  ret void
}
; MFMA via inline asm with constraints "=a,v,v,a":
;   $0 (result, <4 x float>)       -> "a"
;   $1, $2 (float 1.0, float 2.0)  -> "v"
;   $3 (accumulator, <4 x float>)  -> "a"
; The checks require four v_accvgpr_write_b32 before the MFMA, an MFMA whose
; two middle operands are single v registers, and four v_accvgpr_read_b32
; after it.

; GCN-LABEL: {{^}}v_mfma_f32_4x4x1f32_avva:
; GFX908-COUNT-4: v_accvgpr_write_b32
; GFX908: v_mfma_f32_4x4x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908-COUNT-4: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_4x4x1f32_avva(<4 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,v,v,a"(float 1.0, float 2.0, <4 x float> %in.1)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}
; Same MFMA as the "avva" variant, but every operand uses the "a" constraint
; ("=a,a,a,a"), so the two scalar float sources are also expected in
; a registers.
; The checks require four v_accvgpr_write_b32 before the MFMA, an MFMA whose
; two middle operands are single a registers, and four v_accvgpr_read_b32
; after it.

; GCN-LABEL: {{^}}v_mfma_f32_4x4x1f32_aaaa:
; GFX908-COUNT-4: v_accvgpr_write_b32
; GFX908: v_mfma_f32_4x4x1f32 a[{{[0-9:]+}}], a{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908-COUNT-4: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_4x4x1f32_aaaa(<4 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,a,a,a"(float 1.0, float 2.0, <4 x float> %in.1)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}
; f16 MFMA via inline asm with all operands constrained to "a".
; The two sources are constant <4 x half> vectors and the accumulator/result
; is <4 x float>.
; The checks require eight v_accvgpr_write_b32 before the MFMA, an MFMA whose
; two middle operands are a register ranges (a[N:M]), and four
; v_accvgpr_read_b32 after it.

; GCN-LABEL: {{^}}v_mfma_f32_4x4x4f16_aaaa:
; GFX908-COUNT-8: v_accvgpr_write_b32
; GFX908: v_mfma_f32_4x4x4f16 a[{{[0-9:]+}}], a[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}], a[{{[0-9:]+}}]
; GFX908-COUNT-4: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_4x4x4f16_aaaa(<4 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x4f16 $0, $1, $2, $3", "=a,a,a,a"(<4 x half> <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>, <4 x half> <half 0xH03FF, half 0xH03FF, half 0xH03FF, half 0xH03FF>, <4 x float> %in.1)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}
; 16x16x1 MFMA via inline asm with constraints "=a,v,a,a":
;   $0 (result, <16 x float>)       -> "a"
;   $1 (float 1.0)                  -> "v"
;   $2 (float 2.0)                  -> "a"
;   $3 (accumulator, <16 x float>)  -> "a"
; The checks require seventeen v_accvgpr_write_b32 before the MFMA (sixteen
; accumulator elements plus the one scalar "a" source) and sixteen
; v_accvgpr_read_b32 after it.

; GCN-LABEL: {{^}}v_mfma_f32_16x16x1f32_avaa:
; GFX908-COUNT-17: v_accvgpr_write_b32
; GFX908: v_mfma_f32_16x16x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908-COUNT-16: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_16x16x1f32_avaa(<16 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  %mai.1 = tail call <16 x float> asm "v_mfma_f32_16x16x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <16 x float> %in.1)
  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
  ret void
}
; 32x32x1 MFMA via inline asm with constraints "=a,v,a,a":
;   $0 (result, <32 x i32>)       -> "a"
;   $1 (float 1.0)                -> "v"
;   $2 (float 2.0)                -> "a"
;   $3 (accumulator, <32 x i32>)  -> "a"
; Unlike the other tests, the accumulator and result are typed <32 x i32>
; rather than a float vector.
; The checks require thirty-three v_accvgpr_write_b32 before the MFMA
; (thirty-two accumulator elements plus the one scalar "a" source) and
; thirty-two v_accvgpr_read_b32 after it.

; GCN-LABEL: {{^}}v_mfma_f32_32x32x1f32_avaa:
; GFX908-COUNT-33: v_accvgpr_write_b32
; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908-COUNT-32: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_32x32x1f32_avaa(<32 x i32> addrspace(1)* %arg) {
bb:
  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
  %mai.1 = tail call <32 x i32> asm "v_mfma_f32_32x32x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <32 x i32> %in.1)
  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
  ret void
}