1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN %s
; Declarations of the intrinsics used by the kernels in this file.
; Every MFMA intrinsic below takes two multiplicand operands, an accumulator
; whose type matches the return type, and three trailing i32 operands.
; llvm.amdgcn.workitem.id.x is declared as well; no call to it is visible in
; the kernels reviewed here (TODO confirm whether it is still needed).
4 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
5 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
6 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
7 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
8 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
9 declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
10 declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
11 declare i32 @llvm.amdgcn.workitem.id.x()
; Kernel exercising llvm.amdgcn.mfma.f32.32x32x4bf16.1k.
; The <32 x float> accumulator is loaded from %arg, the two multiplicands are
; the constants i64 1 and i64 2 bitcast to <4 x i16>, and the result is stored
; back to %arg.
; The check lines expect the accumulator to be fetched with two scalar
; dwordx16 loads, copied into a[0:31] with v_accvgpr_write_b32, and the result
; stored straight from the AGPRs in eight dwordx4 stores.
; The trailing immediates (1, 2, 3) are expected to print as
; cbsz:1 abid:2 blgp:3 on the selected instruction.
13 define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 {
14 ; GCN-LABEL: test_mfma_f32_32x32x4bf16_1k:
16 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24
17 ; GCN-NEXT: s_mov_b64 s[36:37], 1
18 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
19 ; GCN-NEXT: s_mov_b32 s36, 2
20 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1]
21 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
22 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
23 ; GCN-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
24 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
25 ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
26 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
27 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
28 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
29 ; GCN-NEXT: v_accvgpr_write_b32 a4, s4
30 ; GCN-NEXT: v_accvgpr_write_b32 a5, s5
31 ; GCN-NEXT: v_accvgpr_write_b32 a6, s6
32 ; GCN-NEXT: v_accvgpr_write_b32 a7, s7
33 ; GCN-NEXT: v_accvgpr_write_b32 a8, s8
34 ; GCN-NEXT: v_accvgpr_write_b32 a9, s9
35 ; GCN-NEXT: v_accvgpr_write_b32 a10, s10
36 ; GCN-NEXT: v_accvgpr_write_b32 a11, s11
37 ; GCN-NEXT: v_accvgpr_write_b32 a12, s12
38 ; GCN-NEXT: v_accvgpr_write_b32 a13, s13
39 ; GCN-NEXT: v_accvgpr_write_b32 a14, s14
40 ; GCN-NEXT: v_accvgpr_write_b32 a15, s15
41 ; GCN-NEXT: v_accvgpr_write_b32 a16, s16
42 ; GCN-NEXT: v_accvgpr_write_b32 a17, s17
43 ; GCN-NEXT: v_accvgpr_write_b32 a18, s18
44 ; GCN-NEXT: v_accvgpr_write_b32 a19, s19
45 ; GCN-NEXT: v_accvgpr_write_b32 a20, s20
46 ; GCN-NEXT: v_accvgpr_write_b32 a21, s21
47 ; GCN-NEXT: v_accvgpr_write_b32 a22, s22
48 ; GCN-NEXT: v_accvgpr_write_b32 a23, s23
49 ; GCN-NEXT: v_accvgpr_write_b32 a24, s24
50 ; GCN-NEXT: v_accvgpr_write_b32 a25, s25
51 ; GCN-NEXT: v_accvgpr_write_b32 a26, s26
52 ; GCN-NEXT: v_accvgpr_write_b32 a27, s27
53 ; GCN-NEXT: v_accvgpr_write_b32 a28, s28
54 ; GCN-NEXT: v_accvgpr_write_b32 a29, s29
55 ; GCN-NEXT: v_accvgpr_write_b32 a30, s30
56 ; GCN-NEXT: v_accvgpr_write_b32 a31, s31
58 ; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
59 ; GCN-NEXT: v_mov_b32_e32 v0, 0
63 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
64 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
65 ; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
66 ; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
67 ; GCN-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
68 ; GCN-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
69 ; GCN-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
70 ; GCN-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
73 %in.1 = load <32 x float>, ptr addrspace(1) %arg
74 %a = bitcast i64 1 to <4 x i16>
75 %b = bitcast i64 2 to <4 x i16>
76 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
77 store <32 x float> %mai.1, ptr addrspace(1) %arg
; Kernel exercising llvm.amdgcn.mfma.f32.16x16x4bf16.1k.
; The <16 x float> accumulator is loaded from %arg, the two multiplicands are
; the constants i64 1 and i64 2 bitcast to <4 x i16>, and the result is stored
; back to %arg.
; The check lines expect one scalar dwordx16 load, sixteen v_accvgpr_write_b32
; copies into a[0:15], and four dwordx4 stores directly from the AGPRs.
; The trailing immediates (1, 2, 3) are expected to print as
; cbsz:1 abid:2 blgp:3 on the selected instruction.
81 define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 {
82 ; GCN-LABEL: test_mfma_f32_16x16x4bf16_1k:
84 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
85 ; GCN-NEXT: s_mov_b64 s[18:19], 1
86 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
87 ; GCN-NEXT: s_mov_b32 s18, 2
88 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
89 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
90 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
91 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
92 ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
93 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
94 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
95 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
96 ; GCN-NEXT: v_accvgpr_write_b32 a4, s4
97 ; GCN-NEXT: v_accvgpr_write_b32 a5, s5
98 ; GCN-NEXT: v_accvgpr_write_b32 a6, s6
99 ; GCN-NEXT: v_accvgpr_write_b32 a7, s7
100 ; GCN-NEXT: v_accvgpr_write_b32 a8, s8
101 ; GCN-NEXT: v_accvgpr_write_b32 a9, s9
102 ; GCN-NEXT: v_accvgpr_write_b32 a10, s10
103 ; GCN-NEXT: v_accvgpr_write_b32 a11, s11
104 ; GCN-NEXT: v_accvgpr_write_b32 a12, s12
105 ; GCN-NEXT: v_accvgpr_write_b32 a13, s13
106 ; GCN-NEXT: v_accvgpr_write_b32 a14, s14
107 ; GCN-NEXT: v_accvgpr_write_b32 a15, s15
109 ; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
110 ; GCN-NEXT: v_mov_b32_e32 v0, 0
113 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
114 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
115 ; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
116 ; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
119 %in.1 = load <16 x float>, ptr addrspace(1) %arg
120 %a = bitcast i64 1 to <4 x i16>
121 %b = bitcast i64 2 to <4 x i16>
122 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
123 store <16 x float> %mai.1, ptr addrspace(1) %arg
; Kernel exercising llvm.amdgcn.mfma.f32.4x4x4bf16.1k.
; The <4 x float> accumulator is loaded from %arg, the two multiplicands are
; the constants i64 1 and i64 2 bitcast to <4 x i16>, and the result is stored
; back to %arg.
; The check lines expect one scalar dwordx4 load, four v_accvgpr_write_b32
; copies into a[0:3], and a single dwordx4 store directly from the AGPRs.
; The trailing immediates (1, 2, 3) are expected to print as
; cbsz:1 abid:2 blgp:3 on the selected instruction.
127 define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 {
128 ; GCN-LABEL: test_mfma_f32_4x4x4bf16_1k:
129 ; GCN: ; %bb.0: ; %bb
130 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
131 ; GCN-NEXT: s_mov_b64 s[6:7], 1
132 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
133 ; GCN-NEXT: s_mov_b32 s6, 2
134 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
135 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
136 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
137 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
138 ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
139 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
140 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
141 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
143 ; GCN-NEXT: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
144 ; GCN-NEXT: v_mov_b32_e32 v0, 0
146 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
149 %in.1 = load <4 x float>, ptr addrspace(1) %arg
150 %a = bitcast i64 1 to <4 x i16>
151 %b = bitcast i64 2 to <4 x i16>
152 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
153 store <4 x float> %mai.1, ptr addrspace(1) %arg
; Kernel exercising llvm.amdgcn.mfma.f32.32x32x8bf16.1k.
; The <16 x float> accumulator is loaded from %arg, the two multiplicands are
; the constants i64 1 and i64 2 bitcast to <4 x i16>, and the result is stored
; back to %arg.
; The check lines expect one scalar dwordx16 load, sixteen v_accvgpr_write_b32
; copies into a[0:15], and four dwordx4 stores directly from the AGPRs.
; The trailing immediates (1, 2, 3) are expected to print as
; cbsz:1 abid:2 blgp:3 on the selected instruction.
157 define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 {
158 ; GCN-LABEL: test_mfma_f32_32x32x8bf16_1k:
159 ; GCN: ; %bb.0: ; %bb
160 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
161 ; GCN-NEXT: s_mov_b64 s[18:19], 1
162 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
163 ; GCN-NEXT: s_mov_b32 s18, 2
164 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
165 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
166 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
167 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
168 ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
169 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
170 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
171 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
172 ; GCN-NEXT: v_accvgpr_write_b32 a4, s4
173 ; GCN-NEXT: v_accvgpr_write_b32 a5, s5
174 ; GCN-NEXT: v_accvgpr_write_b32 a6, s6
175 ; GCN-NEXT: v_accvgpr_write_b32 a7, s7
176 ; GCN-NEXT: v_accvgpr_write_b32 a8, s8
177 ; GCN-NEXT: v_accvgpr_write_b32 a9, s9
178 ; GCN-NEXT: v_accvgpr_write_b32 a10, s10
179 ; GCN-NEXT: v_accvgpr_write_b32 a11, s11
180 ; GCN-NEXT: v_accvgpr_write_b32 a12, s12
181 ; GCN-NEXT: v_accvgpr_write_b32 a13, s13
182 ; GCN-NEXT: v_accvgpr_write_b32 a14, s14
183 ; GCN-NEXT: v_accvgpr_write_b32 a15, s15
185 ; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
186 ; GCN-NEXT: v_mov_b32_e32 v0, 0
190 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
191 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
192 ; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
193 ; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
196 %in.1 = load <16 x float>, ptr addrspace(1) %arg
197 %a = bitcast i64 1 to <4 x i16>
198 %b = bitcast i64 2 to <4 x i16>
199 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
200 store <16 x float> %mai.1, ptr addrspace(1) %arg
; Kernel exercising llvm.amdgcn.mfma.f32.16x16x16bf16.1k.
; The <4 x float> accumulator is loaded from %arg, the two multiplicands are
; the constants i64 1 and i64 2 bitcast to <4 x i16>, and the result is stored
; back to %arg.
; The check lines expect one scalar dwordx4 load, four v_accvgpr_write_b32
; copies into a[0:3], and a single dwordx4 store directly from the AGPRs.
; The trailing immediates (1, 2, 3) are expected to print as
; cbsz:1 abid:2 blgp:3 on the selected instruction.
204 define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 {
205 ; GCN-LABEL: test_mfma_f32_16x16x16bf16_1k:
206 ; GCN: ; %bb.0: ; %bb
207 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
208 ; GCN-NEXT: s_mov_b64 s[6:7], 1
209 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
210 ; GCN-NEXT: s_mov_b32 s6, 2
211 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
212 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
213 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
214 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
215 ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
216 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
217 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
218 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
220 ; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
221 ; GCN-NEXT: v_mov_b32_e32 v0, 0
224 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
227 %in.1 = load <4 x float>, ptr addrspace(1) %arg
228 %a = bitcast i64 1 to <4 x i16>
229 %b = bitcast i64 2 to <4 x i16>
230 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
231 store <4 x float> %mai.1, ptr addrspace(1) %arg
; Kernel exercising llvm.amdgcn.mfma.f64.4x4x4f64 with two chained calls.
; %a and %b are kernel arguments used as the multiplicands of both calls.
; The first call uses a 0.0 accumulator with all-zero trailing immediates; the
; check lines expect the accumulator to appear as the literal operand 0 and no
; modifiers to be printed.
; The second call takes the first result as its accumulator with immediates
; (1, 2, 3), expected to print as cbsz:1 abid:2 blgp:3.
; The final double is stored to %arg with a dwordx2 store from a[0:1].
235 define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
236 ; GCN-LABEL: test_mfma_f64_4x4x4f64:
237 ; GCN: ; %bb.0: ; %bb
238 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
239 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
240 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
241 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
242 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
244 ; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
246 ; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3
247 ; GCN-NEXT: v_mov_b32_e32 v0, 0
249 ; GCN-NEXT: global_store_dwordx2 v0, a[0:1], s[4:5]
252 %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
253 %mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
254 store double %mai.2, ptr addrspace(1) %arg
; Kernel exercising llvm.amdgcn.mfma.f64.16x16x4f64 with an accumulator that
; comes from memory.
; %a and %b are kernel arguments used as the multiplicands; the <4 x double>
; accumulator is loaded from %arg and the result is stored back to %arg.
; The check lines expect a scalar dwordx8 load of the accumulator, eight
; v_accvgpr_write_b32 copies into a[0:7], and two dwordx4 stores directly from
; the AGPRs.
; The trailing immediates (1, 2, 3) are expected to print as
; cbsz:1 abid:2 blgp:3 on the selected instruction.
258 define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
259 ; GCN-LABEL: test_mfma_f64_16x16x4f64:
260 ; GCN: ; %bb.0: ; %bb
261 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
262 ; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34
263 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
264 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
265 ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
266 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
267 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
268 ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
269 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
270 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
271 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
272 ; GCN-NEXT: v_accvgpr_write_b32 a4, s4
273 ; GCN-NEXT: v_accvgpr_write_b32 a5, s5
274 ; GCN-NEXT: v_accvgpr_write_b32 a6, s6
275 ; GCN-NEXT: v_accvgpr_write_b32 a7, s7
277 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
278 ; GCN-NEXT: v_mov_b32_e32 v0, 0
282 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
283 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
286 %in.1 = load <4 x double>, ptr addrspace(1) %arg
287 %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
288 store <4 x double> %mai.1, ptr addrspace(1) %arg
; Kernel exercising llvm.amdgcn.mfma.f64.16x16x4f64 with an all-zero splat
; accumulator, followed by a chained second call.
; %a and %b are kernel arguments used as the multiplicands of both calls.
; The first call passes <0.0, 0.0, 0.0, 0.0> with all-zero trailing immediates;
; the check lines expect the accumulator to appear as the literal operand 0,
; with no v_accvgpr_write_b32 setup and no modifiers printed.
; The second call takes the first result as its accumulator with immediates
; (1, 2, 3), expected to print as cbsz:1 abid:2 blgp:3.
; The final <4 x double> is stored to %arg in two dwordx4 stores from a[0:7].
292 define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
293 ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_imm:
294 ; GCN: ; %bb.0: ; %bb
295 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
296 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
297 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
298 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
299 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
301 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
302 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
303 ; GCN-NEXT: v_mov_b32_e32 v0, 0
307 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
308 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[4:5] offset:16
311 %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0)
312 %mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
313 store <4 x double> %mai.2, ptr addrspace(1) %arg
; Kernel exercising llvm.amdgcn.mfma.f64.16x16x4f64 with a constant,
; non-splat accumulator <0.0, 0.0, 0.0, 1.0>.
; %a and %b are kernel arguments used as the multiplicands; the result is
; stored to %arg.
; The check lines expect the constant to be built in s[0:7] (s_mov_b64 of 0 for
; the first three elements and of 1.0 for the last) and then copied into a[0:7]
; with v_accvgpr_write_b32, i.e. it is not used as a literal operand.
; All trailing immediates are 0, and the expected instruction carries no
; cbsz/abid/blgp modifiers.
317 define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
318 ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
319 ; GCN: ; %bb.0: ; %bb
320 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
321 ; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34
322 ; GCN-NEXT: s_mov_b64 s[0:1], 0
323 ; GCN-NEXT: s_mov_b64 s[6:7], 1.0
324 ; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
325 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
326 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
327 ; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
328 ; GCN-NEXT: v_accvgpr_write_b32 a0, s0
329 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
330 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1
331 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2
332 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3
333 ; GCN-NEXT: v_accvgpr_write_b32 a4, s4
334 ; GCN-NEXT: v_accvgpr_write_b32 a5, s5
335 ; GCN-NEXT: v_accvgpr_write_b32 a6, s6
336 ; GCN-NEXT: v_accvgpr_write_b32 a7, s7
338 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
339 ; GCN-NEXT: v_mov_b32_e32 v0, 0
343 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
344 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
347 %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
348 store <4 x double> %mai.1, ptr addrspace(1) %arg
; Kernel exercising llvm.amdgcn.mfma.f64.16x16x4f64 with a splat accumulator
; of the constant 123.0.
; %a and %b are kernel arguments used as the multiplicands; the result is
; stored to %arg.
; The check lines expect the 64-bit pattern of 123.0 to be built as
; s4 = 0 (low dword) and s5 = 0x405ec000 (high dword), replicated into
; s[6:11], and then copied into a[0:7] with v_accvgpr_write_b32, i.e. the
; splat is not used as a literal operand.
; All trailing immediates are 0, and the expected instruction carries no
; cbsz/abid/blgp modifiers.
352 define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 {
353 ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
354 ; GCN: ; %bb.0: ; %bb
355 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24
356 ; GCN-NEXT: s_mov_b32 s4, 0
357 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
358 ; GCN-NEXT: s_mov_b32 s5, 0x405ec000
359 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
360 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
361 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1]
362 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
363 ; GCN-NEXT: s_mov_b64 s[10:11], s[4:5]
364 ; GCN-NEXT: v_accvgpr_write_b32 a0, s4
365 ; GCN-NEXT: v_accvgpr_write_b32 a1, s5
366 ; GCN-NEXT: v_accvgpr_write_b32 a2, s6
367 ; GCN-NEXT: v_accvgpr_write_b32 a3, s7
368 ; GCN-NEXT: v_accvgpr_write_b32 a4, s8
369 ; GCN-NEXT: v_accvgpr_write_b32 a5, s9
370 ; GCN-NEXT: v_accvgpr_write_b32 a6, s10
371 ; GCN-NEXT: v_accvgpr_write_b32 a7, s11
372 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
374 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
375 ; GCN-NEXT: v_mov_b32_e32 v0, 0
379 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
380 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[12:13] offset:16
383 %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
384 store <4 x double> %mai.1, ptr addrspace(1) %arg
; Attribute group #0 is referenced by every kernel definition visible in this
; file; it sets "amdgpu-flat-work-group-size" to "1,256".
388 attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }