; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN %s
; Intrinsic declarations for the gfx90a MFMA instructions exercised below.
; (Fix: stray pasted line numbers removed — they made the file invalid IR.)
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(<32 x float> addrspace(1)* %arg) {
; GCN-LABEL: test_mfma_f32_32x32x4bf16_1k:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24
; GCN-NEXT: s_mov_b64 s[36:37], 1
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s36, 2
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
; GCN-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_accvgpr_write_b32 a0, v4
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: v_accvgpr_write_b32 a1, v4
; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: v_accvgpr_write_b32 a2, v4
; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: v_accvgpr_write_b32 a3, v4
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_accvgpr_write_b32 a4, v4
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: v_accvgpr_write_b32 a5, v4
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_accvgpr_write_b32 a6, v4
; GCN-NEXT: v_mov_b32_e32 v4, s7
; GCN-NEXT: v_accvgpr_write_b32 a7, v4
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_accvgpr_write_b32 a8, v4
; GCN-NEXT: v_mov_b32_e32 v4, s9
; GCN-NEXT: v_accvgpr_write_b32 a9, v4
; GCN-NEXT: v_mov_b32_e32 v4, s10
; GCN-NEXT: v_accvgpr_write_b32 a10, v4
; GCN-NEXT: v_mov_b32_e32 v4, s11
; GCN-NEXT: v_accvgpr_write_b32 a11, v4
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: v_accvgpr_write_b32 a12, v4
; GCN-NEXT: v_mov_b32_e32 v4, s13
; GCN-NEXT: v_accvgpr_write_b32 a13, v4
; GCN-NEXT: v_mov_b32_e32 v4, s14
; GCN-NEXT: v_accvgpr_write_b32 a14, v4
; GCN-NEXT: v_mov_b32_e32 v4, s15
; GCN-NEXT: v_accvgpr_write_b32 a15, v4
; GCN-NEXT: v_mov_b32_e32 v4, s16
; GCN-NEXT: v_accvgpr_write_b32 a16, v4
; GCN-NEXT: v_mov_b32_e32 v4, s17
; GCN-NEXT: v_accvgpr_write_b32 a17, v4
; GCN-NEXT: v_mov_b32_e32 v4, s18
; GCN-NEXT: v_accvgpr_write_b32 a18, v4
; GCN-NEXT: v_mov_b32_e32 v4, s19
; GCN-NEXT: v_accvgpr_write_b32 a19, v4
; GCN-NEXT: v_mov_b32_e32 v4, s20
; GCN-NEXT: v_accvgpr_write_b32 a20, v4
; GCN-NEXT: v_mov_b32_e32 v4, s21
; GCN-NEXT: v_accvgpr_write_b32 a21, v4
; GCN-NEXT: v_mov_b32_e32 v4, s22
; GCN-NEXT: v_accvgpr_write_b32 a22, v4
; GCN-NEXT: v_mov_b32_e32 v4, s23
; GCN-NEXT: v_accvgpr_write_b32 a23, v4
; GCN-NEXT: v_mov_b32_e32 v4, s24
; GCN-NEXT: v_accvgpr_write_b32 a24, v4
; GCN-NEXT: v_mov_b32_e32 v4, s25
; GCN-NEXT: v_accvgpr_write_b32 a25, v4
; GCN-NEXT: v_mov_b32_e32 v4, s26
; GCN-NEXT: v_accvgpr_write_b32 a26, v4
; GCN-NEXT: v_mov_b32_e32 v4, s27
; GCN-NEXT: v_accvgpr_write_b32 a27, v4
; GCN-NEXT: v_mov_b32_e32 v4, s28
; GCN-NEXT: v_accvgpr_write_b32 a28, v4
; GCN-NEXT: v_mov_b32_e32 v4, s29
; GCN-NEXT: v_accvgpr_write_b32 a29, v4
; GCN-NEXT: v_mov_b32_e32 v4, s30
; GCN-NEXT: v_accvgpr_write_b32 a30, v4
; GCN-NEXT: v_mov_b32_e32 v4, s31
; GCN-NEXT: v_accvgpr_write_b32 a31, v4
; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
; GCN-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
; GCN-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
; GCN-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
; GCN-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
; GCN-NEXT: s_endpgm
; NOTE(review): the embedded numbering shows some autogenerated s_nop CHECK
; lines were lost above; regenerate with utils/update_llc_test_checks.py.
bb:
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}
define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(<16 x float> addrspace(1)* %arg) {
; GCN-LABEL: test_mfma_f32_16x16x4bf16_1k:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
; GCN-NEXT: s_mov_b64 s[18:19], 1
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s18, 2
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_accvgpr_write_b32 a0, v4
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: v_accvgpr_write_b32 a1, v4
; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: v_accvgpr_write_b32 a2, v4
; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: v_accvgpr_write_b32 a3, v4
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_accvgpr_write_b32 a4, v4
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: v_accvgpr_write_b32 a5, v4
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_accvgpr_write_b32 a6, v4
; GCN-NEXT: v_mov_b32_e32 v4, s7
; GCN-NEXT: v_accvgpr_write_b32 a7, v4
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_accvgpr_write_b32 a8, v4
; GCN-NEXT: v_mov_b32_e32 v4, s9
; GCN-NEXT: v_accvgpr_write_b32 a9, v4
; GCN-NEXT: v_mov_b32_e32 v4, s10
; GCN-NEXT: v_accvgpr_write_b32 a10, v4
; GCN-NEXT: v_mov_b32_e32 v4, s11
; GCN-NEXT: v_accvgpr_write_b32 a11, v4
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: v_accvgpr_write_b32 a12, v4
; GCN-NEXT: v_mov_b32_e32 v4, s13
; GCN-NEXT: v_accvgpr_write_b32 a13, v4
; GCN-NEXT: v_mov_b32_e32 v4, s14
; GCN-NEXT: v_accvgpr_write_b32 a14, v4
; GCN-NEXT: v_mov_b32_e32 v4, s15
; GCN-NEXT: v_accvgpr_write_b32 a15, v4
; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GCN-NEXT: s_endpgm
; NOTE(review): the embedded numbering shows some autogenerated s_nop CHECK
; lines were lost above; regenerate with utils/update_llc_test_checks.py.
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
  ret void
}
define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(<4 x float> addrspace(1)* %arg) {
; GCN-LABEL: test_mfma_f32_4x4x4bf16_1k:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: s_mov_b64 s[6:7], 1
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s6, 2
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_accvgpr_write_b32 a0, v4
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: v_accvgpr_write_b32 a1, v4
; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: v_accvgpr_write_b32 a2, v4
; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: v_accvgpr_write_b32 a3, v4
; GCN-NEXT: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
; GCN-NEXT: s_endpgm
; NOTE(review): the embedded numbering shows some autogenerated s_nop CHECK
; lines were lost above; regenerate with utils/update_llc_test_checks.py.
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}
define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(<16 x float> addrspace(1)* %arg) {
; GCN-LABEL: test_mfma_f32_32x32x8bf16_1k:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
; GCN-NEXT: s_mov_b64 s[18:19], 1
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s18, 2
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_accvgpr_write_b32 a0, v4
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: v_accvgpr_write_b32 a1, v4
; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: v_accvgpr_write_b32 a2, v4
; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: v_accvgpr_write_b32 a3, v4
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_accvgpr_write_b32 a4, v4
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: v_accvgpr_write_b32 a5, v4
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_accvgpr_write_b32 a6, v4
; GCN-NEXT: v_mov_b32_e32 v4, s7
; GCN-NEXT: v_accvgpr_write_b32 a7, v4
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_accvgpr_write_b32 a8, v4
; GCN-NEXT: v_mov_b32_e32 v4, s9
; GCN-NEXT: v_accvgpr_write_b32 a9, v4
; GCN-NEXT: v_mov_b32_e32 v4, s10
; GCN-NEXT: v_accvgpr_write_b32 a10, v4
; GCN-NEXT: v_mov_b32_e32 v4, s11
; GCN-NEXT: v_accvgpr_write_b32 a11, v4
; GCN-NEXT: v_mov_b32_e32 v4, s12
; GCN-NEXT: v_accvgpr_write_b32 a12, v4
; GCN-NEXT: v_mov_b32_e32 v4, s13
; GCN-NEXT: v_accvgpr_write_b32 a13, v4
; GCN-NEXT: v_mov_b32_e32 v4, s14
; GCN-NEXT: v_accvgpr_write_b32 a14, v4
; GCN-NEXT: v_mov_b32_e32 v4, s15
; GCN-NEXT: v_accvgpr_write_b32 a15, v4
; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GCN-NEXT: s_endpgm
; NOTE(review): the embedded numbering shows some autogenerated s_nop CHECK
; lines were lost above; regenerate with utils/update_llc_test_checks.py.
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
  ret void
}
define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(<4 x float> addrspace(1)* %arg) {
; GCN-LABEL: test_mfma_f32_16x16x16bf16_1k:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: s_mov_b64 s[6:7], 1
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s6, 2
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_accvgpr_write_b32 a0, v4
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: v_accvgpr_write_b32 a1, v4
; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: v_accvgpr_write_b32 a2, v4
; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: v_accvgpr_write_b32 a3, v4
; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
; GCN-NEXT: s_endpgm
; NOTE(review): the embedded numbering shows some autogenerated s_nop CHECK
; lines were lost above; regenerate with utils/update_llc_test_checks.py.
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}
define amdgpu_kernel void @test_mfma_f64_4x4x4f64(double addrspace(1)* %arg, double %a, double %b) {
; GCN-LABEL: test_mfma_f64_4x4x4f64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dwordx2 v0, a[0:1], s[4:5]
; GCN-NEXT: s_endpgm
; NOTE(review): the embedded numbering shows some autogenerated s_nop CHECK
; lines were lost above; regenerate with utils/update_llc_test_checks.py.
bb:
  %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
  %mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
  store double %mai.2, double addrspace(1)* %arg
  ret void
}
define amdgpu_kernel void @test_mfma_f64_16x16x4f64(<4 x double> addrspace(1)* %arg, double %a, double %b) {
; GCN-LABEL: test_mfma_f64_16x16x4f64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_accvgpr_write_b32 a0, v4
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: v_accvgpr_write_b32 a1, v4
; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: v_accvgpr_write_b32 a2, v4
; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: v_accvgpr_write_b32 a3, v4
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_accvgpr_write_b32 a4, v4
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: v_accvgpr_write_b32 a5, v4
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: v_accvgpr_write_b32 a6, v4
; GCN-NEXT: v_mov_b32_e32 v4, s7
; GCN-NEXT: v_accvgpr_write_b32 a7, v4
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
; GCN-NEXT: s_endpgm
; NOTE(review): the embedded numbering shows some autogenerated s_nop CHECK
; lines were lost above; regenerate with utils/update_llc_test_checks.py.
bb:
  %in.1 = load <4 x double>, <4 x double> addrspace(1)* %arg
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
  ret void
}
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) {
; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_imm:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[4:5] offset:16
; GCN-NEXT: s_endpgm
; NOTE(review): the embedded numbering shows some autogenerated s_nop CHECK
; lines were lost above; regenerate with utils/update_llc_test_checks.py.
bb:
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0)
  %mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
  store <4 x double> %mai.2, <4 x double> addrspace(1)* %arg
  ret void
}
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) {
; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: s_mov_b64 s[10:11], 1.0
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_accvgpr_write_b32 a0, v4
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: v_accvgpr_write_b32 a1, v4
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-NEXT: v_accvgpr_write_b32 a2, v4
; GCN-NEXT: v_mov_b32_e32 v4, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1]
; GCN-NEXT: v_accvgpr_write_b32 a3, v4
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_accvgpr_write_b32 a4, v4
; GCN-NEXT: v_mov_b32_e32 v4, s9
; GCN-NEXT: v_accvgpr_write_b32 a5, v4
; GCN-NEXT: v_mov_b32_e32 v4, s10
; GCN-NEXT: v_accvgpr_write_b32 a6, v4
; GCN-NEXT: v_mov_b32_e32 v4, s11
; GCN-NEXT: v_accvgpr_write_b32 a7, v4
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[12:13] offset:16
; GCN-NEXT: s_endpgm
; NOTE(review): the embedded numbering shows some autogenerated s_nop CHECK
; lines were lost above; regenerate with utils/update_llc_test_checks.py.
bb:
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
  ret void
}
440 define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(<4 x double> addrspace(1)* %arg, double %a, double %b) {
441 ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
442 ; GCN: ; %bb.0: ; %bb
443 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24
444 ; GCN-NEXT: s_mov_b32 s4, 0
445 ; GCN-NEXT: s_mov_b32 s5, 0x405ec000
446 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
447 ; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
448 ; GCN-NEXT: s_mov_b64 s[10:11], s[4:5]
449 ; GCN-NEXT: v_mov_b32_e32 v4, s4
450 ; GCN-NEXT: v_accvgpr_write_b32 a0, v4
451 ; GCN-NEXT: v_mov_b32_e32 v4, s5
452 ; GCN-NEXT: v_accvgpr_write_b32 a1, v4
453 ; GCN-NEXT: v_mov_b32_e32 v4, s6
454 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
455 ; GCN-NEXT: v_accvgpr_write_b32 a2, v4
456 ; GCN-NEXT: v_mov_b32_e32 v4, s7
457 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
458 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1]
459 ; GCN-NEXT: v_accvgpr_write_b32 a3, v4
460 ; GCN-NEXT: v_mov_b32_e32 v4, s8
461 ; GCN-NEXT: v_accvgpr_write_b32 a4, v4
462 ; GCN-NEXT: v_mov_b32_e32 v4, s9
463 ; GCN-NEXT: v_accvgpr_write_b32 a5, v4
464 ; GCN-NEXT: v_mov_b32_e32 v4, s10
465 ; GCN-NEXT: v_accvgpr_write_b32 a6, v4
466 ; GCN-NEXT: v_mov_b32_e32 v4, s11
467 ; GCN-NEXT: v_accvgpr_write_b32 a7, v4
468 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
470 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
471 ; GCN-NEXT: v_mov_b32_e32 v0, 0
475 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
476 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[12:13] offset:16
479 %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
480 store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg