; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
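; Each test below loads a packed vector of lane indices and passes successive
; elements as the index operand of the llvm.amdgcn.swmmac.* intrinsics, checking
; that the non-zero element positions are selected through the index_key:1/2/3
; modifier on the generated v_swmmac instructions.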

define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
; GFX12-NEXT: v_mov_b32_e32 v23, v9
; GFX12-NEXT: v_mov_b32_e32 v22, v8
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v9
; GFX12-NEXT: v_mov_b32_e32 v26, v8
; GFX12-NEXT: v_mov_b32_e32 v25, v7
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v31, v9
; GFX12-NEXT: v_mov_b32_e32 v30, v8
; GFX12-NEXT: v_mov_b32_e32 v29, v7
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
; GFX12-NEXT: v_mov_b32_e32 v23, v9
; GFX12-NEXT: v_mov_b32_e32 v22, v8
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v9
; GFX12-NEXT: v_mov_b32_e32 v26, v8
; GFX12-NEXT: v_mov_b32_e32 v25, v7
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v31, v9
; GFX12-NEXT: v_mov_b32_e32 v30, v8
; GFX12-NEXT: v_mov_b32_e32 v29, v7
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
; GFX12-NEXT: v_mov_b32_e32 v9, v7
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v7
; GFX12-NEXT: v_mov_b32_e32 v18, v6
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0)
  store <4 x half> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1)
  store <4 x half> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2)
  store <4 x half> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3)
  store <4 x half> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
; GFX12-NEXT: v_mov_b32_e32 v9, v7
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v7
; GFX12-NEXT: v_mov_b32_e32 v18, v6
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0)
  store <4 x i16> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1)
  store <4 x i16> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2)
  store <4 x i16> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3)
  store <4 x i16> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0)
  store <4 x i32> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0)
  store <4 x i32> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0)
  store <4 x i32> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0)
  store <4 x i32> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v6, v[6:7], off
; GFX12-NEXT: v_mov_b32_e32 v15, v5
; GFX12-NEXT: v_mov_b32_e32 v14, v4
; GFX12-NEXT: v_mov_b32_e32 v13, v3
; GFX12-NEXT: v_mov_b32_e32 v12, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
  %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0)
  store <4 x i32> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
  %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0)
  store <4 x i32> %res1, ptr addrspace(1) %out1
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v16, v6
; GFX12-NEXT: v_mov_b32_e32 v15, v5
; GFX12-NEXT: v_mov_b32_e32 v14, v4
; GFX12-NEXT: v_mov_b32_e32 v13, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
  %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0)
  store <4 x i32> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
  %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0)
  store <4 x i32> %res1, ptr addrspace(1) %out1
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half>, <8 x half>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)