; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
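; Check that when the index operand of a wave64 SWMMAC intrinsic comes from
; element 1, 2, or 3 of the loaded index vector, selection folds it into the
; index_key:1/2/3 modifier of the v_swmmac_* instruction (element 0 uses no
; index_key modifier).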

define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
; GFX12-NEXT: v_mov_b32_e32 v23, v9
; GFX12-NEXT: v_mov_b32_e32 v22, v8
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v9
; GFX12-NEXT: v_mov_b32_e32 v26, v8
; GFX12-NEXT: v_mov_b32_e32 v25, v7
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v31, v9
; GFX12-NEXT: v_mov_b32_e32 v30, v8
; GFX12-NEXT: v_mov_b32_e32 v29, v7
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
; GFX12-NEXT: v_mov_b32_e32 v23, v9
; GFX12-NEXT: v_mov_b32_e32 v22, v8
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v9
; GFX12-NEXT: v_mov_b32_e32 v26, v8
; GFX12-NEXT: v_mov_b32_e32 v25, v7
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v31, v9
; GFX12-NEXT: v_mov_b32_e32 v30, v8
; GFX12-NEXT: v_mov_b32_e32 v29, v7
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
; GFX12-NEXT: v_mov_b32_e32 v9, v7
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v7
; GFX12-NEXT: v_mov_b32_e32 v18, v6
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0)
  store <4 x half> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1)
  store <4 x half> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2)
  store <4 x half> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3)
  store <4 x half> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
; GFX12-NEXT: v_mov_b32_e32 v9, v7
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v7
; GFX12-NEXT: v_mov_b32_e32 v18, v6
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0)
  store <4 x i16> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1)
  store <4 x i16> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2)
  store <4 x i16> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3)
  store <4 x i16> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0)
  store <4 x i32> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0)
  store <4 x i32> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0)
  store <4 x i32> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0)
  store <4 x i32> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v6, v[6:7], off
; GFX12-NEXT: v_mov_b32_e32 v15, v5
; GFX12-NEXT: v_mov_b32_e32 v14, v4
; GFX12-NEXT: v_mov_b32_e32 v13, v3
; GFX12-NEXT: v_mov_b32_e32 v12, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
  %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0)
  store <4 x i32> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
  %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0)
  store <4 x i32> %res1, ptr addrspace(1) %out1
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v16, v6
; GFX12-NEXT: v_mov_b32_e32 v15, v5
; GFX12-NEXT: v_mov_b32_e32 v14, v4
; GFX12-NEXT: v_mov_b32_e32 v13, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
  %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0)
  store <4 x i32> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
  %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0)
  store <4 x i32> %res1, ptr addrspace(1) %out1
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half>, <8 x half>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)