llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
   3
   ; Calls llvm.amdgcn.wmma.f32.16x16x16.f16 with <8 x half> %A, %B and <8 x float> %C.
   ; The assertions expect a single v_wmma_f32_16x16x16_f16, after which the
   ; <8 x float> result is written to %out by two global_store_b128 instructions
   ; (offsets 0 and 16) under s_clause 0x1.
   4 define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
   5 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
   6 ; GFX12:       ; %bb.0: ; %bb
   7 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
   8 ; GFX12-NEXT:    s_clause 0x1
   9 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
  10 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
  11 ; GFX12-NEXT:    s_nop 0
  12 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  13 ; GFX12-NEXT:    s_endpgm
  14 bb:
  15   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
  16   store <8 x float> %res, ptr addrspace(1) %out
  17   ret void
  18 }
  19
  ; Calls llvm.amdgcn.wmma.f32.16x16x16.bf16 with <8 x i16> %A, %B and <8 x float> %C.
  ; The assertions expect a single v_wmma_f32_16x16x16_bf16, after which the
  ; <8 x float> result is written to %out by two global_store_b128 instructions
  ; (offsets 0 and 16) under s_clause 0x1.
  20 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
  21 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
  22 ; GFX12:       ; %bb.0: ; %bb
  23 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
  24 ; GFX12-NEXT:    s_clause 0x1
  25 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
  26 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
  27 ; GFX12-NEXT:    s_nop 0
  28 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  29 ; GFX12-NEXT:    s_endpgm
  30 bb:
  31   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
  32   store <8 x float> %res, ptr addrspace(1) %out
  33   ret void
  34 }
  35
  ; Calls llvm.amdgcn.wmma.f16.16x16x16.f16 with <8 x half> %A, %B, %C and a
  ; trailing i1 immediate of 0.
  ; The assertions expect a single v_wmma_f16_16x16x16_f16 with no extra modifier
  ; printed, after which the <8 x half> result is written to %out by one
  ; global_store_b128.
  36 define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
  37 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
  38 ; GFX12:       ; %bb.0: ; %bb
  39 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
  40 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
  41 ; GFX12-NEXT:    s_nop 0
  42 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  43 ; GFX12-NEXT:    s_endpgm
  44 bb:
  45   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
  46   store <8 x half> %res, ptr addrspace(1) %out
  47   ret void
  48 }
  49
  ; Calls llvm.amdgcn.wmma.bf16.16x16x16.bf16 with <8 x i16> %A, %B, %C and a
  ; trailing i1 immediate of 0.
  ; The assertions expect a single v_wmma_bf16_16x16x16_bf16 with no extra
  ; modifier printed, after which the <8 x i16> result is written to %out by one
  ; global_store_b128.
  50 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
  51 ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
  52 ; GFX12:       ; %bb.0: ; %bb
  53 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
  54 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
  55 ; GFX12-NEXT:    s_nop 0
  56 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  57 ; GFX12-NEXT:    s_endpgm
  58 bb:
  59   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
  60   store <8 x i16> %res, ptr addrspace(1) %out
  61   ret void
  62 }
  63
  ; Calls llvm.amdgcn.wmma.i32.16x16x16.iu8 with <2 x i32> %A, %B and <8 x i32> %C;
  ; all three i1 immediate operands of the call are 0.
  ; The assertions expect a single v_wmma_i32_16x16x16_iu8 with no extra modifier
  ; printed, after which the <8 x i32> result is written to %out by two
  ; global_store_b128 instructions (offsets 0 and 16) under s_clause 0x1.
  64 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
  65 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
  66 ; GFX12:       ; %bb.0: ; %bb
  67 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
  68 ; GFX12-NEXT:    s_clause 0x1
  69 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
  70 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
  71 ; GFX12-NEXT:    s_nop 0
  72 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  73 ; GFX12-NEXT:    s_endpgm
  74 bb:
  75   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  76   store <8 x i32> %res, ptr addrspace(1) %out
  77   ret void
  78 }
  79
  ; Calls llvm.amdgcn.wmma.i32.16x16x16.iu4 with scalar i32 %A, %B and <8 x i32> %C;
  ; all three i1 immediate operands of the call are 0.
  ; The assertions expect a single v_wmma_i32_16x16x16_iu4 taking single
  ; registers (v0, v1) for %A and %B, after which the <8 x i32> result is written
  ; to %out by two global_store_b128 instructions under s_clause 0x1.
  80 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
  81 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
  82 ; GFX12:       ; %bb.0: ; %bb
  83 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
  84 ; GFX12-NEXT:    s_clause 0x1
  85 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
  86 ; GFX12-NEXT:    global_store_b128 v[10:11], v[6:9], off offset:16
  87 ; GFX12-NEXT:    s_nop 0
  88 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  89 ; GFX12-NEXT:    s_endpgm
  90 bb:
  91   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
  92   store <8 x i32> %res, ptr addrspace(1) %out
  93   ret void
  94 }
  95
  ; Calls llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8 with <2 x i32> %A, %B and
  ; <8 x float> %C.
  ; The assertions expect a single v_wmma_f32_16x16x16_fp8_fp8, after which the
  ; <8 x float> result is written to %out by two global_store_b128 instructions
  ; (offsets 0 and 16) under s_clause 0x1.
  96 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
  97 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
  98 ; GFX12:       ; %bb.0: ; %bb
  99 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
 100 ; GFX12-NEXT:    s_clause 0x1
 101 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 102 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 103 ; GFX12-NEXT:    s_nop 0
 104 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 105 ; GFX12-NEXT:    s_endpgm
 106 bb:
 107   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
 108   store <8 x float> %res, ptr addrspace(1) %out
 109   ret void
 110 }
 111
 ; Calls llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8 with <2 x i32> %A, %B and
 ; <8 x float> %C.
 ; The assertions expect a single v_wmma_f32_16x16x16_bf8_fp8, after which the
 ; <8 x float> result is written to %out by two global_store_b128 instructions
 ; (offsets 0 and 16) under s_clause 0x1.
 112 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 113 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
 114 ; GFX12:       ; %bb.0: ; %bb
 115 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
 116 ; GFX12-NEXT:    s_clause 0x1
 117 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 118 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 119 ; GFX12-NEXT:    s_nop 0
 120 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 121 ; GFX12-NEXT:    s_endpgm
 122 bb:
 123   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
 124   store <8 x float> %res, ptr addrspace(1) %out
 125   ret void
 126 }
 127
 ; Calls llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8 with <2 x i32> %A, %B and
 ; <8 x float> %C.
 ; The assertions expect a single v_wmma_f32_16x16x16_fp8_bf8, after which the
 ; <8 x float> result is written to %out by two global_store_b128 instructions
 ; (offsets 0 and 16) under s_clause 0x1.
 128 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 129 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
 130 ; GFX12:       ; %bb.0: ; %bb
 131 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
 132 ; GFX12-NEXT:    s_clause 0x1
 133 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 134 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 135 ; GFX12-NEXT:    s_nop 0
 136 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 137 ; GFX12-NEXT:    s_endpgm
 138 bb:
 139   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
 140   store <8 x float> %res, ptr addrspace(1) %out
 141   ret void
 142 }
 143
 ; Calls llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8 with <2 x i32> %A, %B and
 ; <8 x float> %C.
 ; The assertions expect a single v_wmma_f32_16x16x16_bf8_bf8, after which the
 ; <8 x float> result is written to %out by two global_store_b128 instructions
 ; (offsets 0 and 16) under s_clause 0x1.
 144 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 145 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
 146 ; GFX12:       ; %bb.0: ; %bb
 147 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
 148 ; GFX12-NEXT:    s_clause 0x1
 149 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 150 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 151 ; GFX12-NEXT:    s_nop 0
 152 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 153 ; GFX12-NEXT:    s_endpgm
 154 bb:
 155   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
 156   store <8 x float> %res, ptr addrspace(1) %out
 157   ret void
 158 }
 159
 ; Calls llvm.amdgcn.wmma.i32.16x16x32.iu4 with <2 x i32> %A, %B and <8 x i32> %C;
 ; all three i1 immediate operands of the call are 0.
 ; The assertions expect a single v_wmma_i32_16x16x32_iu4 with no extra modifier
 ; printed, after which the <8 x i32> result is written to %out by two
 ; global_store_b128 instructions (offsets 0 and 16) under s_clause 0x1.
 160 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
 161 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
 162 ; GFX12:       ; %bb.0: ; %bb
 163 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
 164 ; GFX12-NEXT:    s_clause 0x1
 165 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 166 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 167 ; GFX12-NEXT:    s_nop 0
 168 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 169 ; GFX12-NEXT:    s_endpgm
 170 bb:
 171   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
 172   store <8 x i32> %res, ptr addrspace(1) %out
 173   ret void
 174 }
 175
 176
 ; Calls llvm.amdgcn.swmmac.f32.16x16x32.f16 with <8 x half> %A, <16 x half> %B,
 ; <8 x float> %C and i16 %Index (note that %B has twice as many elements as %A).
 ; The assertions expect a single v_swmmac_f32_16x16x32_f16 whose last operand is
 ; one VGPR (v20), after which the <8 x float> result is written to %out by two
 ; global_store_b128 instructions (offsets 0 and 16) under s_clause 0x1.
 177 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 178 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
 179 ; GFX12:       ; %bb.0: ; %bb
 180 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
 181 ; GFX12-NEXT:    s_clause 0x1
 182 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
 183 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 184 ; GFX12-NEXT:    s_nop 0
 185 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 186 ; GFX12-NEXT:    s_endpgm
 187 bb:
 188   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
 189   store <8 x float> %res, ptr addrspace(1) %out
 190   ret void
 191 }
 192
 ; Calls llvm.amdgcn.swmmac.f32.16x16x32.bf16 with <8 x i16> %A, <16 x i16> %B,
 ; <8 x float> %C and i16 %Index.
 ; The assertions expect a single v_swmmac_f32_16x16x32_bf16 whose last operand
 ; is one VGPR (v20), after which the <8 x float> result is written to %out by two
 ; global_store_b128 instructions (offsets 0 and 16) under s_clause 0x1.
 193 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 194 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
 195 ; GFX12:       ; %bb.0: ; %bb
 196 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
 197 ; GFX12-NEXT:    s_clause 0x1
 198 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
 199 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 200 ; GFX12-NEXT:    s_nop 0
 201 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 202 ; GFX12-NEXT:    s_endpgm
 203 bb:
 204   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
 205   store <8 x float> %res, ptr addrspace(1) %out
 206   ret void
 207 }
 208
 ; Calls llvm.amdgcn.swmmac.f16.16x16x32.f16 with <8 x half> %A, <16 x half> %B,
 ; <8 x half> %C and i16 %Index.
 ; The assertions expect a single v_swmmac_f16_16x16x32_f16 whose last operand is
 ; one VGPR (v16), after which the <8 x half> result is written to %out by one
 ; global_store_b128.
 209 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 210 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
 211 ; GFX12:       ; %bb.0: ; %bb
 212 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
 213 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
 214 ; GFX12-NEXT:    s_nop 0
 215 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 216 ; GFX12-NEXT:    s_endpgm
 217 bb:
 218   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
 219   store <8 x half> %res, ptr addrspace(1) %out
 220   ret void
 221 }
 222
 ; Calls llvm.amdgcn.swmmac.bf16.16x16x32.bf16 with <8 x i16> %A, <16 x i16> %B,
 ; <8 x i16> %C and i16 %Index.
 ; The assertions expect a single v_swmmac_bf16_16x16x32_bf16 whose last operand
 ; is one VGPR (v16), after which the <8 x i16> result is written to %out by one
 ; global_store_b128.
 223 define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
 224 ; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
 225 ; GFX12:       ; %bb.0: ; %bb
 226 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
 227 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
 228 ; GFX12-NEXT:    s_nop 0
 229 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 230 ; GFX12-NEXT:    s_endpgm
 231 bb:
 232   %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
 233   store <8 x i16> %res, ptr addrspace(1) %out
 234   ret void
 235 }
 236
 ; Calls llvm.amdgcn.swmmac.i32.16x16x32.iu8 with <2 x i32> %A, <4 x i32> %B,
 ; <8 x i32> %C and i16 %Index; all three i1 immediate operands of the call are 0.
 ; The assertions expect a single v_swmmac_i32_16x16x32_iu8 whose last operand is
 ; one VGPR (v14) and which prints no extra modifier, after which the <8 x i32>
 ; result is written to %out by two global_store_b128 instructions under
 ; s_clause 0x1.
 237 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
 238 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
 239 ; GFX12:       ; %bb.0: ; %bb
 240 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
 241 ; GFX12-NEXT:    s_clause 0x1
 242 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 243 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 244 ; GFX12-NEXT:    s_nop 0
 245 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 246 ; GFX12-NEXT:    s_endpgm
 247 bb:
 248   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
 249   store <8 x i32> %res, ptr addrspace(1) %out
 250   ret void
 251 }
 252
 ; Calls llvm.amdgcn.swmmac.i32.16x16x32.iu4 with scalar i32 %A, <2 x i32> %B,
 ; <8 x i32> %C and i16 %Index; all three i1 immediate operands of the call are 0.
 ; The assertions expect a single v_swmmac_i32_16x16x32_iu4 taking one register
 ; (v0) for %A and a register pair (v[1:2]) for %B, after which the <8 x i32>
 ; result is written to %out by two global_store_b128 instructions under
 ; s_clause 0x1.
 253 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
 254 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
 255 ; GFX12:       ; %bb.0: ; %bb
 256 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
 257 ; GFX12-NEXT:    s_clause 0x1
 258 ; GFX12-NEXT:    global_store_b128 v[12:13], v[3:6], off
 259 ; GFX12-NEXT:    global_store_b128 v[12:13], v[7:10], off offset:16
 260 ; GFX12-NEXT:    s_nop 0
 261 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 262 ; GFX12-NEXT:    s_endpgm
 263 bb:
 264   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
 265   store <8 x i32> %res, ptr addrspace(1) %out
 266   ret void
 267 }
 268
 ; Calls llvm.amdgcn.swmmac.i32.16x16x64.iu4 with <2 x i32> %A, <4 x i32> %B,
 ; <8 x i32> %C and %Index; all three i1 immediate operands of the call are 0.
 ; %Index is i32 here, whereas the other swmmac tests in this file pass an i16.
 ; The assertions expect a single v_swmmac_i32_16x16x64_iu4 whose last operand is
 ; one VGPR (v14), after which the <8 x i32> result is written to %out by two
 ; global_store_b128 instructions under s_clause 0x1.
 269 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
 270 ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
 271 ; GFX12:       ; %bb.0: ; %bb
 272 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
 273 ; GFX12-NEXT:    s_clause 0x1
 274 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 275 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 276 ; GFX12-NEXT:    s_nop 0
 277 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 278 ; GFX12-NEXT:    s_endpgm
 279 bb:
 280   %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
 281   store <8 x i32> %res, ptr addrspace(1) %out
 282   ret void
 283 }
 284
 ; Calls llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8 with <2 x i32> %A, <4 x i32> %B,
 ; <8 x float> %C and i16 %Index.
 ; The assertions expect a single v_swmmac_f32_16x16x32_fp8_fp8 whose last
 ; operand is one VGPR (v14), after which the <8 x float> result is written to
 ; %out by two global_store_b128 instructions under s_clause 0x1.
 285 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 286 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
 287 ; GFX12:       ; %bb.0: ; %bb
 288 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
 289 ; GFX12-NEXT:    s_clause 0x1
 290 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 291 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 292 ; GFX12-NEXT:    s_nop 0
 293 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 294 ; GFX12-NEXT:    s_endpgm
 295 bb:
 296   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 297   store <8 x float> %res, ptr addrspace(1) %out
 298   ret void
 299 }
 300
 ; Calls llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8 with <2 x i32> %A, <4 x i32> %B,
 ; <8 x float> %C and i16 %Index.
 ; The assertions expect a single v_swmmac_f32_16x16x32_fp8_bf8 whose last
 ; operand is one VGPR (v14), after which the <8 x float> result is written to
 ; %out by two global_store_b128 instructions under s_clause 0x1.
 301 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 302 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
 303 ; GFX12:       ; %bb.0: ; %bb
 304 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
 305 ; GFX12-NEXT:    s_clause 0x1
 306 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 307 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 308 ; GFX12-NEXT:    s_nop 0
 309 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 310 ; GFX12-NEXT:    s_endpgm
 311 bb:
 312   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 313   store <8 x float> %res, ptr addrspace(1) %out
 314   ret void
 315 }
 316
 ; Calls llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8 with <2 x i32> %A, <4 x i32> %B,
 ; <8 x float> %C and i16 %Index.
 ; The assertions expect a single v_swmmac_f32_16x16x32_bf8_fp8 whose last
 ; operand is one VGPR (v14), after which the <8 x float> result is written to
 ; %out by two global_store_b128 instructions under s_clause 0x1.
 317 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 318 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
 319 ; GFX12:       ; %bb.0: ; %bb
 320 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
 321 ; GFX12-NEXT:    s_clause 0x1
 322 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 323 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 324 ; GFX12-NEXT:    s_nop 0
 325 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 326 ; GFX12-NEXT:    s_endpgm
 327 bb:
 328   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 329   store <8 x float> %res, ptr addrspace(1) %out
 330   ret void
 331 }
 332
 ; Calls llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8 with <2 x i32> %A, <4 x i32> %B,
 ; <8 x float> %C and i16 %Index.
 ; The assertions expect a single v_swmmac_f32_16x16x32_bf8_bf8 whose last
 ; operand is one VGPR (v14), after which the <8 x float> result is written to
 ; %out by two global_store_b128 instructions under s_clause 0x1.
 333 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 334 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
 335 ; GFX12:       ; %bb.0: ; %bb
 336 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
 337 ; GFX12-NEXT:    s_clause 0x1
 338 ; GFX12-NEXT:    global_store_b128 v[15:16], v[6:9], off
 339 ; GFX12-NEXT:    global_store_b128 v[15:16], v[10:13], off offset:16
 340 ; GFX12-NEXT:    s_nop 0
 341 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 342 ; GFX12-NEXT:    s_endpgm
 343 bb:
 344   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 345   store <8 x float> %res, ptr addrspace(1) %out
 346   ret void
 347 }
 348
 ; Declarations of the WMMA and SWMMAC intrinsics called by the tests above, in
 ; the same order as the tests. Every i1 parameter is marked immarg, which is why
 ; the call sites pass literal constants for them.
 ; NOTE(review): the three swmmac.i32 declarations name their index parameter
 ; (%Index) while every other declaration leaves parameters unnamed; this looks
 ; purely cosmetic, confirm before normalizing.
 349 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
 350 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
 351 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
 352 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
 353 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
 354 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
 355 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
 356 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
 357 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
 358 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
 359 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
 360 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
 361 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
 362 declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
 363 declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
 364 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
 365 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
 366 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
 367 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 368 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 369 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
 370 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)