; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
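; Instruction-selection tests for the GFX12 WMMA and sparse SWMMAC matrix
; intrinsics. Each function calls a single intrinsic and stores the result so
; the selected v_wmma_*/v_swmmac_* instruction and its register operands can
; be checked.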
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

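; With a 16-bit destination type (f16/bf16) the 16x16 result packs into four
; VGPRs, so a single global_store_b128 suffices instead of the clause of two
; stores used for the f32-accumulator cases above.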
define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out
  ret void
}

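; Integer (iu8/iu4) variants take packed i32 data for A and B; the i1
; immediate operands are the signedness flags for A and B plus a final clamp
; bit (all zero in these tests).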
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

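; fp8/bf8 variants: A and B are packed into <2 x i32>, and each of the four
; source-format combinations selects a distinct v_wmma opcode.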
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

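; iu4 with K=32: compared to the 16x16x16 iu4 test above, A and B carry twice
; the data per lane (<2 x i32> instead of i32).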
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

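; SWMMAC (sparse) variants: A is the compressed operand, B carries twice as
; much data as A, and the extra Index operand (a single VGPR, e.g. v20 below)
; supplies the sparsity indices for A.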
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
  store <8 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

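; The 16x16x64 iu4 variant doubles the A/B operand widths and takes an i32
; index rather than the i16 used by the K=32 forms.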
define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

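; SWMMAC fp8/bf8 combinations, analogous to the dense WMMA fp8/bf8 tests
; above but with <4 x i32> B data and an i16 index.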
define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

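; Intrinsic declarations for the calls above.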
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)