1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
; Splat <8 x float> 1.0 accumulator: checks it is selected directly as the
; inline immediate 1.0 operand of v_wmma_f32_16x16x16_f16 (no register
; materialization).
4 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
5 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
6 ; GFX12: ; %bb.0: ; %bb
7 ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
8 ; GFX12-NEXT: s_clause 0x1
9 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
10 ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
12 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
13 ; GFX12-NEXT: s_endpgm
15 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
16 store <8 x float> %res, ptr addrspace(1) %out
; Splat <8 x float> 3.0 (0x40400000) accumulator: 3.0 is not an inline
; immediate, so it is broadcast through SGPRs and moved into v[10:17] before
; the WMMA consumes it as a register operand.
20 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
21 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
22 ; GFX12: ; %bb.0: ; %bb
23 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
24 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
25 ; GFX12-NEXT: s_mov_b32 s7, s0
26 ; GFX12-NEXT: s_mov_b32 s1, s0
27 ; GFX12-NEXT: s_mov_b32 s2, s0
28 ; GFX12-NEXT: s_mov_b32 s3, s0
29 ; GFX12-NEXT: s_mov_b32 s4, s0
30 ; GFX12-NEXT: s_mov_b32 s5, s0
31 ; GFX12-NEXT: s_mov_b32 s6, s0
32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
33 ; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
34 ; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
35 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
36 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
37 ; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
38 ; GFX12-NEXT: s_clause 0x1
39 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
40 ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
42 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
43 ; GFX12-NEXT: s_endpgm
45 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
46 store <8 x float> %res, ptr addrspace(1) %out
; bf16 A/B operands (passed as <8 x i16>) with a splat float 1.0 accumulator:
; the f32 accumulator 1.0 still folds to the inline immediate operand of
; v_wmma_f32_16x16x16_bf16.
50 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
51 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
52 ; GFX12: ; %bb.0: ; %bb
53 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
54 ; GFX12-NEXT: s_clause 0x1
55 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
56 ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
58 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
59 ; GFX12-NEXT: s_endpgm
61 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
62 store <8 x float> %res, ptr addrspace(1) %out
; Non-inlineable splat 3.0 accumulator for the bf16-input variant: same SGPR
; broadcast + v_dual_mov materialization pattern as the f16 case above.
66 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
67 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
68 ; GFX12: ; %bb.0: ; %bb
69 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
70 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
71 ; GFX12-NEXT: s_mov_b32 s7, s0
72 ; GFX12-NEXT: s_mov_b32 s1, s0
73 ; GFX12-NEXT: s_mov_b32 s2, s0
74 ; GFX12-NEXT: s_mov_b32 s3, s0
75 ; GFX12-NEXT: s_mov_b32 s4, s0
76 ; GFX12-NEXT: s_mov_b32 s5, s0
77 ; GFX12-NEXT: s_mov_b32 s6, s0
78 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
79 ; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
80 ; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
81 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
82 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
83 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
84 ; GFX12-NEXT: s_clause 0x1
85 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
86 ; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
88 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
89 ; GFX12-NEXT: s_endpgm
91 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
92 store <8 x float> %res, ptr addrspace(1) %out
; f16 accumulator variant: the splat <8 x half> 1.0 accumulator folds to the
; inline immediate 1.0 of v_wmma_f16_16x16x16_f16; result is a single
; 4-register tile (v[10:13]) stored with one b128 store.
96 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
97 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
98 ; GFX12: ; %bb.0: ; %bb
99 ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
100 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
101 ; GFX12-NEXT: s_nop 0
102 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
103 ; GFX12-NEXT: s_endpgm
105 %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
106 store <8 x half> %res, ptr addrspace(1) %out
; Splat half 3.0 accumulator: packs to 0x42004200 per dword, which is not an
; inline immediate, so four accumulator VGPRs are materialized via SGPR
; broadcast before the WMMA.
110 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
111 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
112 ; GFX12: ; %bb.0: ; %bb
113 ; GFX12-NEXT: s_mov_b32 s0, 0x42004200
114 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
115 ; GFX12-NEXT: s_mov_b32 s3, s0
116 ; GFX12-NEXT: s_mov_b32 s1, s0
117 ; GFX12-NEXT: s_mov_b32 s2, s0
118 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
119 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
120 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
121 ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
122 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
123 ; GFX12-NEXT: s_nop 0
124 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
125 ; GFX12-NEXT: s_endpgm
127 %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
128 store <8 x half> %res, ptr addrspace(1) %out
; bf16 accumulator of splat 1.0 (i16 16256 = 0x3f80; packed 0x3f803f80 per
; dword): unlike the f16/f32 cases, this packed pattern is NOT an inline
; immediate, so even the "1.0" accumulator is materialized in registers.
132 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
133 ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
134 ; GFX12: ; %bb.0: ; %bb
135 ; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
136 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
137 ; GFX12-NEXT: s_mov_b32 s3, s0
138 ; GFX12-NEXT: s_mov_b32 s1, s0
139 ; GFX12-NEXT: s_mov_b32 s2, s0
140 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
141 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
142 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
143 ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
144 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
145 ; GFX12-NEXT: s_nop 0
146 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
147 ; GFX12-NEXT: s_endpgm
149 %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
150 store <8 x i16> %res, ptr addrspace(1) %out
; bf16 accumulator of splat 1.5 (i16 16320 = 0x3fc0; packed 0x3fc03fc0):
; non-inlineable, materialized through SGPR broadcast like the case above.
154 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
155 ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
156 ; GFX12: ; %bb.0: ; %bb
157 ; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
158 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
159 ; GFX12-NEXT: s_mov_b32 s3, s0
160 ; GFX12-NEXT: s_mov_b32 s1, s0
161 ; GFX12-NEXT: s_mov_b32 s2, s0
162 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
163 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
164 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
165 ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
166 ; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
167 ; GFX12-NEXT: s_nop 0
168 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
169 ; GFX12-NEXT: s_endpgm
171 %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
172 store <8 x i16> %res, ptr addrspace(1) %out
; iu8 integer WMMA with a splat i32 1 accumulator: selected as the inline
; immediate 1 operand of v_wmma_i32_16x16x16_iu8.
176 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
177 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
178 ; GFX12: ; %bb.0: ; %bb
179 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
180 ; GFX12-NEXT: s_clause 0x1
181 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
182 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
183 ; GFX12-NEXT: s_nop 0
184 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
185 ; GFX12-NEXT: s_endpgm
187 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
188 store <8 x i32> %res, ptr addrspace(1) %out
; Splat i32 128 (0x80) accumulator: outside the inline-immediate range, so it
; is broadcast through SGPRs into v[6:13] before the iu8 WMMA.
192 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
193 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
194 ; GFX12: ; %bb.0: ; %bb
195 ; GFX12-NEXT: s_movk_i32 s0, 0x80
196 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
197 ; GFX12-NEXT: s_mov_b32 s7, s0
198 ; GFX12-NEXT: s_mov_b32 s1, s0
199 ; GFX12-NEXT: s_mov_b32 s2, s0
200 ; GFX12-NEXT: s_mov_b32 s3, s0
201 ; GFX12-NEXT: s_mov_b32 s4, s0
202 ; GFX12-NEXT: s_mov_b32 s5, s0
203 ; GFX12-NEXT: s_mov_b32 s6, s0
204 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
205 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
206 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
207 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
208 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
209 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
210 ; GFX12-NEXT: s_clause 0x1
211 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
212 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
213 ; GFX12-NEXT: s_nop 0
214 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
215 ; GFX12-NEXT: s_endpgm
217 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
218 store <8 x i32> %res, ptr addrspace(1) %out
; iu4 WMMA (scalar i32 A/B operands) with a splat i32 1 accumulator: folds to
; the inline immediate 1 operand of v_wmma_i32_16x16x16_iu4.
222 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
223 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
224 ; GFX12: ; %bb.0: ; %bb
225 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
226 ; GFX12-NEXT: s_clause 0x1
227 ; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
228 ; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
229 ; GFX12-NEXT: s_nop 0
230 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
231 ; GFX12-NEXT: s_endpgm
233 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
234 store <8 x i32> %res, ptr addrspace(1) %out
; Splat i32 128 accumulator for the iu4 variant: non-inlineable, materialized
; into v[4:11] via SGPR broadcast before the WMMA.
238 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
239 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
240 ; GFX12: ; %bb.0: ; %bb
241 ; GFX12-NEXT: s_movk_i32 s0, 0x80
242 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
243 ; GFX12-NEXT: s_mov_b32 s7, s0
244 ; GFX12-NEXT: s_mov_b32 s1, s0
245 ; GFX12-NEXT: s_mov_b32 s2, s0
246 ; GFX12-NEXT: s_mov_b32 s3, s0
247 ; GFX12-NEXT: s_mov_b32 s4, s0
248 ; GFX12-NEXT: s_mov_b32 s5, s0
249 ; GFX12-NEXT: s_mov_b32 s6, s0
250 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
251 ; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
252 ; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
253 ; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
254 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
255 ; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
256 ; GFX12-NEXT: s_clause 0x1
257 ; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
258 ; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
259 ; GFX12-NEXT: s_nop 0
260 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
261 ; GFX12-NEXT: s_endpgm
263 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
264 store <8 x i32> %res, ptr addrspace(1) %out
; fp8 x fp8 WMMA with a splat float 1.0 accumulator: folds to the inline
; immediate 1.0 operand of v_wmma_f32_16x16x16_fp8_fp8.
268 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
269 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
270 ; GFX12: ; %bb.0: ; %bb
271 ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
272 ; GFX12-NEXT: s_clause 0x1
273 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
274 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
275 ; GFX12-NEXT: s_nop 0
276 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
277 ; GFX12-NEXT: s_endpgm
279 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
280 store <8 x float> %res, ptr addrspace(1) %out
; fp8 x fp8 WMMA with non-inlineable splat 3.0 accumulator: materialized into
; v[6:13] through SGPR broadcast before the WMMA.
284 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
285 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
286 ; GFX12: ; %bb.0: ; %bb
287 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
288 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
289 ; GFX12-NEXT: s_mov_b32 s7, s0
290 ; GFX12-NEXT: s_mov_b32 s1, s0
291 ; GFX12-NEXT: s_mov_b32 s2, s0
292 ; GFX12-NEXT: s_mov_b32 s3, s0
293 ; GFX12-NEXT: s_mov_b32 s4, s0
294 ; GFX12-NEXT: s_mov_b32 s5, s0
295 ; GFX12-NEXT: s_mov_b32 s6, s0
296 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
297 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
298 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
299 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
300 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
301 ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
302 ; GFX12-NEXT: s_clause 0x1
303 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
304 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
305 ; GFX12-NEXT: s_nop 0
306 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
307 ; GFX12-NEXT: s_endpgm
309 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
310 store <8 x float> %res, ptr addrspace(1) %out
; bf8 x fp8 WMMA with a splat float 1.0 accumulator: folds to the inline
; immediate 1.0 operand of v_wmma_f32_16x16x16_bf8_fp8.
314 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
315 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
316 ; GFX12: ; %bb.0: ; %bb
317 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
318 ; GFX12-NEXT: s_clause 0x1
319 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
320 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
321 ; GFX12-NEXT: s_nop 0
322 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
323 ; GFX12-NEXT: s_endpgm
325 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
326 store <8 x float> %res, ptr addrspace(1) %out
; bf8 x fp8 WMMA with non-inlineable splat 3.0 accumulator: materialized into
; v[6:13] through SGPR broadcast before the WMMA.
330 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
331 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
332 ; GFX12: ; %bb.0: ; %bb
333 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
334 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
335 ; GFX12-NEXT: s_mov_b32 s7, s0
336 ; GFX12-NEXT: s_mov_b32 s1, s0
337 ; GFX12-NEXT: s_mov_b32 s2, s0
338 ; GFX12-NEXT: s_mov_b32 s3, s0
339 ; GFX12-NEXT: s_mov_b32 s4, s0
340 ; GFX12-NEXT: s_mov_b32 s5, s0
341 ; GFX12-NEXT: s_mov_b32 s6, s0
342 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
343 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
344 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
345 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
346 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
347 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
348 ; GFX12-NEXT: s_clause 0x1
349 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
350 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
351 ; GFX12-NEXT: s_nop 0
352 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
353 ; GFX12-NEXT: s_endpgm
355 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
356 store <8 x float> %res, ptr addrspace(1) %out
; fp8 x bf8 WMMA with a splat float 1.0 accumulator: folds to the inline
; immediate 1.0 operand of v_wmma_f32_16x16x16_fp8_bf8.
360 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
361 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
362 ; GFX12: ; %bb.0: ; %bb
363 ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
364 ; GFX12-NEXT: s_clause 0x1
365 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
366 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
367 ; GFX12-NEXT: s_nop 0
368 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
369 ; GFX12-NEXT: s_endpgm
371 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
372 store <8 x float> %res, ptr addrspace(1) %out
; fp8 x bf8 WMMA with non-inlineable splat 3.0 accumulator: materialized into
; v[6:13] through SGPR broadcast before the WMMA.
376 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
377 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
378 ; GFX12: ; %bb.0: ; %bb
379 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
380 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
381 ; GFX12-NEXT: s_mov_b32 s7, s0
382 ; GFX12-NEXT: s_mov_b32 s1, s0
383 ; GFX12-NEXT: s_mov_b32 s2, s0
384 ; GFX12-NEXT: s_mov_b32 s3, s0
385 ; GFX12-NEXT: s_mov_b32 s4, s0
386 ; GFX12-NEXT: s_mov_b32 s5, s0
387 ; GFX12-NEXT: s_mov_b32 s6, s0
388 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
389 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
390 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
391 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
392 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
393 ; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
394 ; GFX12-NEXT: s_clause 0x1
395 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
396 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
397 ; GFX12-NEXT: s_nop 0
398 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
399 ; GFX12-NEXT: s_endpgm
401 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
402 store <8 x float> %res, ptr addrspace(1) %out
; bf8 x bf8 WMMA with a splat float 1.0 accumulator: folds to the inline
; immediate 1.0 operand of v_wmma_f32_16x16x16_bf8_bf8.
406 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
407 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
408 ; GFX12: ; %bb.0: ; %bb
409 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
410 ; GFX12-NEXT: s_clause 0x1
411 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
412 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
413 ; GFX12-NEXT: s_nop 0
414 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
415 ; GFX12-NEXT: s_endpgm
417 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
418 store <8 x float> %res, ptr addrspace(1) %out
; bf8 x bf8 WMMA with non-inlineable splat 3.0 accumulator: materialized into
; v[6:13] through SGPR broadcast before the WMMA.
422 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
423 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
424 ; GFX12: ; %bb.0: ; %bb
425 ; GFX12-NEXT: s_mov_b32 s0, 0x40400000
426 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
427 ; GFX12-NEXT: s_mov_b32 s7, s0
428 ; GFX12-NEXT: s_mov_b32 s1, s0
429 ; GFX12-NEXT: s_mov_b32 s2, s0
430 ; GFX12-NEXT: s_mov_b32 s3, s0
431 ; GFX12-NEXT: s_mov_b32 s4, s0
432 ; GFX12-NEXT: s_mov_b32 s5, s0
433 ; GFX12-NEXT: s_mov_b32 s6, s0
434 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
435 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
436 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
437 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
438 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
439 ; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
440 ; GFX12-NEXT: s_clause 0x1
441 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
442 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
443 ; GFX12-NEXT: s_nop 0
444 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
445 ; GFX12-NEXT: s_endpgm
447 %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
448 store <8 x float> %res, ptr addrspace(1) %out
; K=32 iu4 WMMA (<2 x i32> A/B operands) with splat i32 1 accumulator: folds
; to the inline immediate 1 operand of v_wmma_i32_16x16x32_iu4.
452 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
453 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
454 ; GFX12: ; %bb.0: ; %bb
455 ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
456 ; GFX12-NEXT: s_clause 0x1
457 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
458 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
459 ; GFX12-NEXT: s_nop 0
460 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
461 ; GFX12-NEXT: s_endpgm
463 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
464 store <8 x i32> %res, ptr addrspace(1) %out
; K=32 iu4 WMMA with splat i32 128 accumulator: non-inlineable, materialized
; into v[6:13] via SGPR broadcast before the WMMA.
468 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
469 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
470 ; GFX12: ; %bb.0: ; %bb
471 ; GFX12-NEXT: s_movk_i32 s0, 0x80
472 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
473 ; GFX12-NEXT: s_mov_b32 s7, s0
474 ; GFX12-NEXT: s_mov_b32 s1, s0
475 ; GFX12-NEXT: s_mov_b32 s2, s0
476 ; GFX12-NEXT: s_mov_b32 s3, s0
477 ; GFX12-NEXT: s_mov_b32 s4, s0
478 ; GFX12-NEXT: s_mov_b32 s5, s0
479 ; GFX12-NEXT: s_mov_b32 s6, s0
480 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
481 ; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
482 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
483 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
484 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
485 ; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
486 ; GFX12-NEXT: s_clause 0x1
487 ; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
488 ; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
489 ; GFX12-NEXT: s_nop 0
490 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
491 ; GFX12-NEXT: s_endpgm
493 %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
494 store <8 x i32> %res, ptr addrspace(1) %out
; Declarations of the GFX12 WMMA intrinsics exercised by the tests above.
; NOTE(review): the llvm.amdgcn.swmmac.* declarations are not referenced by
; any test visible in this excerpt — presumably used by tests elsewhere in
; this file; confirm before removing.
498 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
499 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
500 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
501 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
502 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
503 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
504 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
505 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
506 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
507 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
508 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
509 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
510 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
511 declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
512 declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
513 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
514 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
515 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
516 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
517 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
518 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
519 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)