llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
   3
   4 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 f16 WMMA intrinsic with a constant splat of
     ; float 1.0 as the C operand (third argument) and stores the result to %out.
     ; The checked assembly passes the constant directly as the literal `1.0`.
   5 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
   6 ; GFX12:       ; %bb.0: ; %bb
   7 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
   8 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
   9 ; GFX12-NEXT:    s_endpgm
  10 bb:
  11   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  12   store <4 x float> %res, ptr addrspace(1) %out
  13   ret void
  14 }
  15
  16 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 f16 WMMA intrinsic with a constant splat of
     ; float 3.0 as the C operand and stores the result to %out.
     ; The checked assembly does not use a literal operand here: it moves
     ; 0x40400000 into v6, copies v6 into v7-v9, and passes v[6:9] as C.
  17 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
  18 ; GFX12:       ; %bb.0: ; %bb
  19 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
  20 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
  21 ; GFX12-NEXT:    v_mov_b32_e32 v7, v6
  22 ; GFX12-NEXT:    v_mov_b32_e32 v8, v6
  23 ; GFX12-NEXT:    v_mov_b32_e32 v9, v6
  24 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
  25 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
  26 ; GFX12-NEXT:    s_endpgm
  27 bb:
  28   %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  29   store <4 x float> %res, ptr addrspace(1) %out
  30   ret void
  31 }
  32
  33 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 bf16 WMMA intrinsic (A and B are <4 x i16>) with
     ; a constant splat of float 1.0 as the C operand and stores the result
     ; to %out. The checked assembly passes the constant as the literal `1.0`.
  34 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
  35 ; GFX12:       ; %bb.0: ; %bb
  36 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
  37 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
  38 ; GFX12-NEXT:    s_endpgm
  39 bb:
  40   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  41   store <4 x float> %res, ptr addrspace(1) %out
  42   ret void
  43 }
  44
  45 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 bf16 WMMA intrinsic with a constant splat of
     ; float 3.0 as the C operand and stores the result to %out.
     ; The checked assembly moves 0x40400000 into v6, copies v6 into v7-v9,
     ; and passes v[6:9] as C instead of a literal operand.
  46 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
  47 ; GFX12:       ; %bb.0: ; %bb
  48 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
  49 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
  50 ; GFX12-NEXT:    v_mov_b32_e32 v7, v6
  51 ; GFX12-NEXT:    v_mov_b32_e32 v8, v6
  52 ; GFX12-NEXT:    v_mov_b32_e32 v9, v6
  53 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
  54 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
  55 ; GFX12-NEXT:    s_endpgm
  56 bb:
  57   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
  58   store <4 x float> %res, ptr addrspace(1) %out
  59   ret void
  60 }
  61
  62 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
     ; Calls the f16 16x16x16 f16 WMMA intrinsic with a constant splat of
     ; half 1.0 as the C operand and the trailing i1 immarg set to 0, then
     ; stores the <4 x half> result to %out.
     ; The checked assembly passes the constant as the literal `1.0`.
  63 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
  64 ; GFX12:       ; %bb.0: ; %bb
  65 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
  66 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
  67 ; GFX12-NEXT:    s_endpgm
  68 bb:
  69   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  70   store <4 x half> %res, ptr addrspace(1) %out
  71   ret void
  72 }
  73
  74 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
     ; Calls the f16 16x16x16 f16 WMMA intrinsic with a constant splat of
     ; half 3.0 as the C operand (trailing i1 immarg = 0) and stores the
     ; result to %out.
     ; The checked assembly moves 0x42004200 (the pattern 0x4200 in both
     ; 16-bit halves) into v6, copies it to v7, and passes v[6:7] as C.
  75 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
  76 ; GFX12:       ; %bb.0: ; %bb
  77 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x42004200
  78 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
  79 ; GFX12-NEXT:    v_mov_b32_e32 v7, v6
  80 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
  81 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
  82 ; GFX12-NEXT:    s_endpgm
  83 bb:
  84   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
  85   store <4 x half> %res, ptr addrspace(1) %out
  86   ret void
  87 }
  88
  89 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
     ; Calls the bf16 16x16x16 bf16 WMMA intrinsic with a constant splat of
     ; i16 16256 (0x3f80) as the C operand (trailing i1 immarg = 0) and stores
     ; the <4 x i16> result to %out.
     ; Note: although this is the "_imm" variant, the checked assembly does
     ; not use a literal operand; it moves 0x3f803f80 into v6, copies it to
     ; v7, and passes v[6:7] as C.
  90 ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
  91 ; GFX12:       ; %bb.0: ; %bb
  92 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x3f803f80
  93 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
  94 ; GFX12-NEXT:    v_mov_b32_e32 v7, v6
  95 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
  96 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
  97 ; GFX12-NEXT:    s_endpgm
  98 bb:
  99   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
 100   store <4 x i16> %res, ptr addrspace(1) %out
 101   ret void
 102 }
 103
 104 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
     ; Calls the bf16 16x16x16 bf16 WMMA intrinsic with a constant splat of
     ; i16 16320 (0x3fc0) as the C operand (trailing i1 immarg = 0) and stores
     ; the <4 x i16> result to %out.
     ; The checked assembly moves 0x3fc03fc0 into v6, copies it to v7, and
     ; passes v[6:7] as C.
 105 ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
 106 ; GFX12:       ; %bb.0: ; %bb
 107 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x3fc03fc0
 108 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 109 ; GFX12-NEXT:    v_mov_b32_e32 v7, v6
 110 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
 111 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
 112 ; GFX12-NEXT:    s_endpgm
 113 bb:
 114   %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
 115   store <4 x i16> %res, ptr addrspace(1) %out
 116   ret void
 117 }
 118
 119 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
     ; Calls the i32 16x16x16 iu8 WMMA intrinsic with a constant splat of
     ; i32 1 as the C operand and stores the <4 x i32> result to %out.
     ; All three i1 immarg flags are passed as 0 (their meaning is not
     ; visible in this file). The checked assembly passes C as the literal `1`.
 120 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
 121 ; GFX12:       ; %bb.0: ; %bb
 122 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
 123 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 124 ; GFX12-NEXT:    s_endpgm
 125 bb:
 126   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
 127   store <4 x i32> %res, ptr addrspace(1) %out
 128   ret void
 129 }
 130
 131 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
     ; Calls the i32 16x16x16 iu8 WMMA intrinsic with a constant splat of
     ; i32 128 as the C operand (all i1 immarg flags = 0) and stores the
     ; result to %out.
     ; The checked assembly moves 0x80 into v4, copies v4 into v5-v7, and
     ; passes v[4:7] as C instead of a literal operand.
 132 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
 133 ; GFX12:       ; %bb.0: ; %bb
 134 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
 135 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 136 ; GFX12-NEXT:    v_mov_b32_e32 v5, v4
 137 ; GFX12-NEXT:    v_mov_b32_e32 v6, v4
 138 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 139 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
 140 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 141 ; GFX12-NEXT:    s_endpgm
 142 bb:
 143   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
 144   store <4 x i32> %res, ptr addrspace(1) %out
 145   ret void
 146 }
 147
 148 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
     ; Calls the i32 16x16x16 iu4 WMMA intrinsic with a constant splat of
     ; i32 1 as the C operand (all i1 immarg flags = 0) and stores the
     ; <4 x i32> result to %out.
     ; The checked assembly passes C as the literal `1`.
 149 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
 150 ; GFX12:       ; %bb.0: ; %bb
 151 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
 152 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 153 ; GFX12-NEXT:    s_endpgm
 154 bb:
 155   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
 156   store <4 x i32> %res, ptr addrspace(1) %out
 157   ret void
 158 }
 159
 160 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
     ; Calls the i32 16x16x16 iu4 WMMA intrinsic with a constant splat of
     ; i32 128 as the C operand (all i1 immarg flags = 0) and stores the
     ; result to %out.
     ; The checked assembly moves 0x80 into v4, copies v4 into v5-v7, and
     ; passes v[4:7] as C instead of a literal operand.
 161 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
 162 ; GFX12:       ; %bb.0: ; %bb
 163 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
 164 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 165 ; GFX12-NEXT:    v_mov_b32_e32 v5, v4
 166 ; GFX12-NEXT:    v_mov_b32_e32 v6, v4
 167 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 168 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
 169 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 170 ; GFX12-NEXT:    s_endpgm
 171 bb:
 172   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
 173   store <4 x i32> %res, ptr addrspace(1) %out
 174   ret void
 175 }
 176
 177 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 fp8/fp8 WMMA intrinsic with a constant splat of
     ; float 1.0 as the C operand and stores the result to %out.
     ; The %C parameter is not referenced in the body; the constant is used
     ; instead. The checked assembly passes it as the literal `1.0`.
 178 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
 179 ; GFX12:       ; %bb.0: ; %bb
 180 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
 181 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 182 ; GFX12-NEXT:    s_endpgm
 183 bb:
 184   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
 185   store <4 x float> %res, ptr addrspace(1) %out
 186   ret void
 187 }
 188
 189 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 fp8/fp8 WMMA intrinsic with a constant splat of
     ; float 3.0 as the C operand and stores the result to %out.
     ; The %C parameter is not referenced in the body. The checked assembly
     ; moves 0x40400000 into v4, copies v4 into v5-v7, and passes v[4:7] as C.
 190 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
 191 ; GFX12:       ; %bb.0: ; %bb
 192 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
 193 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 194 ; GFX12-NEXT:    v_mov_b32_e32 v5, v4
 195 ; GFX12-NEXT:    v_mov_b32_e32 v6, v4
 196 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 197 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
 198 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 199 ; GFX12-NEXT:    s_endpgm
 200 bb:
 201   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
 202   store <4 x float> %res, ptr addrspace(1) %out
 203   ret void
 204 }
 205
 206 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 bf8/fp8 WMMA intrinsic with a constant splat of
     ; float 1.0 as the C operand and stores the result to %out.
     ; The %C parameter is not referenced in the body; the constant is used
     ; instead. The checked assembly passes it as the literal `1.0`.
 207 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
 208 ; GFX12:       ; %bb.0: ; %bb
 209 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
 210 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 211 ; GFX12-NEXT:    s_endpgm
 212 bb:
 213   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
 214   store <4 x float> %res, ptr addrspace(1) %out
 215   ret void
 216 }
 217
 218 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 bf8/fp8 WMMA intrinsic with a constant splat of
     ; float 3.0 as the C operand and stores the result to %out.
     ; The %C parameter is not referenced in the body. The checked assembly
     ; moves 0x40400000 into v4, copies v4 into v5-v7, and passes v[4:7] as C.
 219 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
 220 ; GFX12:       ; %bb.0: ; %bb
 221 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
 222 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 223 ; GFX12-NEXT:    v_mov_b32_e32 v5, v4
 224 ; GFX12-NEXT:    v_mov_b32_e32 v6, v4
 225 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 226 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
 227 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 228 ; GFX12-NEXT:    s_endpgm
 229 bb:
 230   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
 231   store <4 x float> %res, ptr addrspace(1) %out
 232   ret void
 233 }
 234
 235 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 fp8/bf8 WMMA intrinsic with a constant splat of
     ; float 1.0 as the C operand and stores the result to %out.
     ; The %C parameter is not referenced in the body; the constant is used
     ; instead. The checked assembly passes it as the literal `1.0`.
 236 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
 237 ; GFX12:       ; %bb.0: ; %bb
 238 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
 239 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 240 ; GFX12-NEXT:    s_endpgm
 241 bb:
 242   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
 243   store <4 x float> %res, ptr addrspace(1) %out
 244   ret void
 245 }
 246
 247 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 fp8/bf8 WMMA intrinsic with a constant splat of
     ; float 3.0 as the C operand and stores the result to %out.
     ; The %C parameter is not referenced in the body. The checked assembly
     ; moves 0x40400000 into v4, copies v4 into v5-v7, and passes v[4:7] as C.
 248 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
 249 ; GFX12:       ; %bb.0: ; %bb
 250 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
 251 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 252 ; GFX12-NEXT:    v_mov_b32_e32 v5, v4
 253 ; GFX12-NEXT:    v_mov_b32_e32 v6, v4
 254 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 255 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
 256 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 257 ; GFX12-NEXT:    s_endpgm
 258 bb:
 259   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
 260   store <4 x float> %res, ptr addrspace(1) %out
 261   ret void
 262 }
 263
 264 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 bf8/bf8 WMMA intrinsic with a constant splat of
     ; float 1.0 as the C operand and stores the result to %out.
     ; The %C parameter is not referenced in the body; the constant is used
     ; instead. The checked assembly passes it as the literal `1.0`.
 265 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
 266 ; GFX12:       ; %bb.0: ; %bb
 267 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
 268 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 269 ; GFX12-NEXT:    s_endpgm
 270 bb:
 271   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
 272   store <4 x float> %res, ptr addrspace(1) %out
 273   ret void
 274 }
 275
 276 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
     ; Calls the f32 16x16x16 bf8/bf8 WMMA intrinsic with a constant splat of
     ; float 3.0 as the C operand and stores the result to %out.
     ; The %C parameter is not referenced in the body. The checked assembly
     ; moves 0x40400000 into v4, copies v4 into v5-v7, and passes v[4:7] as C.
 277 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
 278 ; GFX12:       ; %bb.0: ; %bb
 279 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0x40400000
 280 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 281 ; GFX12-NEXT:    v_mov_b32_e32 v5, v4
 282 ; GFX12-NEXT:    v_mov_b32_e32 v6, v4
 283 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 284 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
 285 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 286 ; GFX12-NEXT:    s_endpgm
 287 bb:
 288   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
 289   store <4 x float> %res, ptr addrspace(1) %out
 290   ret void
 291 }
 292
 293 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
     ; Calls the i32 16x16x32 iu4 WMMA intrinsic with a constant splat of
     ; i32 1 as the C operand (all i1 immarg flags = 0) and stores the
     ; <4 x i32> result to %out.
     ; The %C parameter is not referenced in the body; the constant is used
     ; instead. The checked assembly passes it as the literal `1`.
 294 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
 295 ; GFX12:       ; %bb.0: ; %bb
 296 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
 297 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 298 ; GFX12-NEXT:    s_endpgm
 299 bb:
 300   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
 301   store <4 x i32> %res, ptr addrspace(1) %out
 302   ret void
 303 }
 304
 305 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
     ; Calls the i32 16x16x32 iu4 WMMA intrinsic with a constant splat of
     ; i32 128 as the C operand (all i1 immarg flags = 0) and stores the
     ; result to %out.
     ; The %C parameter is not referenced in the body. The checked assembly
     ; moves 0x80 into v4, copies v4 into v5-v7, and passes v[4:7] as C.
 306 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
 307 ; GFX12:       ; %bb.0: ; %bb
 308 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
 309 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 310 ; GFX12-NEXT:    v_mov_b32_e32 v5, v4
 311 ; GFX12-NEXT:    v_mov_b32_e32 v6, v4
 312 ; GFX12-NEXT:    v_mov_b32_e32 v7, v4
 313 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
 314 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 315 ; GFX12-NEXT:    s_endpgm
 316 bb:
 317   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
 318   store <4 x i32> %res, ptr addrspace(1) %out
 319   ret void
 320 }
 321
 322 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half>, <4 x half>, <4 x float>)
     ; The f16 declaration above and the bf16 one below take (A, B, C) and
     ; return <4 x float>, the same type as C; neither has an immarg flag.
 323 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16>, <4 x i16>, <4 x float>)
     ; 16-bit-result variants: (A, B, C, i1 immarg); the result type matches C.
 324 declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
 325 declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
     ; Integer 16x16x16 variants: (i1 immarg, A, i1 immarg, B, C, i1 immarg)
     ; with i32 A/B and a <4 x i32> C and result.
 326 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
 327 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
     ; fp8/bf8 variants: (A, B, C) with i32 A/B and a <4 x float> C and result.
 328 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32, i32, <4 x float>)
 329 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32, i32, <4 x float>)
 330 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32, i32, <4 x float>)
 331 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32, i32, <4 x float>)
     ; Integer 16x16x32 iu4 variant: same signature shape as the 16x16x16
     ; integer intrinsics.
 332 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)