llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
   3
   4 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
     ; f16 inputs, f32 result. The accumulator argument is a constant splat of
     ; float 1.0; the expected code passes it to the WMMA instruction as the
     ; literal operand 1.0 with no VGPR setup, then writes the 8-float result
     ; to %out using two 128-bit stores.
   5 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
   6 ; GFX12:       ; %bb.0: ; %bb
   7 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
   8 ; GFX12-NEXT:    s_clause 0x1
   9 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
  10 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
  11 ; GFX12-NEXT:    s_endpgm
  12 bb:
  13   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  14   store <8 x float> %res, ptr addrspace(1) %out
  15   ret void
  16 }
  17
  18 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
     ; Same call as the _imm variant, but the accumulator is a splat of
     ; float 3.0 (bit pattern 0x40400000). The expected code loads that value
     ; into v10, copies it to v11..v17, and passes v[10:17] as the accumulator
     ; instead of a literal operand.
  19 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
  20 ; GFX12:       ; %bb.0: ; %bb
  21 ; GFX12-NEXT:    v_mov_b32_e32 v10, 0x40400000
  22 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
  23 ; GFX12-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
  24 ; GFX12-NEXT:    v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
  25 ; GFX12-NEXT:    v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
  26 ; GFX12-NEXT:    v_mov_b32_e32 v17, v10
  27 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
  28 ; GFX12-NEXT:    s_clause 0x1
  29 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
  30 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
  31 ; GFX12-NEXT:    s_endpgm
  32 bb:
  33   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  34   store <8 x float> %res, ptr addrspace(1) %out
  35   ret void
  36 }
  37
  38 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
     ; bf16 variant with an f32 result; A and B are passed as <8 x i16>. The
     ; accumulator is a constant splat of float 1.0, which the expected code
     ; passes to the WMMA instruction as the literal operand 1.0.
  39 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
  40 ; GFX12:       ; %bb.0: ; %bb
  41 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
  42 ; GFX12-NEXT:    s_clause 0x1
  43 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
  44 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
  45 ; GFX12-NEXT:    s_endpgm
  46 bb:
  47   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  48   store <8 x float> %res, ptr addrspace(1) %out
  49   ret void
  50 }
  51
  52 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
     ; bf16 variant with an f32 result and a splat-of-float-3.0 accumulator
     ; (bit pattern 0x40400000). The expected code fills v10..v17 with that
     ; value and passes v[10:17] as the accumulator instead of a literal.
  53 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
  54 ; GFX12:       ; %bb.0: ; %bb
  55 ; GFX12-NEXT:    v_mov_b32_e32 v10, 0x40400000
  56 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
  57 ; GFX12-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
  58 ; GFX12-NEXT:    v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
  59 ; GFX12-NEXT:    v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
  60 ; GFX12-NEXT:    v_mov_b32_e32 v17, v10
  61 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
  62 ; GFX12-NEXT:    s_clause 0x1
  63 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
  64 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
  65 ; GFX12-NEXT:    s_endpgm
  66 bb:
  67   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  68   store <8 x float> %res, ptr addrspace(1) %out
  69   ret void
  70 }
  71
  72 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
     ; f16 inputs and f16 result. The accumulator is a constant splat of
     ; half 1.0, which the expected code passes as the literal operand 1.0.
     ; The <8 x half> result occupies v[10:13] and is written with a single
     ; 128-bit store.
  73 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
  74 ; GFX12:       ; %bb.0: ; %bb
  75 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
  76 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
  77 ; GFX12-NEXT:    s_endpgm
  78 bb:
  79   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  80   store <8 x half> %res, ptr addrspace(1) %out
  81   ret void
  82 }
  83
  84 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
     ; f16 inputs and f16 result with a splat-of-half-3.0 accumulator. half 3.0
     ; is 0x4200, so each 32-bit register holds two copies (0x42004200). The
     ; expected code fills v10..v13 with that value and passes v[10:13] as the
     ; accumulator instead of a literal operand.
  85 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
  86 ; GFX12:       ; %bb.0: ; %bb
  87 ; GFX12-NEXT:    v_mov_b32_e32 v10, 0x42004200
  88 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
  89 ; GFX12-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
  90 ; GFX12-NEXT:    v_mov_b32_e32 v13, v10
  91 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
  92 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
  93 ; GFX12-NEXT:    s_endpgm
  94 bb:
  95   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
  96   store <8 x half> %res, ptr addrspace(1) %out
  97   ret void
  98 }
  99
 100 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
     ; bf16 inputs and bf16 result, all carried as <8 x i16>. The accumulator
     ; is a splat of i16 16256 (0x3f80, the bfloat16 encoding of 1.0).
     ; Despite the _imm name, the expected code does not use a literal
     ; operand here: it fills v10..v13 with the packed pair 0x3f803f80 and
     ; passes v[10:13] as the accumulator, like the non-inlineable variant.
 101 ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
 102 ; GFX12:       ; %bb.0: ; %bb
 103 ; GFX12-NEXT:    v_mov_b32_e32 v10, 0x3f803f80
 104 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 105 ; GFX12-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
 106 ; GFX12-NEXT:    v_mov_b32_e32 v13, v10
 107 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
 108 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
 109 ; GFX12-NEXT:    s_endpgm
 110 bb:
 111   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
 112   store <8 x i16> %res, ptr addrspace(1) %out
 113   ret void
 114 }
 115
 116 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
     ; bf16 inputs and bf16 result, all carried as <8 x i16>. The accumulator
     ; is a splat of i16 16320 (0x3fc0, the bfloat16 encoding of 1.5). The
     ; expected code fills v10..v13 with the packed pair 0x3fc03fc0 and passes
     ; v[10:13] as the accumulator.
 117 ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
 118 ; GFX12:       ; %bb.0: ; %bb
 119 ; GFX12-NEXT:    v_mov_b32_e32 v10, 0x3fc03fc0
 120 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 121 ; GFX12-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
 122 ; GFX12-NEXT:    v_mov_b32_e32 v13, v10
 123 ; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
 124 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
 125 ; GFX12-NEXT:    s_endpgm
 126 bb:
 127   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
 128   store <8 x i16> %res, ptr addrspace(1) %out
 129   ret void
 130 }
 131
 132 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; iu8 variant with an i32 result; A and B are passed as <2 x i32>. The
     ; accumulator is a constant splat of i32 1, which the expected code
     ; passes to the WMMA instruction as the literal operand 1.
 133 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
 134 ; GFX12:       ; %bb.0: ; %bb
 135 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
 136 ; GFX12-NEXT:    s_clause 0x1
 137 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 138 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 139 ; GFX12-NEXT:    s_endpgm
 140 bb:
 141   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
 142   store <8 x i32> %res, ptr addrspace(1) %out
 143   ret void
 144 }
 145
 146 define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; iu8 variant with a splat-of-i32-128 accumulator. The expected code
     ; loads 0x80 into v6, copies it to v7..v13, and passes v[6:13] as the
     ; accumulator instead of a literal operand.
 147 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
 148 ; GFX12:       ; %bb.0: ; %bb
 149 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x80
 150 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 151 ; GFX12-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
 152 ; GFX12-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
 153 ; GFX12-NEXT:    v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
 154 ; GFX12-NEXT:    v_mov_b32_e32 v13, v6
 155 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
 156 ; GFX12-NEXT:    s_clause 0x1
 157 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 158 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 159 ; GFX12-NEXT:    s_endpgm
 160 bb:
 161   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
 162   store <8 x i32> %res, ptr addrspace(1) %out
 163   ret void
 164 }
 165
 166 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
     ; iu4 variant with an i32 result; A and B are single i32 values, which
     ; arrive in v0 and v1. The accumulator is a constant splat of i32 1,
     ; which the expected code passes as the literal operand 1.
 167 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
 168 ; GFX12:       ; %bb.0: ; %bb
 169 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
 170 ; GFX12-NEXT:    s_clause 0x1
 171 ; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
 172 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 173 ; GFX12-NEXT:    s_endpgm
 174 bb:
 175   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
 176   store <8 x i32> %res, ptr addrspace(1) %out
 177   ret void
 178 }
 179
 180 define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
     ; iu4 variant (A and B are single i32 values) with a splat-of-i32-128
     ; accumulator. The expected code loads 0x80 into v4, copies it to
     ; v5..v11, and passes v[4:11] as the accumulator instead of a literal.
 181 ; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
 182 ; GFX12:       ; %bb.0: ; %bb
 183 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0x80
 184 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 185 ; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
 186 ; GFX12-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
 187 ; GFX12-NEXT:    v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4
 188 ; GFX12-NEXT:    v_mov_b32_e32 v11, v4
 189 ; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
 190 ; GFX12-NEXT:    s_clause 0x1
 191 ; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
 192 ; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
 193 ; GFX12-NEXT:    s_endpgm
 194 bb:
 195   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
 196   store <8 x i32> %res, ptr addrspace(1) %out
 197   ret void
 198 }
 199
 200 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; fp8 x fp8 variant with an f32 result; A and B are passed as <2 x i32>.
     ; The accumulator is a constant splat of float 1.0, which the expected
     ; code passes to the WMMA instruction as the literal operand 1.0.
 201 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
 202 ; GFX12:       ; %bb.0: ; %bb
 203 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
 204 ; GFX12-NEXT:    s_clause 0x1
 205 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 206 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 207 ; GFX12-NEXT:    s_endpgm
 208 bb:
 209   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
 210   store <8 x float> %res, ptr addrspace(1) %out
 211   ret void
 212 }
 213
 214 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; fp8 x fp8 variant with a splat-of-float-3.0 accumulator (bit pattern
     ; 0x40400000). The expected code fills v6..v13 with that value and
     ; passes v[6:13] as the accumulator instead of a literal operand.
 215 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
 216 ; GFX12:       ; %bb.0: ; %bb
 217 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
 218 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 219 ; GFX12-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
 220 ; GFX12-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
 221 ; GFX12-NEXT:    v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
 222 ; GFX12-NEXT:    v_mov_b32_e32 v13, v6
 223 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
 224 ; GFX12-NEXT:    s_clause 0x1
 225 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 226 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 227 ; GFX12-NEXT:    s_endpgm
 228 bb:
 229   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
 230   store <8 x float> %res, ptr addrspace(1) %out
 231   ret void
 232 }
 233
 234 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; bf8 x fp8 variant with an f32 result; A and B are passed as <2 x i32>.
     ; The accumulator is a constant splat of float 1.0, which the expected
     ; code passes to the WMMA instruction as the literal operand 1.0.
 235 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
 236 ; GFX12:       ; %bb.0: ; %bb
 237 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
 238 ; GFX12-NEXT:    s_clause 0x1
 239 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 240 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 241 ; GFX12-NEXT:    s_endpgm
 242 bb:
 243   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
 244   store <8 x float> %res, ptr addrspace(1) %out
 245   ret void
 246 }
 247
 248 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; bf8 x fp8 variant with a splat-of-float-3.0 accumulator (bit pattern
     ; 0x40400000). The expected code fills v6..v13 with that value and
     ; passes v[6:13] as the accumulator instead of a literal operand.
 249 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
 250 ; GFX12:       ; %bb.0: ; %bb
 251 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
 252 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 253 ; GFX12-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
 254 ; GFX12-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
 255 ; GFX12-NEXT:    v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
 256 ; GFX12-NEXT:    v_mov_b32_e32 v13, v6
 257 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
 258 ; GFX12-NEXT:    s_clause 0x1
 259 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 260 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 261 ; GFX12-NEXT:    s_endpgm
 262 bb:
 263   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
 264   store <8 x float> %res, ptr addrspace(1) %out
 265   ret void
 266 }
 267
 268 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; fp8 x bf8 variant with an f32 result; A and B are passed as <2 x i32>.
     ; The accumulator is a constant splat of float 1.0, which the expected
     ; code passes to the WMMA instruction as the literal operand 1.0.
 269 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
 270 ; GFX12:       ; %bb.0: ; %bb
 271 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
 272 ; GFX12-NEXT:    s_clause 0x1
 273 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 274 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 275 ; GFX12-NEXT:    s_endpgm
 276 bb:
 277   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
 278   store <8 x float> %res, ptr addrspace(1) %out
 279   ret void
 280 }
 281
 282 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; fp8 x bf8 variant with a splat-of-float-3.0 accumulator (bit pattern
     ; 0x40400000). The expected code fills v6..v13 with that value and
     ; passes v[6:13] as the accumulator instead of a literal operand.
 283 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
 284 ; GFX12:       ; %bb.0: ; %bb
 285 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
 286 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 287 ; GFX12-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
 288 ; GFX12-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
 289 ; GFX12-NEXT:    v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
 290 ; GFX12-NEXT:    v_mov_b32_e32 v13, v6
 291 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
 292 ; GFX12-NEXT:    s_clause 0x1
 293 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 294 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 295 ; GFX12-NEXT:    s_endpgm
 296 bb:
 297   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
 298   store <8 x float> %res, ptr addrspace(1) %out
 299   ret void
 300 }
 301
 302 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; bf8 x bf8 variant with an f32 result; A and B are passed as <2 x i32>.
     ; The accumulator is a constant splat of float 1.0, which the expected
     ; code passes to the WMMA instruction as the literal operand 1.0.
 303 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
 304 ; GFX12:       ; %bb.0: ; %bb
 305 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
 306 ; GFX12-NEXT:    s_clause 0x1
 307 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 308 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 309 ; GFX12-NEXT:    s_endpgm
 310 bb:
 311   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
 312   store <8 x float> %res, ptr addrspace(1) %out
 313   ret void
 314 }
 315
 316 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; bf8 x bf8 variant with a splat-of-float-3.0 accumulator (bit pattern
     ; 0x40400000). The expected code fills v6..v13 with that value and
     ; passes v[6:13] as the accumulator instead of a literal operand.
 317 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
 318 ; GFX12:       ; %bb.0: ; %bb
 319 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x40400000
 320 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 321 ; GFX12-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
 322 ; GFX12-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
 323 ; GFX12-NEXT:    v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
 324 ; GFX12-NEXT:    v_mov_b32_e32 v13, v6
 325 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
 326 ; GFX12-NEXT:    s_clause 0x1
 327 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 328 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 329 ; GFX12-NEXT:    s_endpgm
 330 bb:
 331   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
 332   store <8 x float> %res, ptr addrspace(1) %out
 333   ret void
 334 }
 335
 336 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; 16x16x32 iu4 variant with an i32 result; A and B are passed as
     ; <2 x i32>. The accumulator is a constant splat of i32 1, which the
     ; expected code passes to the WMMA instruction as the literal operand 1.
 337 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
 338 ; GFX12:       ; %bb.0: ; %bb
 339 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
 340 ; GFX12-NEXT:    s_clause 0x1
 341 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 342 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 343 ; GFX12-NEXT:    s_endpgm
 344 bb:
 345   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
 346   store <8 x i32> %res, ptr addrspace(1) %out
 347   ret void
 348 }
 349
 350 define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
     ; 16x16x32 iu4 variant with a splat-of-i32-128 accumulator. The expected
     ; code loads 0x80 into v6, copies it to v7..v13, and passes v[6:13] as
     ; the accumulator instead of a literal operand.
 351 ; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
 352 ; GFX12:       ; %bb.0: ; %bb
 353 ; GFX12-NEXT:    v_mov_b32_e32 v6, 0x80
 354 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 355 ; GFX12-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
 356 ; GFX12-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
 357 ; GFX12-NEXT:    v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
 358 ; GFX12-NEXT:    v_mov_b32_e32 v13, v6
 359 ; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
 360 ; GFX12-NEXT:    s_clause 0x1
 361 ; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
 362 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 363 ; GFX12-NEXT:    s_endpgm
 364 bb:
 365   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
 366   store <8 x i32> %res, ptr addrspace(1) %out
 367   ret void
 368 }
 369
 370 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half>, <8 x half>, <8 x float>)
 371 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16>, <8 x i16>, <8 x float>)
 372 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
 373 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
 374 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
 375 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
 376 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
 377 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
 378 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
 379 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
 380 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)