; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
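; Each test below loads a packed vector of lane indices and passes successive
; elements as the index operand of the llvm.amdgcn.swmmac.* intrinsics, checking
; that the non-zero element positions are selected through the index_key:1/2/3
; modifier on the generated v_swmmac instructions.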

define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
; GFX12-NEXT: v_mov_b32_e32 v23, v9
; GFX12-NEXT: v_mov_b32_e32 v22, v8
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v9
; GFX12-NEXT: v_mov_b32_e32 v26, v8
; GFX12-NEXT: v_mov_b32_e32 v25, v7
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v31, v9
; GFX12-NEXT: v_mov_b32_e32 v30, v8
; GFX12-NEXT: v_mov_b32_e32 v29, v7
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v10, v[10:11], off
; GFX12-NEXT: v_mov_b32_e32 v23, v9
; GFX12-NEXT: v_mov_b32_e32 v22, v8
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v9
; GFX12-NEXT: v_mov_b32_e32 v26, v8
; GFX12-NEXT: v_mov_b32_e32 v25, v7
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v31, v9
; GFX12-NEXT: v_mov_b32_e32 v30, v8
; GFX12-NEXT: v_mov_b32_e32 v29, v7
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
; GFX12-NEXT: v_mov_b32_e32 v9, v7
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v7
; GFX12-NEXT: v_mov_b32_e32 v18, v6
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0)
  store <4 x half> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1)
  store <4 x half> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2)
  store <4 x half> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3)
  store <4 x half> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v22, v[8:9], off
; GFX12-NEXT: v_mov_b32_e32 v9, v7
; GFX12-NEXT: v_mov_b32_e32 v8, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v7
; GFX12-NEXT: v_mov_b32_e32 v18, v6
; GFX12-NEXT: v_mov_b32_e32 v21, v7
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0)
  store <4 x i16> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1)
  store <4 x i16> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2)
  store <4 x i16> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3)
  store <4 x i16> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0)
  store <4 x i32> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0)
  store <4 x i32> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0)
  store <4 x i32> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0)
  store <4 x i32> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v6, v[6:7], off
; GFX12-NEXT: v_mov_b32_e32 v15, v5
; GFX12-NEXT: v_mov_b32_e32 v14, v4
; GFX12-NEXT: v_mov_b32_e32 v13, v3
; GFX12-NEXT: v_mov_b32_e32 v12, v2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
  %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0)
  store <4 x i32> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
  %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0)
  store <4 x i32> %res1, ptr addrspace(1) %out1
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v16, v6
; GFX12-NEXT: v_mov_b32_e32 v15, v5
; GFX12-NEXT: v_mov_b32_e32 v14, v4
; GFX12-NEXT: v_mov_b32_e32 v13, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
  %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0)
  store <4 x i32> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
  %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0)
  store <4 x i32> %res1, ptr addrspace(1) %out1
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: global_load_b32 v7, v[7:8], off
; GFX12-NEXT: v_mov_b32_e32 v20, v6
; GFX12-NEXT: v_mov_b32_e32 v19, v5
; GFX12-NEXT: v_mov_b32_e32 v18, v4
; GFX12-NEXT: v_mov_b32_e32 v17, v3
; GFX12-NEXT: v_mov_b32_e32 v24, v6
; GFX12-NEXT: v_mov_b32_e32 v23, v5
; GFX12-NEXT: v_mov_b32_e32 v22, v4
; GFX12-NEXT: v_mov_b32_e32 v21, v3
; GFX12-NEXT: v_mov_b32_e32 v28, v6
; GFX12-NEXT: v_mov_b32_e32 v27, v5
; GFX12-NEXT: v_mov_b32_e32 v26, v4
; GFX12-NEXT: v_mov_b32_e32 v25, v3
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: s_endpgm
bb:
  %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
  store <4 x float> %res0, ptr addrspace(1) %out0
  %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
  store <4 x float> %res1, ptr addrspace(1) %out1
  %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
  store <4 x float> %res2, ptr addrspace(1) %out2
  %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
  store <4 x float> %res3, ptr addrspace(1) %out3
  ret void
}

declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half>, <8 x half>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)