llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
   3
   4 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
     ; Loads a <4 x i8> index vector from %IndexVecPtr and calls the
     ; swmmac.f32.16x16x32.f16 intrinsic once per element, always with the same
     ; %A, %B and %C, storing the four results to %out0..%out3.
     ; The expected assembly has a single global_load_b32 whose result register
     ; is the index operand of all four v_swmmac instructions; three of them
     ; carry index_key:1, index_key:2 and index_key:3, the remaining one has no
     ; index_key modifier.
     ; NOTE(review): the v_mov_b32 copies of v6..v9 (%C) before the v_swmmac ops
     ; presumably exist because the accumulator is tied to the destination --
     ; confirm against the instruction definition.
   5 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
   6 ; GFX12:       ; %bb.0: ; %bb
   7 ; GFX12-NEXT:    global_load_b32 v10, v[10:11], off
   8 ; GFX12-NEXT:    v_mov_b32_e32 v23, v9
   9 ; GFX12-NEXT:    v_mov_b32_e32 v22, v8
  10 ; GFX12-NEXT:    v_mov_b32_e32 v21, v7
  11 ; GFX12-NEXT:    v_mov_b32_e32 v20, v6
  12 ; GFX12-NEXT:    v_mov_b32_e32 v27, v9
  13 ; GFX12-NEXT:    v_mov_b32_e32 v26, v8
  14 ; GFX12-NEXT:    v_mov_b32_e32 v25, v7
  15 ; GFX12-NEXT:    v_mov_b32_e32 v24, v6
  16 ; GFX12-NEXT:    v_mov_b32_e32 v31, v9
  17 ; GFX12-NEXT:    v_mov_b32_e32 v30, v8
  18 ; GFX12-NEXT:    v_mov_b32_e32 v29, v7
  19 ; GFX12-NEXT:    v_mov_b32_e32 v28, v6
  20 ; GFX12-NEXT:    s_wait_loadcnt 0x0
  21 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
  22 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
  23 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
  24 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
  25 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
  26 ; GFX12-NEXT:    global_store_b128 v[12:13], v[20:23], off
  27 ; GFX12-NEXT:    global_store_b128 v[14:15], v[24:27], off
  28 ; GFX12-NEXT:    global_store_b128 v[16:17], v[28:31], off
  29 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
  30 ; GFX12-NEXT:    s_nop 0
  31 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  32 ; GFX12-NEXT:    s_endpgm
  33 bb:
     ; One 4-byte-aligned load fetches all four i8 indices; each
     ; extractelement below picks one of them for its own intrinsic call.
  34   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  35   %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  36   %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0)
  37   store <4 x float> %res0, ptr addrspace(1) %out0
  38   %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  39   %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1)
  40   store <4 x float> %res1, ptr addrspace(1) %out1
  41   %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  42   %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2)
  43   store <4 x float> %res2, ptr addrspace(1) %out2
  44   %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  45   %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3)
  46   store <4 x float> %res3, ptr addrspace(1) %out3
  47   ret void
  48 }
  49
  50 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
     ; Loads a <4 x i8> index vector from %IndexVecPtr and calls the
     ; swmmac.f32.16x16x32.bf16 intrinsic once per element, always with the
     ; same %A, %B and %C (the A/B operands are typed as i16 vectors here),
     ; storing the four <4 x float> results to %out0..%out3.
     ; The expected assembly has a single global_load_b32 whose result register
     ; is the index operand of all four v_swmmac instructions; three of them
     ; carry index_key:1, index_key:2 and index_key:3, the remaining one has no
     ; index_key modifier.
  51 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
  52 ; GFX12:       ; %bb.0: ; %bb
  53 ; GFX12-NEXT:    global_load_b32 v10, v[10:11], off
  54 ; GFX12-NEXT:    v_mov_b32_e32 v23, v9
  55 ; GFX12-NEXT:    v_mov_b32_e32 v22, v8
  56 ; GFX12-NEXT:    v_mov_b32_e32 v21, v7
  57 ; GFX12-NEXT:    v_mov_b32_e32 v20, v6
  58 ; GFX12-NEXT:    v_mov_b32_e32 v27, v9
  59 ; GFX12-NEXT:    v_mov_b32_e32 v26, v8
  60 ; GFX12-NEXT:    v_mov_b32_e32 v25, v7
  61 ; GFX12-NEXT:    v_mov_b32_e32 v24, v6
  62 ; GFX12-NEXT:    v_mov_b32_e32 v31, v9
  63 ; GFX12-NEXT:    v_mov_b32_e32 v30, v8
  64 ; GFX12-NEXT:    v_mov_b32_e32 v29, v7
  65 ; GFX12-NEXT:    v_mov_b32_e32 v28, v6
  66 ; GFX12-NEXT:    s_wait_loadcnt 0x0
  67 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
  68 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
  69 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
  70 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
  71 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
  72 ; GFX12-NEXT:    global_store_b128 v[12:13], v[20:23], off
  73 ; GFX12-NEXT:    global_store_b128 v[14:15], v[24:27], off
  74 ; GFX12-NEXT:    global_store_b128 v[16:17], v[28:31], off
  75 ; GFX12-NEXT:    global_store_b128 v[18:19], v[6:9], off
  76 ; GFX12-NEXT:    s_nop 0
  77 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  78 ; GFX12-NEXT:    s_endpgm
  79 bb:
     ; One 4-byte-aligned load fetches all four i8 indices; each
     ; extractelement below picks one of them for its own intrinsic call.
  80   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
  81   %Index0 = extractelement <4 x i8> %IndexVec, i32 0
  82   %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0)
  83   store <4 x float> %res0, ptr addrspace(1) %out0
  84   %Index1 = extractelement <4 x i8> %IndexVec, i32 1
  85   %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1)
  86   store <4 x float> %res1, ptr addrspace(1) %out1
  87   %Index2 = extractelement <4 x i8> %IndexVec, i32 2
  88   %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2)
  89   store <4 x float> %res2, ptr addrspace(1) %out2
  90   %Index3 = extractelement <4 x i8> %IndexVec, i32 3
  91   %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3)
  92   store <4 x float> %res3, ptr addrspace(1) %out3
  93   ret void
  94 }
  95
  96 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
     ; Loads a <4 x i8> index vector from %IndexVecPtr and calls the
     ; swmmac.f16.16x16x32.f16 intrinsic once per element, always with the same
     ; %A, %B and %C, storing the four <4 x half> results to %out0..%out3.
     ; Because the accumulator/result is <4 x half>, the expected assembly uses
     ; 2-register destinations and global_store_b64 rather than b128.
     ; A single global_load_b32 feeds the index operand of all four v_swmmac
     ; instructions; three of them carry index_key:1, :2 and :3.
  97 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
  98 ; GFX12:       ; %bb.0: ; %bb
  99 ; GFX12-NEXT:    global_load_b32 v22, v[8:9], off
 100 ; GFX12-NEXT:    v_mov_b32_e32 v9, v7
 101 ; GFX12-NEXT:    v_mov_b32_e32 v8, v6
 102 ; GFX12-NEXT:    v_mov_b32_e32 v19, v7
 103 ; GFX12-NEXT:    v_mov_b32_e32 v18, v6
 104 ; GFX12-NEXT:    v_mov_b32_e32 v21, v7
 105 ; GFX12-NEXT:    v_mov_b32_e32 v20, v6
 106 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 107 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
 108 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
 109 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 110 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
 111 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
 112 ; GFX12-NEXT:    global_store_b64 v[10:11], v[8:9], off
 113 ; GFX12-NEXT:    global_store_b64 v[12:13], v[18:19], off
 114 ; GFX12-NEXT:    global_store_b64 v[14:15], v[20:21], off
 115 ; GFX12-NEXT:    global_store_b64 v[16:17], v[6:7], off
 116 ; GFX12-NEXT:    s_nop 0
 117 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 118 ; GFX12-NEXT:    s_endpgm
 119 bb:
     ; One 4-byte-aligned load fetches all four i8 indices; each
     ; extractelement below picks one of them for its own intrinsic call.
 120   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
 121   %Index0 = extractelement <4 x i8> %IndexVec, i32 0
 122   %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0)
 123   store <4 x half> %res0, ptr addrspace(1) %out0
 124   %Index1 = extractelement <4 x i8> %IndexVec, i32 1
 125   %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1)
 126   store <4 x half> %res1, ptr addrspace(1) %out1
 127   %Index2 = extractelement <4 x i8> %IndexVec, i32 2
 128   %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2)
 129   store <4 x half> %res2, ptr addrspace(1) %out2
 130   %Index3 = extractelement <4 x i8> %IndexVec, i32 3
 131   %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3)
 132   store <4 x half> %res3, ptr addrspace(1) %out3
 133   ret void
 134 }
 135
 136 define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
     ; Loads a <4 x i8> index vector from %IndexVecPtr and calls the
     ; swmmac.bf16.16x16x32.bf16 intrinsic once per element, always with the
     ; same %A, %B and %C (all typed as i16 vectors here), storing the four
     ; <4 x i16> results to %out0..%out3.
     ; The 4 x 16-bit accumulator/result occupies two registers, so the
     ; expected assembly uses 2-register destinations and global_store_b64.
     ; A single global_load_b32 feeds the index operand of all four v_swmmac
     ; instructions; three of them carry index_key:1, :2 and :3.
 137 ; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
 138 ; GFX12:       ; %bb.0: ; %bb
 139 ; GFX12-NEXT:    global_load_b32 v22, v[8:9], off
 140 ; GFX12-NEXT:    v_mov_b32_e32 v9, v7
 141 ; GFX12-NEXT:    v_mov_b32_e32 v8, v6
 142 ; GFX12-NEXT:    v_mov_b32_e32 v19, v7
 143 ; GFX12-NEXT:    v_mov_b32_e32 v18, v6
 144 ; GFX12-NEXT:    v_mov_b32_e32 v21, v7
 145 ; GFX12-NEXT:    v_mov_b32_e32 v20, v6
 146 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 147 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
 148 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
 149 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 150 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
 151 ; GFX12-NEXT:    v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
 152 ; GFX12-NEXT:    global_store_b64 v[10:11], v[8:9], off
 153 ; GFX12-NEXT:    global_store_b64 v[12:13], v[18:19], off
 154 ; GFX12-NEXT:    global_store_b64 v[14:15], v[20:21], off
 155 ; GFX12-NEXT:    global_store_b64 v[16:17], v[6:7], off
 156 ; GFX12-NEXT:    s_nop 0
 157 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 158 ; GFX12-NEXT:    s_endpgm
 159 bb:
     ; One 4-byte-aligned load fetches all four i8 indices; each
     ; extractelement below picks one of them for its own intrinsic call.
 160   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
 161   %Index0 = extractelement <4 x i8> %IndexVec, i32 0
 162   %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0)
 163   store <4 x i16> %res0, ptr addrspace(1) %out0
 164   %Index1 = extractelement <4 x i8> %IndexVec, i32 1
 165   %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1)
 166   store <4 x i16> %res1, ptr addrspace(1) %out1
 167   %Index2 = extractelement <4 x i8> %IndexVec, i32 2
 168   %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2)
 169   store <4 x i16> %res2, ptr addrspace(1) %out2
 170   %Index3 = extractelement <4 x i8> %IndexVec, i32 3
 171   %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3)
 172   store <4 x i16> %res3, ptr addrspace(1) %out3
 173   ret void
 174 }
 175
 176 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
     ; Loads a <4 x i8> index vector from %IndexVecPtr and calls the
     ; swmmac.i32.16x16x32.iu8 intrinsic once per element, always with the same
     ; %A, %B and %C, storing the four <4 x i32> results to %out0..%out3.
     ; A single global_load_b32 feeds the index operand of all four v_swmmac
     ; instructions; three of them carry index_key:1, :2 and :3.
     ; NOTE(review): every call passes constant `i1 0` for the three i1
     ; operands (declared immarg at the bottom of this file); presumably these
     ; are the A-signedness, B-signedness and clamp flags -- confirm against
     ; the intrinsic definition. No extra modifier appears in the expected
     ; v_swmmac lines for them.
 177 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
 178 ; GFX12:       ; %bb.0: ; %bb
 179 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
 180 ; GFX12-NEXT:    v_mov_b32_e32 v20, v6
 181 ; GFX12-NEXT:    v_mov_b32_e32 v19, v5
 182 ; GFX12-NEXT:    v_mov_b32_e32 v18, v4
 183 ; GFX12-NEXT:    v_mov_b32_e32 v17, v3
 184 ; GFX12-NEXT:    v_mov_b32_e32 v24, v6
 185 ; GFX12-NEXT:    v_mov_b32_e32 v23, v5
 186 ; GFX12-NEXT:    v_mov_b32_e32 v22, v4
 187 ; GFX12-NEXT:    v_mov_b32_e32 v21, v3
 188 ; GFX12-NEXT:    v_mov_b32_e32 v28, v6
 189 ; GFX12-NEXT:    v_mov_b32_e32 v27, v5
 190 ; GFX12-NEXT:    v_mov_b32_e32 v26, v4
 191 ; GFX12-NEXT:    v_mov_b32_e32 v25, v3
 192 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 193 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
 194 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
 195 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 196 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
 197 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
 198 ; GFX12-NEXT:    global_store_b128 v[9:10], v[17:20], off
 199 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 200 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 201 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
 202 ; GFX12-NEXT:    s_nop 0
 203 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 204 ; GFX12-NEXT:    s_endpgm
 205 bb:
     ; One 4-byte-aligned load fetches all four i8 indices; each
     ; extractelement below picks one of them for its own intrinsic call.
 206   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
 207   %Index0 = extractelement <4 x i8> %IndexVec, i32 0
 208   %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0)
 209   store <4 x i32> %res0, ptr addrspace(1) %out0
 210   %Index1 = extractelement <4 x i8> %IndexVec, i32 1
 211   %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0)
 212   store <4 x i32> %res1, ptr addrspace(1) %out1
 213   %Index2 = extractelement <4 x i8> %IndexVec, i32 2
 214   %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0)
 215   store <4 x i32> %res2, ptr addrspace(1) %out2
 216   %Index3 = extractelement <4 x i8> %IndexVec, i32 3
 217   %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0)
 218   store <4 x i32> %res3, ptr addrspace(1) %out3
 219   ret void
 220 }
 221
 222 define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
     ; Loads a <2 x i16> index vector from %IndexVecPtr and calls the
     ; swmmac.i32.16x16x32.iu4 intrinsic once per element, always with the same
     ; %A, %B and %C, storing the two <4 x i32> results to %out0 and %out1.
     ; Unlike the i8-indexed tests in this file, the index here is i16, so only
     ; two indices fit in the loaded dword and only index_key:1 appears in the
     ; expected assembly (besides the form without an index_key modifier).
     ; NOTE(review): the three constant `i1 0` operands are immarg on the
     ; intrinsic; presumably A-signedness, B-signedness and clamp -- confirm
     ; against the intrinsic definition.
 223 ; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
 224 ; GFX12:       ; %bb.0: ; %bb
 225 ; GFX12-NEXT:    global_load_b32 v6, v[6:7], off
 226 ; GFX12-NEXT:    v_mov_b32_e32 v15, v5
 227 ; GFX12-NEXT:    v_mov_b32_e32 v14, v4
 228 ; GFX12-NEXT:    v_mov_b32_e32 v13, v3
 229 ; GFX12-NEXT:    v_mov_b32_e32 v12, v2
 230 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 231 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 232 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
 233 ; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
 234 ; GFX12-NEXT:    global_store_b128 v[8:9], v[12:15], off
 235 ; GFX12-NEXT:    global_store_b128 v[10:11], v[2:5], off
 236 ; GFX12-NEXT:    s_nop 0
 237 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 238 ; GFX12-NEXT:    s_endpgm
 239 bb:
     ; One 4-byte-aligned load fetches both i16 indices; each extractelement
     ; below picks one of them for its own intrinsic call.
 240   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
 241   %Index0 = extractelement <2 x i16> %IndexVec, i32 0
 242   %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0)
 243   store <4 x i32> %res0, ptr addrspace(1) %out0
 244   %Index1 = extractelement <2 x i16> %IndexVec, i32 1
 245   %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0)
 246   store <4 x i32> %res1, ptr addrspace(1) %out1
 247   ret void
 248 }
 249
 250 define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
     ; Loads a <2 x i16> index vector from %IndexVecPtr and calls the
     ; swmmac.i32.16x16x64.iu4 intrinsic once per element, always with the same
     ; %A, %B and %C, storing the two <4 x i32> results to %out0 and %out1.
     ; Differs from the 16x16x32 iu4 test in that %B is <2 x i32> (a register
     ; pair, v[1:2], in the expected assembly) instead of a single i32.
     ; The index is i16, so only two indices fit in the loaded dword and only
     ; index_key:1 appears (besides the form without an index_key modifier).
     ; NOTE(review): the three constant `i1 0` operands are immarg on the
     ; intrinsic; presumably A-signedness, B-signedness and clamp -- confirm
     ; against the intrinsic definition.
 251 ; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
 252 ; GFX12:       ; %bb.0: ; %bb
 253 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
 254 ; GFX12-NEXT:    v_mov_b32_e32 v16, v6
 255 ; GFX12-NEXT:    v_mov_b32_e32 v15, v5
 256 ; GFX12-NEXT:    v_mov_b32_e32 v14, v4
 257 ; GFX12-NEXT:    v_mov_b32_e32 v13, v3
 258 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 259 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 260 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
 261 ; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
 262 ; GFX12-NEXT:    global_store_b128 v[9:10], v[13:16], off
 263 ; GFX12-NEXT:    global_store_b128 v[11:12], v[3:6], off
 264 ; GFX12-NEXT:    s_nop 0
 265 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 266 ; GFX12-NEXT:    s_endpgm
 267 bb:
     ; One 4-byte-aligned load fetches both i16 indices; each extractelement
     ; below picks one of them for its own intrinsic call.
 268   %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
 269   %Index0 = extractelement <2 x i16> %IndexVec, i32 0
 270   %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0)
 271   store <4 x i32> %res0, ptr addrspace(1) %out0
 272   %Index1 = extractelement <2 x i16> %IndexVec, i32 1
 273   %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0)
 274   store <4 x i32> %res1, ptr addrspace(1) %out1
 275   ret void
 276 }
 277
 278 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
     ; Loads a <4 x i8> index vector from %IndexVecPtr and calls the
     ; swmmac.f32.16x16x32.fp8.fp8 intrinsic once per element, always with the
     ; same %A (i32), %B (<2 x i32>) and %C, storing the four <4 x float>
     ; results to %out0..%out3.
     ; A single global_load_b32 feeds the index operand of all four v_swmmac
     ; instructions; three of them carry index_key:1, :2 and :3, the remaining
     ; one has no index_key modifier.
 279 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
 280 ; GFX12:       ; %bb.0: ; %bb
 281 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
 282 ; GFX12-NEXT:    v_mov_b32_e32 v20, v6
 283 ; GFX12-NEXT:    v_mov_b32_e32 v19, v5
 284 ; GFX12-NEXT:    v_mov_b32_e32 v18, v4
 285 ; GFX12-NEXT:    v_mov_b32_e32 v17, v3
 286 ; GFX12-NEXT:    v_mov_b32_e32 v24, v6
 287 ; GFX12-NEXT:    v_mov_b32_e32 v23, v5
 288 ; GFX12-NEXT:    v_mov_b32_e32 v22, v4
 289 ; GFX12-NEXT:    v_mov_b32_e32 v21, v3
 290 ; GFX12-NEXT:    v_mov_b32_e32 v28, v6
 291 ; GFX12-NEXT:    v_mov_b32_e32 v27, v5
 292 ; GFX12-NEXT:    v_mov_b32_e32 v26, v4
 293 ; GFX12-NEXT:    v_mov_b32_e32 v25, v3
 294 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 295 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
 296 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
 297 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 298 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
 299 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
 300 ; GFX12-NEXT:    global_store_b128 v[9:10], v[17:20], off
 301 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 302 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 303 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
 304 ; GFX12-NEXT:    s_nop 0
 305 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 306 ; GFX12-NEXT:    s_endpgm
 307 bb:
     ; One 4-byte-aligned load fetches all four i8 indices; each
     ; extractelement below picks one of them for its own intrinsic call.
 308   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
 309   %Index0 = extractelement <4 x i8> %IndexVec, i32 0
 310   %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
 311   store <4 x float> %res0, ptr addrspace(1) %out0
 312   %Index1 = extractelement <4 x i8> %IndexVec, i32 1
 313   %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
 314   store <4 x float> %res1, ptr addrspace(1) %out1
 315   %Index2 = extractelement <4 x i8> %IndexVec, i32 2
 316   %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
 317   store <4 x float> %res2, ptr addrspace(1) %out2
 318   %Index3 = extractelement <4 x i8> %IndexVec, i32 3
 319   %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
 320   store <4 x float> %res3, ptr addrspace(1) %out3
 321   ret void
 322 }
 323
 324 define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
     ; Loads a <4 x i8> index vector from %IndexVecPtr and calls the
     ; swmmac.f32.16x16x32.fp8.bf8 intrinsic once per element, always with the
     ; same %A (i32), %B (<2 x i32>) and %C, storing the four <4 x float>
     ; results to %out0..%out3.
     ; A single global_load_b32 feeds the index operand of all four v_swmmac
     ; instructions; three of them carry index_key:1, :2 and :3, the remaining
     ; one has no index_key modifier.
 325 ; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
 326 ; GFX12:       ; %bb.0: ; %bb
 327 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
 328 ; GFX12-NEXT:    v_mov_b32_e32 v20, v6
 329 ; GFX12-NEXT:    v_mov_b32_e32 v19, v5
 330 ; GFX12-NEXT:    v_mov_b32_e32 v18, v4
 331 ; GFX12-NEXT:    v_mov_b32_e32 v17, v3
 332 ; GFX12-NEXT:    v_mov_b32_e32 v24, v6
 333 ; GFX12-NEXT:    v_mov_b32_e32 v23, v5
 334 ; GFX12-NEXT:    v_mov_b32_e32 v22, v4
 335 ; GFX12-NEXT:    v_mov_b32_e32 v21, v3
 336 ; GFX12-NEXT:    v_mov_b32_e32 v28, v6
 337 ; GFX12-NEXT:    v_mov_b32_e32 v27, v5
 338 ; GFX12-NEXT:    v_mov_b32_e32 v26, v4
 339 ; GFX12-NEXT:    v_mov_b32_e32 v25, v3
 340 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 341 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
 342 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
 343 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 344 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
 345 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
 346 ; GFX12-NEXT:    global_store_b128 v[9:10], v[17:20], off
 347 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 348 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 349 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
 350 ; GFX12-NEXT:    s_nop 0
 351 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 352 ; GFX12-NEXT:    s_endpgm
 353 bb:
     ; One 4-byte-aligned load fetches all four i8 indices; each
     ; extractelement below picks one of them for its own intrinsic call.
 354   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
 355   %Index0 = extractelement <4 x i8> %IndexVec, i32 0
 356   %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
 357   store <4 x float> %res0, ptr addrspace(1) %out0
 358   %Index1 = extractelement <4 x i8> %IndexVec, i32 1
 359   %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
 360   store <4 x float> %res1, ptr addrspace(1) %out1
 361   %Index2 = extractelement <4 x i8> %IndexVec, i32 2
 362   %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
 363   store <4 x float> %res2, ptr addrspace(1) %out2
 364   %Index3 = extractelement <4 x i8> %IndexVec, i32 3
 365   %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
 366   store <4 x float> %res3, ptr addrspace(1) %out3
 367   ret void
 368 }
 369
 370 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
     ; Loads a <4 x i8> index vector from %IndexVecPtr and calls the
     ; swmmac.f32.16x16x32.bf8.fp8 intrinsic once per element, always with the
     ; same %A (i32), %B (<2 x i32>) and %C, storing the four <4 x float>
     ; results to %out0..%out3.
     ; A single global_load_b32 feeds the index operand of all four v_swmmac
     ; instructions; three of them carry index_key:1, :2 and :3, the remaining
     ; one has no index_key modifier.
 371 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
 372 ; GFX12:       ; %bb.0: ; %bb
 373 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
 374 ; GFX12-NEXT:    v_mov_b32_e32 v20, v6
 375 ; GFX12-NEXT:    v_mov_b32_e32 v19, v5
 376 ; GFX12-NEXT:    v_mov_b32_e32 v18, v4
 377 ; GFX12-NEXT:    v_mov_b32_e32 v17, v3
 378 ; GFX12-NEXT:    v_mov_b32_e32 v24, v6
 379 ; GFX12-NEXT:    v_mov_b32_e32 v23, v5
 380 ; GFX12-NEXT:    v_mov_b32_e32 v22, v4
 381 ; GFX12-NEXT:    v_mov_b32_e32 v21, v3
 382 ; GFX12-NEXT:    v_mov_b32_e32 v28, v6
 383 ; GFX12-NEXT:    v_mov_b32_e32 v27, v5
 384 ; GFX12-NEXT:    v_mov_b32_e32 v26, v4
 385 ; GFX12-NEXT:    v_mov_b32_e32 v25, v3
 386 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 387 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
 388 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
 389 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 390 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
 391 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
 392 ; GFX12-NEXT:    global_store_b128 v[9:10], v[17:20], off
 393 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 394 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 395 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
 396 ; GFX12-NEXT:    s_nop 0
 397 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 398 ; GFX12-NEXT:    s_endpgm
 399 bb:
     ; One 4-byte-aligned load fetches all four i8 indices; each
     ; extractelement below picks one of them for its own intrinsic call.
 400   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
 401   %Index0 = extractelement <4 x i8> %IndexVec, i32 0
 402   %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
 403   store <4 x float> %res0, ptr addrspace(1) %out0
 404   %Index1 = extractelement <4 x i8> %IndexVec, i32 1
 405   %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
 406   store <4 x float> %res1, ptr addrspace(1) %out1
 407   %Index2 = extractelement <4 x i8> %IndexVec, i32 2
 408   %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
 409   store <4 x float> %res2, ptr addrspace(1) %out2
 410   %Index3 = extractelement <4 x i8> %IndexVec, i32 3
 411   %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
 412   store <4 x float> %res3, ptr addrspace(1) %out3
 413   ret void
 414 }
 415
 416 define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
     ; Loads a <4 x i8> index vector from %IndexVecPtr and calls the
     ; swmmac.f32.16x16x32.bf8.bf8 intrinsic once per element, always with the
     ; same %A (i32), %B (<2 x i32>) and %C, storing the four <4 x float>
     ; results to %out0..%out3.
     ; A single global_load_b32 feeds the index operand of all four v_swmmac
     ; instructions; three of them carry index_key:1, :2 and :3, the remaining
     ; one has no index_key modifier.
 417 ; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
 418 ; GFX12:       ; %bb.0: ; %bb
 419 ; GFX12-NEXT:    global_load_b32 v7, v[7:8], off
 420 ; GFX12-NEXT:    v_mov_b32_e32 v20, v6
 421 ; GFX12-NEXT:    v_mov_b32_e32 v19, v5
 422 ; GFX12-NEXT:    v_mov_b32_e32 v18, v4
 423 ; GFX12-NEXT:    v_mov_b32_e32 v17, v3
 424 ; GFX12-NEXT:    v_mov_b32_e32 v24, v6
 425 ; GFX12-NEXT:    v_mov_b32_e32 v23, v5
 426 ; GFX12-NEXT:    v_mov_b32_e32 v22, v4
 427 ; GFX12-NEXT:    v_mov_b32_e32 v21, v3
 428 ; GFX12-NEXT:    v_mov_b32_e32 v28, v6
 429 ; GFX12-NEXT:    v_mov_b32_e32 v27, v5
 430 ; GFX12-NEXT:    v_mov_b32_e32 v26, v4
 431 ; GFX12-NEXT:    v_mov_b32_e32 v25, v3
 432 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 433 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
 434 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
 435 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 436 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
 437 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
 438 ; GFX12-NEXT:    global_store_b128 v[9:10], v[17:20], off
 439 ; GFX12-NEXT:    global_store_b128 v[11:12], v[21:24], off
 440 ; GFX12-NEXT:    global_store_b128 v[13:14], v[25:28], off
 441 ; GFX12-NEXT:    global_store_b128 v[15:16], v[3:6], off
 442 ; GFX12-NEXT:    s_nop 0
 443 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 444 ; GFX12-NEXT:    s_endpgm
 445 bb:
     ; One 4-byte-aligned load fetches all four i8 indices; each
     ; extractelement below picks one of them for its own intrinsic call.
 446   %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
 447   %Index0 = extractelement <4 x i8> %IndexVec, i32 0
 448   %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
 449   store <4 x float> %res0, ptr addrspace(1) %out0
 450   %Index1 = extractelement <4 x i8> %IndexVec, i32 1
 451   %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
 452   store <4 x float> %res1, ptr addrspace(1) %out1
 453   %Index2 = extractelement <4 x i8> %IndexVec, i32 2
 454   %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
 455   store <4 x float> %res2, ptr addrspace(1) %out2
 456   %Index3 = extractelement <4 x i8> %IndexVec, i32 3
 457   %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
 458   store <4 x float> %res3, ptr addrspace(1) %out3
 459   ret void
 460 }
 461
 462 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i8(<4 x half>, <8 x half>, <4 x float>, i8)
     ; The declaration above and the three below are the f16/bf16 variants:
     ; operands are (A, B, C accumulator, i8 index) and the return type
     ; matches the accumulator type.
 463 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
 464 declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
 465 declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
     ; Integer (iu8/iu4) variants: three extra i1 operands are marked immarg,
     ; so call sites must pass compile-time constants for them (every call in
     ; this file passes i1 0). The iu4 variants take an i16 index instead of i8.
 466 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
 467 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
 468 declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
     ; fp8/bf8 variants: operands are (i32 A, <2 x i32> B, <4 x float> C,
     ; i8 index); all four return <4 x float>.
 469 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
 470 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
 471 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)
 472 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i8(i32, <2 x i32>, <4 x float>, i8)