llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN %s
   3
   4 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32) ; called by @test_mfma_f32_32x32x4bf16_1k
   5 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32) ; called by @test_mfma_f32_16x16x4bf16_1k
   6 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32) ; called by @test_mfma_f32_4x4x4bf16_1k
   7 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32) ; called by @test_mfma_f32_32x32x8bf16_1k
   8 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32) ; called by @test_mfma_f32_16x16x16bf16_1k
   9 declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32) ; called by the four @test_mfma_f64_16x16x4f64* tests
  10 declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32) ; called by @test_mfma_f64_4x4x4f64
  11 declare i32 @llvm.amdgcn.workitem.id.x() ; not called by any function visible in this excerpt
  12
  13 define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(<32 x float> addrspace(1)* %arg) { ; Calls the 32x32x4bf16.1k MFMA intrinsic on a <32 x float> accumulator loaded from %arg and stores the result back to %arg.
  14 ; GCN-LABEL: test_mfma_f32_32x32x4bf16_1k:
  15 ; GCN:       ; %bb.0: ; %bb
  16 ; GCN-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
  17 ; GCN-NEXT:    s_mov_b64 s[36:37], 1
  18 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
  19 ; GCN-NEXT:    s_mov_b32 s36, 2
  20 ; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1]
  21 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  22 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[34:35], 0x0
  23 ; GCN-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x40
  24 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  25 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
  26 ; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
  27 ; GCN-NEXT:    v_mov_b32_e32 v4, s1
  28 ; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
  29 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
  30 ; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
  31 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
  32 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
  33 ; GCN-NEXT:    v_mov_b32_e32 v4, s4
  34 ; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
  35 ; GCN-NEXT:    v_mov_b32_e32 v4, s5
  36 ; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
  37 ; GCN-NEXT:    v_mov_b32_e32 v4, s6
  38 ; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
  39 ; GCN-NEXT:    v_mov_b32_e32 v4, s7
  40 ; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
  41 ; GCN-NEXT:    v_mov_b32_e32 v4, s8
  42 ; GCN-NEXT:    v_accvgpr_write_b32 a8, v4
  43 ; GCN-NEXT:    v_mov_b32_e32 v4, s9
  44 ; GCN-NEXT:    v_accvgpr_write_b32 a9, v4
  45 ; GCN-NEXT:    v_mov_b32_e32 v4, s10
  46 ; GCN-NEXT:    v_accvgpr_write_b32 a10, v4
  47 ; GCN-NEXT:    v_mov_b32_e32 v4, s11
  48 ; GCN-NEXT:    v_accvgpr_write_b32 a11, v4
  49 ; GCN-NEXT:    v_mov_b32_e32 v4, s12
  50 ; GCN-NEXT:    v_accvgpr_write_b32 a12, v4
  51 ; GCN-NEXT:    v_mov_b32_e32 v4, s13
  52 ; GCN-NEXT:    v_accvgpr_write_b32 a13, v4
  53 ; GCN-NEXT:    v_mov_b32_e32 v4, s14
  54 ; GCN-NEXT:    v_accvgpr_write_b32 a14, v4
  55 ; GCN-NEXT:    v_mov_b32_e32 v4, s15
  56 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v4
  57 ; GCN-NEXT:    v_mov_b32_e32 v4, s16
  58 ; GCN-NEXT:    v_accvgpr_write_b32 a16, v4
  59 ; GCN-NEXT:    v_mov_b32_e32 v4, s17
  60 ; GCN-NEXT:    v_accvgpr_write_b32 a17, v4
  61 ; GCN-NEXT:    v_mov_b32_e32 v4, s18
  62 ; GCN-NEXT:    v_accvgpr_write_b32 a18, v4
  63 ; GCN-NEXT:    v_mov_b32_e32 v4, s19
  64 ; GCN-NEXT:    v_accvgpr_write_b32 a19, v4
  65 ; GCN-NEXT:    v_mov_b32_e32 v4, s20
  66 ; GCN-NEXT:    v_accvgpr_write_b32 a20, v4
  67 ; GCN-NEXT:    v_mov_b32_e32 v4, s21
  68 ; GCN-NEXT:    v_accvgpr_write_b32 a21, v4
  69 ; GCN-NEXT:    v_mov_b32_e32 v4, s22
  70 ; GCN-NEXT:    v_accvgpr_write_b32 a22, v4
  71 ; GCN-NEXT:    v_mov_b32_e32 v4, s23
  72 ; GCN-NEXT:    v_accvgpr_write_b32 a23, v4
  73 ; GCN-NEXT:    v_mov_b32_e32 v4, s24
  74 ; GCN-NEXT:    v_accvgpr_write_b32 a24, v4
  75 ; GCN-NEXT:    v_mov_b32_e32 v4, s25
  76 ; GCN-NEXT:    v_accvgpr_write_b32 a25, v4
  77 ; GCN-NEXT:    v_mov_b32_e32 v4, s26
  78 ; GCN-NEXT:    v_accvgpr_write_b32 a26, v4
  79 ; GCN-NEXT:    v_mov_b32_e32 v4, s27
  80 ; GCN-NEXT:    v_accvgpr_write_b32 a27, v4
  81 ; GCN-NEXT:    v_mov_b32_e32 v4, s28
  82 ; GCN-NEXT:    v_accvgpr_write_b32 a28, v4
  83 ; GCN-NEXT:    v_mov_b32_e32 v4, s29
  84 ; GCN-NEXT:    v_accvgpr_write_b32 a29, v4
  85 ; GCN-NEXT:    v_mov_b32_e32 v4, s30
  86 ; GCN-NEXT:    v_accvgpr_write_b32 a30, v4
  87 ; GCN-NEXT:    v_mov_b32_e32 v4, s31
  88 ; GCN-NEXT:    v_accvgpr_write_b32 a31, v4
  89 ; GCN-NEXT:    s_nop 1
  90 ; GCN-NEXT:    v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
  91 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
  92 ; GCN-NEXT:    s_nop 7
  93 ; GCN-NEXT:    s_nop 7
  94 ; GCN-NEXT:    s_nop 1
  95 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[34:35]
  96 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
  97 ; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
  98 ; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
  99 ; GCN-NEXT:    global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
 100 ; GCN-NEXT:    global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
 101 ; GCN-NEXT:    global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
 102 ; GCN-NEXT:    global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
 103 ; GCN-NEXT:    s_endpgm
 104 bb:
 105   %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg ; accumulator operand, read through the kernel's pointer argument
 106   %a = bitcast i64 1 to <4 x i16> ; first multiplicand: the constant 1 reinterpreted as <4 x i16>
 107   %b = bitcast i64 2 to <4 x i16> ; second multiplicand: the constant 2 reinterpreted as <4 x i16>
 108   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3) ; trailing immediates 1, 2, 3 appear as cbsz:1 abid:2 blgp:3 on the expected v_mfma line
 109   store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg ; result overwrites the memory the accumulator was loaded from
 110   ret void
 111 }
 112
 113 define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(<16 x float> addrspace(1)* %arg) { ; Calls the 16x16x4bf16.1k MFMA intrinsic on a <16 x float> accumulator loaded from %arg and stores the result back to %arg.
 114 ; GCN-LABEL: test_mfma_f32_16x16x4bf16_1k:
 115 ; GCN:       ; %bb.0: ; %bb
 116 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 117 ; GCN-NEXT:    s_mov_b64 s[18:19], 1
 118 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
 119 ; GCN-NEXT:    s_mov_b32 s18, 2
 120 ; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
 121 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 122 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
 123 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 124 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
 125 ; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
 126 ; GCN-NEXT:    v_mov_b32_e32 v4, s1
 127 ; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
 128 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
 129 ; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
 130 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
 131 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
 132 ; GCN-NEXT:    v_mov_b32_e32 v4, s4
 133 ; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
 134 ; GCN-NEXT:    v_mov_b32_e32 v4, s5
 135 ; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
 136 ; GCN-NEXT:    v_mov_b32_e32 v4, s6
 137 ; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
 138 ; GCN-NEXT:    v_mov_b32_e32 v4, s7
 139 ; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
 140 ; GCN-NEXT:    v_mov_b32_e32 v4, s8
 141 ; GCN-NEXT:    v_accvgpr_write_b32 a8, v4
 142 ; GCN-NEXT:    v_mov_b32_e32 v4, s9
 143 ; GCN-NEXT:    v_accvgpr_write_b32 a9, v4
 144 ; GCN-NEXT:    v_mov_b32_e32 v4, s10
 145 ; GCN-NEXT:    v_accvgpr_write_b32 a10, v4
 146 ; GCN-NEXT:    v_mov_b32_e32 v4, s11
 147 ; GCN-NEXT:    v_accvgpr_write_b32 a11, v4
 148 ; GCN-NEXT:    v_mov_b32_e32 v4, s12
 149 ; GCN-NEXT:    v_accvgpr_write_b32 a12, v4
 150 ; GCN-NEXT:    v_mov_b32_e32 v4, s13
 151 ; GCN-NEXT:    v_accvgpr_write_b32 a13, v4
 152 ; GCN-NEXT:    v_mov_b32_e32 v4, s14
 153 ; GCN-NEXT:    v_accvgpr_write_b32 a14, v4
 154 ; GCN-NEXT:    v_mov_b32_e32 v4, s15
 155 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v4
 156 ; GCN-NEXT:    s_nop 1
 157 ; GCN-NEXT:    v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 158 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 159 ; GCN-NEXT:    s_nop 7
 160 ; GCN-NEXT:    s_nop 1
 161 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 162 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 163 ; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 164 ; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 165 ; GCN-NEXT:    s_endpgm
 166 bb:
 167   %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg ; accumulator operand, read through the kernel's pointer argument
 168   %a = bitcast i64 1 to <4 x i16> ; first multiplicand: the constant 1 reinterpreted as <4 x i16>
 169   %b = bitcast i64 2 to <4 x i16> ; second multiplicand: the constant 2 reinterpreted as <4 x i16>
 170   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) ; trailing immediates 1, 2, 3 appear as cbsz:1 abid:2 blgp:3 on the expected v_mfma line
 171   store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg ; result overwrites the memory the accumulator was loaded from
 172   ret void
 173 }
 174
 175 define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(<4 x float> addrspace(1)* %arg) { ; Calls the 4x4x4bf16.1k MFMA intrinsic on a <4 x float> accumulator loaded from %arg and stores the result back to %arg.
 176 ; GCN-LABEL: test_mfma_f32_4x4x4bf16_1k:
 177 ; GCN:       ; %bb.0: ; %bb
 178 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 179 ; GCN-NEXT:    s_mov_b64 s[6:7], 1
 180 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 181 ; GCN-NEXT:    s_mov_b32 s6, 2
 182 ; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
 183 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 184 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 185 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 186 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
 187 ; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
 188 ; GCN-NEXT:    v_mov_b32_e32 v4, s1
 189 ; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
 190 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
 191 ; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
 192 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
 193 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
 194 ; GCN-NEXT:    s_nop 1
 195 ; GCN-NEXT:    v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
 196 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 197 ; GCN-NEXT:    s_nop 3
 198 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[4:5]
 199 ; GCN-NEXT:    s_endpgm
 200 bb:
 201   %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg ; accumulator operand, read through the kernel's pointer argument
 202   %a = bitcast i64 1 to <4 x i16> ; first multiplicand: the constant 1 reinterpreted as <4 x i16>
 203   %b = bitcast i64 2 to <4 x i16> ; second multiplicand: the constant 2 reinterpreted as <4 x i16>
 204   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) ; trailing immediates 1, 2, 3 appear as cbsz:1 abid:2 blgp:3 on the expected v_mfma line
 205   store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg ; result overwrites the memory the accumulator was loaded from
 206   ret void
 207 }
 208
 209 define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(<16 x float> addrspace(1)* %arg) { ; Calls the 32x32x8bf16.1k MFMA intrinsic on a <16 x float> accumulator loaded from %arg and stores the result back to %arg.
 210 ; GCN-LABEL: test_mfma_f32_32x32x8bf16_1k:
 211 ; GCN:       ; %bb.0: ; %bb
 212 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
 213 ; GCN-NEXT:    s_mov_b64 s[18:19], 1
 214 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
 215 ; GCN-NEXT:    s_mov_b32 s18, 2
 216 ; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
 217 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 218 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
 219 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 220 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
 221 ; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
 222 ; GCN-NEXT:    v_mov_b32_e32 v4, s1
 223 ; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
 224 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
 225 ; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
 226 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
 227 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
 228 ; GCN-NEXT:    v_mov_b32_e32 v4, s4
 229 ; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
 230 ; GCN-NEXT:    v_mov_b32_e32 v4, s5
 231 ; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
 232 ; GCN-NEXT:    v_mov_b32_e32 v4, s6
 233 ; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
 234 ; GCN-NEXT:    v_mov_b32_e32 v4, s7
 235 ; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
 236 ; GCN-NEXT:    v_mov_b32_e32 v4, s8
 237 ; GCN-NEXT:    v_accvgpr_write_b32 a8, v4
 238 ; GCN-NEXT:    v_mov_b32_e32 v4, s9
 239 ; GCN-NEXT:    v_accvgpr_write_b32 a9, v4
 240 ; GCN-NEXT:    v_mov_b32_e32 v4, s10
 241 ; GCN-NEXT:    v_accvgpr_write_b32 a10, v4
 242 ; GCN-NEXT:    v_mov_b32_e32 v4, s11
 243 ; GCN-NEXT:    v_accvgpr_write_b32 a11, v4
 244 ; GCN-NEXT:    v_mov_b32_e32 v4, s12
 245 ; GCN-NEXT:    v_accvgpr_write_b32 a12, v4
 246 ; GCN-NEXT:    v_mov_b32_e32 v4, s13
 247 ; GCN-NEXT:    v_accvgpr_write_b32 a13, v4
 248 ; GCN-NEXT:    v_mov_b32_e32 v4, s14
 249 ; GCN-NEXT:    v_accvgpr_write_b32 a14, v4
 250 ; GCN-NEXT:    v_mov_b32_e32 v4, s15
 251 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v4
 252 ; GCN-NEXT:    s_nop 1
 253 ; GCN-NEXT:    v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
 254 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 255 ; GCN-NEXT:    s_nop 7
 256 ; GCN-NEXT:    s_nop 7
 257 ; GCN-NEXT:    s_nop 1
 258 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 259 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
 260 ; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 261 ; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 262 ; GCN-NEXT:    s_endpgm
 263 bb:
 264   %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg ; accumulator operand, read through the kernel's pointer argument
 265   %a = bitcast i64 1 to <4 x i16> ; first multiplicand: the constant 1 reinterpreted as <4 x i16>
 266   %b = bitcast i64 2 to <4 x i16> ; second multiplicand: the constant 2 reinterpreted as <4 x i16>
 267   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) ; trailing immediates 1, 2, 3 appear as cbsz:1 abid:2 blgp:3 on the expected v_mfma line
 268   store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg ; result overwrites the memory the accumulator was loaded from
 269   ret void
 270 }
 271
 272 define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(<4 x float> addrspace(1)* %arg) { ; Calls the 16x16x16bf16.1k MFMA intrinsic on a <4 x float> accumulator loaded from %arg and stores the result back to %arg.
 273 ; GCN-LABEL: test_mfma_f32_16x16x16bf16_1k:
 274 ; GCN:       ; %bb.0: ; %bb
 275 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 276 ; GCN-NEXT:    s_mov_b64 s[6:7], 1
 277 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 278 ; GCN-NEXT:    s_mov_b32 s6, 2
 279 ; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
 280 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 281 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 282 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 283 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
 284 ; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
 285 ; GCN-NEXT:    v_mov_b32_e32 v4, s1
 286 ; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
 287 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
 288 ; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
 289 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
 290 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
 291 ; GCN-NEXT:    s_nop 1
 292 ; GCN-NEXT:    v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
 293 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 294 ; GCN-NEXT:    s_nop 7
 295 ; GCN-NEXT:    s_nop 1
 296 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[4:5]
 297 ; GCN-NEXT:    s_endpgm
 298 bb:
 299   %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg ; accumulator operand, read through the kernel's pointer argument
 300   %a = bitcast i64 1 to <4 x i16> ; first multiplicand: the constant 1 reinterpreted as <4 x i16>
 301   %b = bitcast i64 2 to <4 x i16> ; second multiplicand: the constant 2 reinterpreted as <4 x i16>
 302   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) ; trailing immediates 1, 2, 3 appear as cbsz:1 abid:2 blgp:3 on the expected v_mfma line
 303   store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg ; result overwrites the memory the accumulator was loaded from
 304   ret void
 305 }
 306
 307 define amdgpu_kernel void @test_mfma_f64_4x4x4f64(double addrspace(1)* %arg, double %a, double %b) { ; Chains two 4x4x4f64 MFMA calls on kernel arguments %a and %b: the first accumulates into constant 0.0, the second into the first result; the final value is stored to %arg.
 308 ; GCN-LABEL: test_mfma_f64_4x4x4f64:
 309 ; GCN:       ; %bb.0: ; %bb
 310 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 311 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 312 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 313 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 314 ; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 315 ; GCN-NEXT:    s_nop 1
 316 ; GCN-NEXT:    v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
 317 ; GCN-NEXT:    s_nop 3
 318 ; GCN-NEXT:    v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3
 319 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 320 ; GCN-NEXT:    s_nop 7
 321 ; GCN-NEXT:    global_store_dwordx2 v0, a[0:1], s[4:5]
 322 ; GCN-NEXT:    s_endpgm
 323 bb:
 324   %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0) ; constant 0.0 accumulator appears as a literal 0 operand on the first expected v_mfma line; all-zero immediates yield no modifiers
 325   %mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3) ; accumulates into the first result; immediates 1, 2, 3 appear as cbsz:1 abid:2 blgp:3
 326   store double %mai.2, double addrspace(1)* %arg ; only the second result is written out
 327   ret void
 328 }
 329
 330 define amdgpu_kernel void @test_mfma_f64_16x16x4f64(<4 x double> addrspace(1)* %arg, double %a, double %b) { ; Calls the 16x16x4f64 MFMA intrinsic on kernel arguments %a and %b with a <4 x double> accumulator loaded from %arg, then stores the result back to %arg.
 331 ; GCN-LABEL: test_mfma_f64_16x16x4f64:
 332 ; GCN:       ; %bb.0: ; %bb
 333 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
 334 ; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x34
 335 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 336 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
 337 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
 338 ; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
 339 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 340 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
 341 ; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
 342 ; GCN-NEXT:    v_mov_b32_e32 v4, s1
 343 ; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
 344 ; GCN-NEXT:    v_mov_b32_e32 v4, s2
 345 ; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
 346 ; GCN-NEXT:    v_mov_b32_e32 v4, s3
 347 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
 348 ; GCN-NEXT:    v_mov_b32_e32 v4, s4
 349 ; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
 350 ; GCN-NEXT:    v_mov_b32_e32 v4, s5
 351 ; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
 352 ; GCN-NEXT:    v_mov_b32_e32 v4, s6
 353 ; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
 354 ; GCN-NEXT:    v_mov_b32_e32 v4, s7
 355 ; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
 356 ; GCN-NEXT:    s_nop 1
 357 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 358 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 359 ; GCN-NEXT:    s_nop 7
 360 ; GCN-NEXT:    s_nop 7
 361 ; GCN-NEXT:    s_nop 0
 362 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
 363 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
 364 ; GCN-NEXT:    s_endpgm
 365 bb:
 366   %in.1 = load <4 x double>, <4 x double> addrspace(1)* %arg ; accumulator operand, read through the kernel's pointer argument
 367   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3) ; trailing immediates 1, 2, 3 appear as cbsz:1 abid:2 blgp:3 on the expected v_mfma line
 368   store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg ; result overwrites the memory the accumulator was loaded from
 369   ret void
 370 }
 371
 372 define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) { ; Chains two 16x16x4f64 MFMA calls: the first accumulates into an all-zero constant vector, the second into the first result; the final value is stored to %arg.
 373 ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_imm:
 374 ; GCN:       ; %bb.0: ; %bb
 375 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 376 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 377 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 378 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
 379 ; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 380 ; GCN-NEXT:    s_nop 1
 381 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
 382 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 383 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 384 ; GCN-NEXT:    s_nop 7
 385 ; GCN-NEXT:    s_nop 7
 386 ; GCN-NEXT:    s_nop 0
 387 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[4:5]
 388 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[4:5] offset:16
 389 ; GCN-NEXT:    s_endpgm
 390 bb:
 391   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0) ; all-zero splat accumulator appears as a single literal 0 operand on the first expected v_mfma line
 392   %mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3) ; accumulates into the first result; immediates 1, 2, 3 appear as cbsz:1 abid:2 blgp:3
 393   store <4 x double> %mai.2, <4 x double> addrspace(1)* %arg ; only the second result is written out
 394   ret void
 395 }
 396
 397 define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) { ; Calls the 16x16x4f64 MFMA intrinsic with a non-splat constant accumulator <0.0, 0.0, 0.0, 1.0> and stores the result to %arg.
 398 ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
 399 ; GCN:       ; %bb.0: ; %bb
 400 ; GCN-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x24
 401 ; GCN-NEXT:    s_mov_b64 s[4:5], 0
 402 ; GCN-NEXT:    s_mov_b64 s[10:11], 1.0
 403 ; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
 404 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 405 ; GCN-NEXT:    v_mov_b32_e32 v4, s4
 406 ; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
 407 ; GCN-NEXT:    v_mov_b32_e32 v4, s5
 408 ; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
 409 ; GCN-NEXT:    v_mov_b32_e32 v4, s6
 410 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 411 ; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
 412 ; GCN-NEXT:    v_mov_b32_e32 v4, s7
 413 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 414 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1]
 415 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
 416 ; GCN-NEXT:    v_mov_b32_e32 v4, s8
 417 ; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
 418 ; GCN-NEXT:    v_mov_b32_e32 v4, s9
 419 ; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
 420 ; GCN-NEXT:    v_mov_b32_e32 v4, s10
 421 ; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
 422 ; GCN-NEXT:    v_mov_b32_e32 v4, s11
 423 ; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
 424 ; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
 425 ; GCN-NEXT:    s_nop 1
 426 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 427 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 428 ; GCN-NEXT:    s_nop 7
 429 ; GCN-NEXT:    s_nop 7
 430 ; GCN-NEXT:    s_nop 0
 431 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[12:13]
 432 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[12:13] offset:16
 433 ; GCN-NEXT:    s_endpgm
 434 bb:
 435   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0) ; the expected output builds this constant in s[4:11] and copies it into a[0:7]; all-zero immediates yield no cbsz/abid/blgp modifiers
 436   store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg ; result is written through the kernel's pointer argument
 437   ret void
 438 }
 439
 440 define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(<4 x double> addrspace(1)* %arg, double %a, double %b) { ; Calls the 16x16x4f64 MFMA intrinsic with a constant accumulator whose four lanes are all 123.0 and stores the result to %arg.
 441 ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
 442 ; GCN:       ; %bb.0: ; %bb
 443 ; GCN-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x24
 444 ; GCN-NEXT:    s_mov_b32 s4, 0
 445 ; GCN-NEXT:    s_mov_b32 s5, 0x405ec000
 446 ; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
 447 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 448 ; GCN-NEXT:    s_mov_b64 s[10:11], s[4:5]
 449 ; GCN-NEXT:    v_mov_b32_e32 v4, s4
 450 ; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
 451 ; GCN-NEXT:    v_mov_b32_e32 v4, s5
 452 ; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
 453 ; GCN-NEXT:    v_mov_b32_e32 v4, s6
 454 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 455 ; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
 456 ; GCN-NEXT:    v_mov_b32_e32 v4, s7
 457 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 458 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1]
 459 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
 460 ; GCN-NEXT:    v_mov_b32_e32 v4, s8
 461 ; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
 462 ; GCN-NEXT:    v_mov_b32_e32 v4, s9
 463 ; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
 464 ; GCN-NEXT:    v_mov_b32_e32 v4, s10
 465 ; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
 466 ; GCN-NEXT:    v_mov_b32_e32 v4, s11
 467 ; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
 468 ; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
 469 ; GCN-NEXT:    s_nop 1
 470 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 471 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 472 ; GCN-NEXT:    s_nop 7
 473 ; GCN-NEXT:    s_nop 7
 474 ; GCN-NEXT:    s_nop 0
 475 ; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[12:13]
 476 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[12:13] offset:16
 477 ; GCN-NEXT:    s_endpgm
 478 bb:
 479   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0) ; the expected output builds 123.0 as the pair 0 / 0x405ec000 in s[4:5], replicates it into s[6:11], and copies it into a[0:7]; all-zero immediates yield no modifiers
 480   store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg ; result is written through the kernel's pointer argument
 481   ret void
 482 }