llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll

   1 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
   2 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
   3
   4 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
   5 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
   6 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
   7 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
   8 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
   9 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
  10 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
  11 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
  12 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
  13 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
  14 declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
  15 declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
  16 declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
  17 declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
  18
  19 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16:
  20 ; GCN:  v_mfma_f32_32x32x2bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
  21 define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) { ; result and last source must print as VGPR ranges v[N:M]
  22 entry:
  23   %loaded = load <32 x float>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
  24   %src.a = bitcast i32 1 to <2 x i16> ; constant first operand
  25   %src.b = bitcast i32 2 to <2 x i16> ; constant second operand
  26   %result = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %src.a, <2 x i16> %src.b, <32 x float> %loaded, i32 0, i32 0, i32 0)
  27   store <32 x float> %result, ptr addrspace(1) %arg ; result overwrites the loaded data
  28   ret void
  29 }
  30
  31 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16:
  32 ; GCN: v_mfma_f32_16x16x2bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
  33 define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) { ; result and last source must print as VGPR ranges v[N:M]
  34 entry:
  35   %loaded = load <16 x float>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
  36   %result = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %loaded, i32 0, i32 0, i32 0)
  37   store <16 x float> %result, ptr addrspace(1) %arg ; result overwrites the loaded data
  38   ret void
  39 }
  40
  41 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16:
  42 ; GCN: v_mfma_f32_4x4x2bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
  43 define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) { ; result and last source must print as VGPR ranges v[N:M]
  44 bb:
  45   %in.1 = load <4 x float>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
  46   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
  47   store <4 x float> %mai.1, ptr addrspace(1) %arg ; result overwrites the loaded data
  48   ret void
  49 }
  50
  51 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16:
  52 ; GCN: v_mfma_f32_32x32x4bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
  53 define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) { ; result and last source must print as VGPR ranges v[N:M]
  54 entry:
  55   %loaded = load <16 x float>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
  56   %result = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %loaded, i32 0, i32 0, i32 0)
  57   store <16 x float> %result, ptr addrspace(1) %arg ; result overwrites the loaded data
  58   ret void
  59 }
  60
  61 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16:
  62 ; GCN: v_mfma_f32_16x16x8bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
  63 define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) { ; result and last source must print as VGPR ranges v[N:M]
  64 bb:
  65   %in.1 = load <4 x float>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
  66   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
  67   store <4 x float> %mai.1, ptr addrspace(1) %arg ; result overwrites the loaded data
  68   ret void
  69 }
  70
  71 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k:
  72 ; GCN:      v_mfma_f32_32x32x4bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
  73 define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) { ; all four operands must print as VGPR ranges v[N:M]
  74 entry:
  75   %loaded = load <32 x float>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
  76   %result = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <32 x float> %loaded, i32 0, i32 0, i32 0)
  77   store <32 x float> %result, ptr addrspace(1) %arg ; result overwrites the loaded data
  78   ret void
  79 }
  80
  81 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k:
  82 ; GCN: v_mfma_f32_16x16x4bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
  83 define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) { ; all four operands must print as VGPR ranges v[N:M]
  84 entry:
  85   %loaded = load <16 x float>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
  86   %result = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %loaded, i32 0, i32 0, i32 0)
  87   store <16 x float> %result, ptr addrspace(1) %arg ; result overwrites the loaded data
  88   ret void
  89 }
  90
  91 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k:
  92 ; GCN: v_mfma_f32_4x4x4bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
  93 define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) { ; all four operands must print as VGPR ranges v[N:M]
  94 bb:
  95   %in.1 = load <4 x float>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
  96   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
  97   store <4 x float> %mai.1, ptr addrspace(1) %arg ; result overwrites the loaded data
  98   ret void
  99 }
 100
 101 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k:
 102 ; GCN: v_mfma_f32_32x32x8bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 103 define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) { ; all four operands must print as VGPR ranges v[N:M]
 104 entry:
 105   %loaded = load <16 x float>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
 106   %result = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %loaded, i32 0, i32 0, i32 0)
 107   store <16 x float> %result, ptr addrspace(1) %arg ; result overwrites the loaded data
 108   ret void
 109 }
 110
 111 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k:
 112 ; GCN: v_mfma_f32_16x16x16bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 113 define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) { ; all four operands must print as VGPR ranges v[N:M]
 114 bb:
 115   %in.1 = load <4 x float>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
 116   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
 117   store <4 x float> %mai.1, ptr addrspace(1) %arg ; result overwrites the loaded data
 118   ret void
 119 }
 120
 121 ; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64:
 122 ; GCN: v_mfma_f64_4x4x4f64 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 123 define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg) { ; all four operands must print as VGPR ranges v[N:M]
 124 bb:
 125   %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double 1.0, double 1.0, double 128.0, i32 0, i32 0, i32 0) ; every input is a constant; nothing is loaded
 126   store double %mai.1, ptr addrspace(1) %arg ; scalar double result written to the kernel argument pointer
 127   ret void
 128 }
 129
 130 ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64:
 131 ; GCN: v_mfma_f64_16x16x4f64 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 132 define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg) { ; all four operands must print as VGPR ranges v[N:M]
 133 entry:
 134   %loaded = load <4 x double>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
 135   %result = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double 1.0, double 1.0, <4 x double> %loaded, i32 0, i32 0, i32 0)
 136   store <4 x double> %result, ptr addrspace(1) %arg ; result overwrites the loaded data
 137   ret void
 138 }
 139
 140 ; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8:
 141 ; GCN: v_mfma_i32_32x32x8i8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 142 define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) { ; result and last source must print as VGPR ranges v[N:M]
 143 entry:
 144   %loaded = load <16 x i32>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
 145   %result = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 1, <16 x i32> %loaded, i32 0, i32 0, i32 0)
 146   store <16 x i32> %result, ptr addrspace(1) %arg ; result overwrites the loaded data
 147   ret void
 148 }
 149
 150 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8:
 151 ; GCN: v_mfma_i32_16x16x16i8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 152 define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) { ; result and last source must print as VGPR ranges v[N:M]
 153 bb:
 154   %in.1 = load <4 x i32>, ptr addrspace(1) %arg ; vector handed to the intrinsic as its third argument
 155   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 1, <4 x i32> %in.1, i32 0, i32 0, i32 0)
 156   store <4 x i32> %mai.1, ptr addrspace(1) %arg ; result overwrites the loaded data
 157   ret void
 158 }