llvm/test/CodeGen/AMDGPU/spill-agpr.ll

   1 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
   2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
     ; The two invocations differ only in -mcpu; checks common to both use the
     ; shared prefix and subtarget-specific checks use the per-cpu prefix.
     ; -mtriple is used so the whole triple is pinned rather than only the
     ; architecture component of the host's default triple.
   3
   4 ; GCN-LABEL: {{^}}max_12regs_13a_used:
   5 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
   6 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
   7 ; GCN:     v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
   8 ; GCN-NOT: buffer_store_dword
   9 ; GCN-NOT: buffer_load_dword
  10 ; GCN:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
  11 ; GCN:     ScratchSize: 0
     ; Kernel built with attribute group #2. Two 4-wide MFMA results stay live
     ; across the branch while the %use path feeds five more values to inline
     ; asm through "a" operands (4 + 4 + 5 = 13). The checks above expect a value
     ; to be moved AGPR -> VGPR -> AGPR (same VGPR captured as VSPILL), no
     ; buffer stores/loads, no scratch descriptor setup and a scratch size of 0.
  12 define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 {
  13 bb:
  14   %in.1 = load <4 x float>, ptr addrspace(1) %arg
  15   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
  16   %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
  17   %cmp = icmp eq i32 %cond, 0
  18   br i1 %cmp, label %use, label %st
  19
  20 use:
     ; Five extra "a" operands while %mai.1 and %mai.2 are still needed in %st.
  21   call void asm sideeffect "", "a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5)
  22   store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, ptr addrspace(1) %out
  23   br label %st
  24
  25 st:
     ; Keeps both MFMA results alive until here by consuming them as "a" operands.
  28   call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2)
  29   ret void
  30 }
  31
  32 ; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
  33 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
  34 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
  35 ; GCN:     v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
  36 ; GCN-NOT: buffer_store_dword
  37 ; GCN-NOT: buffer_load_dword
  38 ; GCN:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
  39 ; GCN:     ScratchSize: 0
     ; Kernel built with attribute group #1. Four inline-asm "=a" definitions
     ; (two <4 x i32>, one i32, one <2 x i32>) are all created before any of
     ; them is consumed. The checks above expect one value to round-trip
     ; through a VGPR (captured as VSPILL) with no buffer traffic and a
     ; scratch size of 0.
  40 define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
     ; Definitions: every result comes from an "=a" output operand.
  41   %quad.lo = call <4 x i32> asm sideeffect "", "=a"()
  42   %quad.hi = call <4 x i32> asm sideeffect "", "=a"()
  43   %single = call i32 asm sideeffect "", "=a"()
  44   %pair = call <2 x i32> asm sideeffect "", "=a"()
     ; Uses: the first three values are consumed together, the pair afterwards.
  45   call void asm sideeffect "", "a,a,a"(<4 x i32> %quad.lo, <4 x i32> %quad.hi, i32 %single)
  46   call void asm sideeffect "", "a"(<2 x i32> %pair)
  47   ret void
  48 }
  49
  50 ; GCN-LABEL: {{^}}max_32regs_mfma32:
  51 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
  52 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
  53 ; GCN-NOT: buffer_store_dword
  54 ; GCN:     v_accvgpr_read_b32
  55 ; GCN:     v_mfma_f32_32x32x1f32
  56 ; GCN-NOT: buffer_load_dword
  57 ; GCN:     v_accvgpr_write_b32
  58 ; GCN:     ScratchSize: 0
     ; Kernel built with attribute group #3. One i32 defined through an "=a"
     ; operand has to survive a 32-wide MFMA before it is consumed. The checks
     ; above expect an accvgpr read before the MFMA and an accvgpr write after
     ; it, with no buffer stores/loads and a scratch size of 0.
  59 define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
  60 entry:
  61   %held = call i32 asm sideeffect "", "=a"()
  62   br label %body
  63
  64 body:
     ; 32-element accumulator result; %held is still needed after this call.
  65   %acc = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 2.0>, i32 0, i32 0, i32 0)
  66   call void asm sideeffect "", "a"(i32 %held)
     ; Only element 0 of the MFMA result is written out.
  67   %lane0 = extractelement <32 x float> %acc, i32 0
  68   store float %lane0, ptr addrspace(1) %arg
  69   ret void
  70 }
  71
  72 ; Should spill AGPRs to memory on both gfx908 and gfx90a.
  73 ; GCN-LABEL: {{^}}max_6regs_used_8a:
  74 ; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
  75 ; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
  76
     ; gfx908 spill: each AGPR is first read into v5, then v5 is stored.
  77 ; GFX908-DAG:  v_accvgpr_read_b32 v5, a0 ; Reload Reuse
  78 ; GFX908-DAG:  buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
  79 ; GFX908-DAG:  v_accvgpr_read_b32 v5, a1 ; Reload Reuse
  80 ; GFX908-DAG:  buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
  81 ; GFX908-DAG:  v_accvgpr_read_b32 v5, a2 ; Reload Reuse
  82 ; GFX908-DAG:  buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
  83 ; GFX908-DAG:  v_accvgpr_read_b32 v5, a3 ; Reload Reuse
  84 ; GFX908-DAG:  buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill
  85
     ; gfx90a spill: the AGPRs are stored directly.
  86 ; GFX90A-DAG:  buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
  87 ; GFX90A-DAG:  buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
  88 ; GFX90A-DAG:  buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
  89 ; GFX90A-DAG:  buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill
  90
  91 ; GCN:  v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
  92
     ; Reloads land in VGPRs on both subtargets and feed the volatile store.
  93 ; GFX908-DAG:  buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
  94 ; GFX908-DAG:  buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
  95 ; GFX908-DAG:  buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
  96 ; GFX908-DAG:  buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload
  97 ; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off
  98
     ; The scratch descriptor register range is matched by pattern here, the
     ; same way as in every other spill/reload check of this function.
  99 ; GFX90A-DAG:  buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
 100 ; GFX90A-DAG:  buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
 101 ; GFX90A-DAG:  buffer_load_dword v4, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
 102 ; GFX90A-DAG:  buffer_load_dword v5, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload
 103 ; GFX90A:  global_store_dwordx4 v[0:1], v[2:5], off
 104
 105 ; GCN: ScratchSize: 20
     ; Kernel built with attribute group #4. %v0 (a "=v" value) and %a4 (a
     ; 4-wide "=a" value) are both defined before the MFMA and consumed after
     ; it, so they are live across the MFMA, whose checked form uses a[0:3]
     ; for both accumulator input and result.
 106 define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
 107   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 108   %v0 = call float asm sideeffect "; def $0", "=v"()
 109   %a4 = call <4 x float> asm sideeffect "; def $0", "=a"()
 110   %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
 111   %mai.in = load <4 x float>, ptr addrspace(1) %gep
 112   %mai.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.in, i32 0, i32 0, i32 0)
 113   store <4 x float> %mai.out, ptr addrspace(1) %gep
     ; Volatile so the store of %a4 cannot be dropped; the address is irrelevant.
 114   store volatile <4 x float> %a4, ptr addrspace(1) undef
 115   call void asm sideeffect "; use $0", "v"(float %v0)
 116   ret void
 117 }
 118
 119 declare i32 @llvm.amdgcn.workitem.id.x()
     ; MFMA intrinsics: (float, float, accumulator vector, i32, i32, i32) -> vector
     ; of the accumulator's type.
     ; NOTE(review): no function visible in this file calls the 16x16x1f32 variant.
 120 declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
 121 declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
 122 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 123
 124 attributes #1 = { nounwind "amdgpu-num-vgpr"="10" } ; used by @max_10_vgprs_used_9a
 125 attributes #2 = { nounwind "amdgpu-num-vgpr"="12" } ; used by @max_12regs_13a_used
 126 attributes #3 = { nounwind "amdgpu-num-vgpr"="32" } ; used by @max_32regs_mfma32
 127 attributes #4 = { nounwind "amdgpu-num-vgpr"="6" } ; used by @max_6regs_used_8a
     ; Each group differs only in its "amdgpu-num-vgpr" value, which matches the
     ; register count in the name of the kernel that uses it (presumably the
     ; per-kernel VGPR budget; see the AMDGPU backend documentation).