llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll

   1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
   2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
   3
   4 declare void @extern_func()
   5
   6 define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
   7 ; The vgpr tuple8 operand of the image_gather4_c_b_cl instruction need not be
   8 ; preserved across the call to @extern_func and should get 8 scratch registers (the checks below expect v[32:36], since the inline asm clobbers v0-v31).
   9
  10 ; GFX9-LABEL: non_preserved_vgpr_tuple8:
  11 ; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
  12 ; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
  13 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
  14 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
  15 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
  16
  17 ; GFX9: v_mov_b32_e32 v36, v16
  18 ; GFX9-NEXT: v_mov_b32_e32 v35, v15
  19 ; GFX9-NEXT: v_mov_b32_e32 v34, v14
  20 ; GFX9-NEXT: v_mov_b32_e32 v33, v13
  21 ; GFX9-NEXT: v_mov_b32_e32 v32, v12
  22 ; GFX9: ;;#ASMSTART
  23 ; GFX9-NEXT: ;;#ASMEND
  24 ; GFX9: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
  25 ; GFX9-NEXT: s_getpc_b64 s[4:5]
  26 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
  27 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
  28 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
  29 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0
  30 ; GFX9: s_waitcnt lgkmcnt(0)
  31 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
  32
  33 ; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
  34 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
  35 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
  36 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
  37 ; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
  38 ; GFX9: s_setpc_b64 s[4:5]
  39 ;
  40 ; GFX10-LABEL: non_preserved_vgpr_tuple8:
  41 ; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
  42 ; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
  43 ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
  44 ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
  45 ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
  46
  47 ; GFX10: v_mov_b32_e32 v36, v16
  48 ; GFX10-NEXT: v_mov_b32_e32 v35, v15
  49 ; GFX10-NEXT: v_mov_b32_e32 v34, v14
  50 ; GFX10-NEXT: v_mov_b32_e32 v33, v13
  51 ; GFX10-NEXT: v_mov_b32_e32 v32, v12
  52
  53 ; GFX10: ;;#ASMSTART
  54 ; GFX10-NEXT: ;;#ASMEND
  55
  56 ; GFX10: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
  57 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
  58 ; GFX10-NEXT: s_getpc_b64 s[4:5]
  59 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
  60 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
  61 ; GFX10: s_load_dwordx2 s[4:5], s[4:5], 0x0
  62 ; GFX10: s_waitcnt lgkmcnt(0)
  63 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
  64
  65 ; GFX10: buffer_load_dword v44, off, s[0:3], s33
  66 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4
  67 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
  68 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
  69
  70 ; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
  71 ; GFX10: s_setpc_b64 s[4:5]
  72 main_body:
  73   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
  74   call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
  75   call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
  76   call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0
  77   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
  78   call void @extern_func()
  79   ret <4 x float> %v
  80 }
  81
  82 define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
  83 ; The vgpr tuple8 operand of the image_gather4_c_b_cl instruction needs to be preserved
  84 ; across the call to @extern_func and should get allocated to 8 CSRs.
  85 ; Only the lower 5 sub-registers of the tuple are preserved (v41-v45 in the checks below).
  86 ; The upper 3 sub-registers are unused.
  87
  88 ; GFX9-LABEL: call_preserved_vgpr_tuple8:
  89 ; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
  90 ; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
  91 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
  92 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
  93 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
  94 ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
  95
  96 ; GFX9: v_mov_b32_e32 v45, v16
  97 ; GFX9-NEXT: v_mov_b32_e32 v44, v15
  98 ; GFX9-NEXT: v_mov_b32_e32 v43, v14
  99 ; GFX9-NEXT: v_mov_b32_e32 v42, v13
 100 ; GFX9-NEXT: v_mov_b32_e32 v41, v12
 101
 102 ; GFX9: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
 103 ; GFX9-NEXT: s_getpc_b64 s[4:5]
 104 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 105 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
 106 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 107 ; GFX9: s_waitcnt vmcnt(0)
 108 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
 109 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 110 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
 111 ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
 112
 113 ; GFX9: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
 114 ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 115 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 116 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 117 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 118
 119 ; GFX9: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 120 ; GFX9: s_setpc_b64 s[4:5]
 121 ;
 122 ; GFX10-LABEL: call_preserved_vgpr_tuple8:
 123 ; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 124 ; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 125 ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 126 ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 127 ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 128 ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
 129
 130
 131 ; GFX10:      image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 132 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 133 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 134 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 135 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
 136 ; GFX10-NEXT: v_mov_b32_e32 v41, v16
 137 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 138 ; GFX10-NEXT: v_mov_b32_e32 v42, v15
 139 ; GFX10-NEXT: v_mov_b32_e32 v43, v14
 140 ; GFX10-NEXT: v_mov_b32_e32 v44, v13
 141 ; GFX10-NEXT: v_mov_b32_e32 v45, v12
 142 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 143 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
 144 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 145 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
 146 ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 147
 148 ; GFX10: buffer_load_dword v45, off, s[0:3], s33{{$}}
 149 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4
 150 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8
 151 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12
 152 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16
 153 ; GFX10: buffer_load_dword v40, off, s[0:3], s32 offset:20
 154 ; GFX10: s_setpc_b64 s[4:5]
 155 main_body:
 156   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
 157   store <4 x float> %v, <4 x float> addrspace(1)* undef
 158   call void @extern_func()
 159   %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
 160   ret <4 x float> %v1
 161 }
 162
 163 declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1
 164
 165 attributes #0 = { nounwind writeonly }
 166 attributes #1 = { nounwind readonly }