llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s
   3 ; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s
   4
   5 ; FIXME: Generated test checks do not check metadata at the end of the
   6 ; function, so this also includes manually added checks.
   7
   8 ; Test that we can select a statically sized alloca outside of the
   9 ; entry block.
  10
  11 ; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
  12 ; alignment less than the stack alignment.
  13 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
  14 ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
  15 ; GCN:       ; %bb.0: ; %entry
  16 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x8
  17 ; GCN-NEXT:    s_add_u32 s0, s0, s9
  18 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
  19 ; GCN-NEXT:    s_movk_i32 s32, 0x400
  20 ; GCN-NEXT:    s_mov_b32 s33, 0
  21 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  22 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
  23 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
  24 ; GCN-NEXT:  ; %bb.1: ; %bb.0
  25 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0xc
  26 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  27 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
  28 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
  29 ; GCN-NEXT:  ; %bb.2: ; %bb.1
  30 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x10
  31 ; GCN-NEXT:    s_add_u32 s7, s32, 0x1000
  32 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
  33 ; GCN-NEXT:    v_mov_b32_e32 v2, s7
  34 ; GCN-NEXT:    v_mov_b32_e32 v3, 1
  35 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  36 ; GCN-NEXT:    s_lshl_b32 s6, s6, 2
  37 ; GCN-NEXT:    s_add_u32 s6, s7, s6
  38 ; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
  39 ; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
  40 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
  41 ; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
  42 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
  43 ; GCN-NEXT:    s_waitcnt vmcnt(0)
  44 ; GCN-NEXT:    v_add_u32_e32 v0, v2, v0
  45 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  46 ; GCN-NEXT:    global_store_dword v1, v0, s[4:5]
  47 ; GCN-NEXT:  .LBB0_3: ; %bb.2
  48 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
  49 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
  50 ; GCN-NEXT:    s_waitcnt vmcnt(0)
  51 ; GCN-NEXT:    s_endpgm
  52
  53 entry:
  54   %cond0 = icmp eq i32 %arg.cond0, 0
  55   br i1 %cond0, label %bb.0, label %bb.2
  56
  57 bb.0:
  58   %alloca = alloca [16 x i32], align 4, addrspace(5)
  59   %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  60   %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  61   %cond1 = icmp eq i32 %arg.cond1, 0
  62   br i1 %cond1, label %bb.1, label %bb.2
  63
  64 bb.1:
  65   ; Use the alloca outside of the defining block.
  66   store i32 0, i32 addrspace(5)* %gep0
  67   store i32 1, i32 addrspace(5)* %gep1
  68   %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  69   %load = load i32, i32 addrspace(5)* %gep2
  70   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  71   %add = add i32 %load, %tid
  72   store i32 %add, i32 addrspace(1)* %out
  73   br label %bb.2
  74
  75 bb.2:
  76   store volatile i32 0, i32 addrspace(1)* undef
  77   ret void
  78 }
  79 ; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
  80 ; DEFAULTSIZE: ; ScratchSize: 4112
  81
  82 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
  83 ; ASSUME1024: ; ScratchSize: 1040
  84
  85 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
  86 ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
  87 ; GCN:       ; %bb.0: ; %entry
  88 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x8
  89 ; GCN-NEXT:    s_add_u32 s0, s0, s9
  90 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
  91 ; GCN-NEXT:    s_movk_i32 s32, 0x1000
  92 ; GCN-NEXT:    s_mov_b32 s33, 0
  93 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  94 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
  95 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
  96 ; GCN-NEXT:  ; %bb.1: ; %bb.0
  97 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0xc
  98 ; GCN-NEXT:    s_add_u32 s7, s32, 0x1000
  99 ; GCN-NEXT:    s_and_b32 s7, s7, 0xfffff000
 100 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 101 ; GCN-NEXT:    v_mov_b32_e32 v2, s7
 102 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 103 ; GCN-NEXT:    s_lshl_b32 s6, s6, 2
 104 ; GCN-NEXT:    v_mov_b32_e32 v3, 1
 105 ; GCN-NEXT:    s_add_u32 s6, s7, s6
 106 ; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 107 ; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
 108 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
 109 ; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 110 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 111 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 112 ; GCN-NEXT:    v_add_u32_e32 v0, v2, v0
 113 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 114 ; GCN-NEXT:    global_store_dword v1, v0, s[4:5]
 115 ; GCN-NEXT:  .LBB1_2: ; %bb.1
 116 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 117 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 118 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 119 ; GCN-NEXT:    s_endpgm
 120 entry:
 121   %cond = icmp eq i32 %arg.cond, 0
 122   br i1 %cond, label %bb.0, label %bb.1
 123
 124 bb.0:
 125   %alloca = alloca [16 x i32], align 64, addrspace(5)
 126   %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
 127   %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
 128   store i32 0, i32 addrspace(5)* %gep0
 129   store i32 1, i32 addrspace(5)* %gep1
 130   %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
 131   %load = load i32, i32 addrspace(5)* %gep2
 132   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 133   %add = add i32 %load, %tid
 134   store i32 %add, i32 addrspace(1)* %out
 135   br label %bb.1
 136
 137 bb.1:
 138   store volatile i32 0, i32 addrspace(1)* undef
 139   ret void
 140 }
 141
 142 ; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
 143 ; DEFAULTSIZE: ; ScratchSize: 4160
 144
 145 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
 146 ; ASSUME1024: ; ScratchSize: 1088
 147
 148
 149 define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
 150 ; GCN-LABEL: func_non_entry_block_static_alloca_align4:
 151 ; GCN:       ; %bb.0: ; %entry
 152 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 153 ; GCN-NEXT:    s_mov_b32 s7, s33
 154 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 155 ; GCN-NEXT:    s_mov_b32 s33, s32
 156 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 157 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 158 ; GCN-NEXT:    s_cbranch_execz .LBB2_3
 159 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 160 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 161 ; GCN-NEXT:    s_and_b64 exec, exec, vcc
 162 ; GCN-NEXT:    s_cbranch_execz .LBB2_3
 163 ; GCN-NEXT:  ; %bb.2: ; %bb.1
 164 ; GCN-NEXT:    s_add_u32 s6, s32, 0x1000
 165 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 166 ; GCN-NEXT:    v_mov_b32_e32 v3, s6
 167 ; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 168 ; GCN-NEXT:    v_mov_b32_e32 v2, 1
 169 ; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
 170 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v4
 171 ; GCN-NEXT:    v_add_u32_e32 v2, s6, v2
 172 ; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 173 ; GCN-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
 174 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 175 ; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
 176 ; GCN-NEXT:    global_store_dword v[0:1], v2, off
 177 ; GCN-NEXT:  .LBB2_3: ; %bb.2
 178 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 179 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 180 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 181 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 182 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 183 ; GCN-NEXT:    s_mov_b32 s33, s7
 184 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 185
 186 entry:
 187   %cond0 = icmp eq i32 %arg.cond0, 0
 188   br i1 %cond0, label %bb.0, label %bb.2
 189
 190 bb.0:
 191   %alloca = alloca [16 x i32], align 4, addrspace(5)
 192   %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
 193   %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
 194   %cond1 = icmp eq i32 %arg.cond1, 0
 195   br i1 %cond1, label %bb.1, label %bb.2
 196
 197 bb.1:
 198   ; Use the alloca outside of the defining block.
 199   store i32 0, i32 addrspace(5)* %gep0
 200   store i32 1, i32 addrspace(5)* %gep1
 201   %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
 202   %load = load i32, i32 addrspace(5)* %gep2
 203   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 204   %add = add i32 %load, %tid
 205   store i32 %add, i32 addrspace(1)* %out
 206   br label %bb.2
 207
 208 bb.2:
 209   store volatile i32 0, i32 addrspace(1)* undef
 210   ret void
 211 }
 212
 213 define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
 214 ; GCN-LABEL: func_non_entry_block_static_alloca_align64:
 215 ; GCN:       ; %bb.0: ; %entry
 216 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 217 ; GCN-NEXT:    s_mov_b32 s7, s33
 218 ; GCN-NEXT:    s_add_i32 s33, s32, 0xfc0
 219 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 220 ; GCN-NEXT:    s_and_b32 s33, s33, 0xfffff000
 221 ; GCN-NEXT:    s_addk_i32 s32, 0x2000
 222 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 223 ; GCN-NEXT:    s_cbranch_execz .LBB3_2
 224 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 225 ; GCN-NEXT:    s_add_u32 s6, s32, 0x1000
 226 ; GCN-NEXT:    s_and_b32 s6, s6, 0xfffff000
 227 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 228 ; GCN-NEXT:    v_mov_b32_e32 v4, s6
 229 ; GCN-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 230 ; GCN-NEXT:    v_mov_b32_e32 v2, 1
 231 ; GCN-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
 232 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v3
 233 ; GCN-NEXT:    v_add_u32_e32 v2, s6, v2
 234 ; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
 235 ; GCN-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
 236 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 237 ; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
 238 ; GCN-NEXT:    global_store_dword v[0:1], v2, off
 239 ; GCN-NEXT:  .LBB3_2: ; %bb.1
 240 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 241 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 242 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 243 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 244 ; GCN-NEXT:    s_addk_i32 s32, 0xe000
 245 ; GCN-NEXT:    s_mov_b32 s33, s7
 246 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 247 entry:
 248   %cond = icmp eq i32 %arg.cond, 0
 249   br i1 %cond, label %bb.0, label %bb.1
 250
 251 bb.0:
 252   %alloca = alloca [16 x i32], align 64, addrspace(5)
 253   %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
 254   %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
 255   store i32 0, i32 addrspace(5)* %gep0
 256   store i32 1, i32 addrspace(5)* %gep1
 257   %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
 258   %load = load i32, i32 addrspace(5)* %gep2
 259   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 260   %add = add i32 %load, %tid
 261   store i32 %add, i32 addrspace(1)* %out
 262   br label %bb.1
 263
 264 bb.1:
 265   store volatile i32 0, i32 addrspace(1)* undef
 266   ret void
 267 }
 268
 269 declare i32 @llvm.amdgcn.workitem.id.x() #0
 270
 271 attributes #0 = { nounwind readnone speculatable }