llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll

   1 ; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
   2 ; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=ASM %s
   3 ; The opt invocation checks whether each alloca survives -amdgpu-promote-alloca; the llc invocation checks the LDS size reported for each kernel.
   4 target datalayout = "A5" ; allocas are created in address space 5
   5
   6 @all_lds = internal unnamed_addr addrspace(3) global [16384 x i32] undef, align 4 ; 16384 x 4 bytes = 65536 bytes
   7 @some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4 ; 32 x 4 bytes = 128 bytes
   8
   9 @initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4 ; initializer holds the address of @some_lds
  10 @initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4 ; initializer holds the address of @all_lds
  11
  12 ; The alloca here cannot be promoted to LDS: @all_lds, which the function
  13 ; body reaches only through a constant expression, already accounts for
  14 ; 65536 bytes. Such constant-expression uses were previously not detected.
  15 ; IR-LABEL: @constant_expression_uses_all_lds(
  16 ; IR: alloca
  17
  18 ; ASM-LABEL: {{^}}constant_expression_uses_all_lds:
  19 ; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
  20 define amdgpu_kernel void @constant_expression_uses_all_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
  21 entry:
  22   %stack = alloca [4 x i32], align 4, addrspace(5) ; 16-byte array in addrspace(5): the promotion candidate
  23   %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  24   %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  25   %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  26   %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  27   store i32 9, i32 addrspace(5)* %gep0 ; initialize all four elements with constants
  28   store i32 10, i32 addrspace(5)* %gep1
  29   store i32 99, i32 addrspace(5)* %gep2
  30   store i32 43, i32 addrspace(5)* %gep3
  31   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx ; index comes from the %idx kernel argument
  32   %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  33   store i32 %load, i32 addrspace(1)* %out ; write the selected element to %out
  34
  35   store volatile i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), i32 addrspace(1)* undef ; only reference to @all_lds in this function, via a ptrtoint constant expression
  36   ret void
  37 }
  38
  39 ; @some_lds is used through a single level of constant expression, but it
  40 ; is too small (128 bytes) to block promotion of the alloca.
  41 ; Expected size 4224 = 128 + 4096; the 4096 is consistent with 16 bytes x 256 work-items for the promoted array.
  42 ; IR-LABEL: @constant_expression_uses_some_lds(
  43 ; IR-NOT: alloca
  44
  45 ; ASM-LABEL: {{^}}constant_expression_uses_some_lds:
  46 ; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
  47 define amdgpu_kernel void @constant_expression_uses_some_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
  48 entry:
  49   %stack = alloca [4 x i32], align 4, addrspace(5) ; 16-byte array in addrspace(5): the promotion candidate
  50   %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  51   %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  52   %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  53   %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  54   store i32 9, i32 addrspace(5)* %gep0 ; initialize all four elements with constants
  55   store i32 10, i32 addrspace(5)* %gep1
  56   store i32 99, i32 addrspace(5)* %gep2
  57   store i32 43, i32 addrspace(5)* %gep3
  58   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx ; index comes from the %idx kernel argument
  59   %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  60   store i32 %load, i32 addrspace(1)* %out ; write the selected element to %out
  61   store volatile i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), i32 addrspace(1)* undef ; only reference to @some_lds in this function, via a ptrtoint constant expression
  62   ret void
  63 }
  64
  65 declare void @callee(i8*) ; body not defined in this file; the *_multi_level kernels pass it a pointer derived from an LDS global
  66 ; @all_lds is reached through nested constant expressions passed to @callee; promotion of the alloca is still expected to be blocked.
  67 ; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
  68 ; IR: alloca
  69
  70 ; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level:
  71 ; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
  72 define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
  73 entry:
  74   %stack = alloca [4 x i32], align 4, addrspace(5) ; 16-byte array in addrspace(5): the promotion candidate
  75   %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  76   %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  77   %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  78   %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  79   store i32 9, i32 addrspace(5)* %gep0 ; initialize all four elements with constants
  80   store i32 10, i32 addrspace(5)* %gep1
  81   store i32 99, i32 addrspace(5)* %gep2
  82   store i32 43, i32 addrspace(5)* %gep3
  83   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx ; index comes from the %idx kernel argument
  84   %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  85   store i32 %load, i32 addrspace(1)* %out ; write the selected element to %out
  86   call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([16384 x i32], [16384 x i32] addrspace(3)* @all_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*)) ; @all_lds sits under getelementptr -> bitcast -> addrspacecast
  87   ret void
  88 }
  89 ; @some_lds (128 bytes) is reached through nested constant expressions passed to @callee; the alloca is still expected to be promoted.
  90 ; IR-LABEL: @constant_expression_uses_some_lds_multi_level(
  91 ; IR-NOT: alloca
  92 ; IR: llvm.amdgcn.workitem.id
  93
  94 ; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level:
  95 ; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
  96 define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
  97 entry:
  98   %stack = alloca [4 x i32], align 4, addrspace(5) ; 16-byte array in addrspace(5): the promotion candidate
  99   %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
 100   %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
 101   %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
 102   %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
 103   store i32 9, i32 addrspace(5)* %gep0 ; initialize all four elements with constants
 104   store i32 10, i32 addrspace(5)* %gep1
 105   store i32 99, i32 addrspace(5)* %gep2
 106   store i32 43, i32 addrspace(5)* %gep3
 107   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx ; index comes from the %idx kernel argument
 108   %load = load i32, i32 addrspace(5)* %arrayidx, align 4
 109   store i32 %load, i32 addrspace(1)* %out ; write the selected element to %out
 110   call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @some_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*)) ; @some_lds sits under getelementptr -> bitcast -> addrspacecast
 111   ret void
 112 }
 113 ; The kernel only references @initializer_user_some, whose initializer takes the address of @some_lds; the alloca is still expected to be promoted.
 114 ; IR-LABEL: @constant_expression_uses_some_lds_global_initializer(
 115 ; IR-NOT: alloca
 116 ; IR: llvm.amdgcn.workitem.id
 117
 118 ; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
 119 ; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
 120 define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
 121 entry:
 122   %stack = alloca [4 x i32], align 4, addrspace(5) ; 16-byte array in addrspace(5): the promotion candidate
 123   %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
 124   %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
 125   %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
 126   %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
 127   store i32 9, i32 addrspace(5)* %gep0 ; initialize all four elements with constants
 128   store i32 10, i32 addrspace(5)* %gep1
 129   store i32 99, i32 addrspace(5)* %gep2
 130   store i32 43, i32 addrspace(5)* %gep3
 131   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx ; index comes from the %idx kernel argument
 132   %load = load i32, i32 addrspace(5)* %arrayidx, align 4
 133   store i32 %load, i32 addrspace(1)* %out ; write the selected element to %out
 134
 135   store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_some to i32), i32 addrspace(1)* undef ; no direct @some_lds reference; the expected size (4096) leaves out its 128 bytes
 136   ret void
 137 }
 138
 139 ; We can't actually handle LDS addresses in global initializers,
 140 ; but this should count as usage.
 141
 142 ; IR-LABEL: @constant_expression_uses_all_lds_global_initializer(
 143 ; IR: alloca
 144 ; NOTE(review): the size check below uses '.group_segment_fixed_size' followed by a colon and has no {{$}}, unlike the '.amdhsa_group_segment_fixed_size N{{$}}' form used for the other kernels - confirm this is intentional.
 145 ; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
 146 ; ASM: .group_segment_fixed_size: 65536
 147 define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
 148 entry:
 149   %stack = alloca [4 x i32], align 4, addrspace(5) ; 16-byte array in addrspace(5): the promotion candidate
 150   %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
 151   %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
 152   %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
 153   %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
 154   store i32 9, i32 addrspace(5)* %gep0 ; initialize all four elements with constants
 155   store i32 10, i32 addrspace(5)* %gep1
 156   store i32 99, i32 addrspace(5)* %gep2
 157   store i32 43, i32 addrspace(5)* %gep3
 158   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx ; index comes from the %idx kernel argument
 159   %load = load i32, i32 addrspace(5)* %arrayidx, align 4
 160   store i32 %load, i32 addrspace(1)* %out ; write the selected element to %out
 161   store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_all to i32), i32 addrspace(1)* undef ; @all_lds is referenced only inside @initializer_user_all's initializer
 162   ret void
 163 }
 164
 165 attributes #0 = { "amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="256,256" } ; attached to every kernel in this file; the flat work-group size is given as 256,256