llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll

   1 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-lower-kernel-attributes -instcombine %s | FileCheck -enable-var-scope %s
   2 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope %s
   3
   4 target datalayout = "n32" ; "n32" declares 32 bits as the native integer width
   5
   6 ; CHECK-LABEL: @invalid_reqd_work_group_size(
   7 ; CHECK: load i16,
     ; Negative test: the kernel is tagged with !1, which lists only two sizes
     ; instead of three. The expected output still contains the i16 load, i.e.
     ; the load at byte offset 4 of the dispatch pointer is not replaced by a
     ; constant.
   8 define amdgpu_kernel void @invalid_reqd_work_group_size(i16 addrspace(1)* %out) #0 !reqd_work_group_size !1 {
   9   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  10   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  11   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  12   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  13   store i16 %group.size.x, i16 addrspace(1)* %out
  14   ret void
  15 }
  16
  17 ; CHECK-LABEL: @volatile_load_group_size_x(
  18 ; CHECK: load volatile i16,
     ; Same access as @load_group_size_x (i16 at byte offset 4 of the dispatch
     ; pointer, kernel tagged with !0), except that the load is volatile. The
     ; expected output still contains the volatile load, so it must survive.
  19 define amdgpu_kernel void @volatile_load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  20   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  21   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  22   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  23   %group.size.x = load volatile i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  24   store i16 %group.size.x, i16 addrspace(1)* %out
  25   ret void
  26 }
  27
  28 ; CHECK-LABEL: @load_group_size_x(
  29 ; CHECK-NEXT: store i16 8,
     ; Positive test: loads the i16 at byte offset 4 of the dispatch pointer and
     ; stores it to %out. With !0 attached, the expected output is a direct store
     ; of 8, the first element of !0.
  30 define amdgpu_kernel void @load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  31   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  32   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  33   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  34   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  35   store i16 %group.size.x, i16 addrspace(1)* %out
  36   ret void
  37 }
  38
  39 ; CHECK-LABEL: @load_group_size_y(
  40 ; CHECK-NEXT: store i16 16,
     ; Positive test: loads the i16 at byte offset 6 of the dispatch pointer and
     ; stores it to %out. With !0 attached, the expected output is a direct store
     ; of 16, the second element of !0.
  41 define amdgpu_kernel void @load_group_size_y(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  42   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  43   %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
  44   %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
     ; align 2: byte offset 6 from the dispatch pointer is only 2-byte aligned,
     ; and claiming more alignment than the address has is undefined behavior.
  45   %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 2
  46   store i16 %group.size.y, i16 addrspace(1)* %out
  47   ret void
  48 }
  49
  50 ; CHECK-LABEL: @load_group_size_z(
  51 ; CHECK-NEXT: store i16 2,
     ; Positive test: loads the i16 at byte offset 8 of the dispatch pointer and
     ; stores it to %out. With !0 attached, the expected output is a direct store
     ; of 2, the third element of !0.
  52 define amdgpu_kernel void @load_group_size_z(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  53   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  54   %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
  55   %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
  56   %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
  57   store i16 %group.size.z, i16 addrspace(1)* %out
  58   ret void
  59 }
  60
  61 ; Metadata uses i64 instead of i32
  62 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i64(
  63 ; CHECK-NEXT: store i16 8,
     ; Same body as @load_group_size_x, but tagged with !2, whose three elements
     ; are i64 constants. The expected output is still a direct store of i16 8.
  64 define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i64(i16 addrspace(1)* %out) #0 !reqd_work_group_size !2 {
  65   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  66   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  67   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  68   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  69   store i16 %group.size.x, i16 addrspace(1)* %out
  70   ret void
  71 }
  72
  73 ; Metadata uses i16 instead of i32
  74 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i16(
  75 ; CHECK-NEXT: store i16 8,
     ; Same body as @load_group_size_x, but tagged with !3, whose three elements
     ; are i16 constants. The expected output is still a direct store of i16 8.
  76 define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i16(i16 addrspace(1)* %out) #0 !reqd_work_group_size !3 {
  77   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  78   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  79   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  80   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  81   store i16 %group.size.x, i16 addrspace(1)* %out
  82   ret void
  83 }
  84
  85 ; CHECK-LABEL: @use_local_size_x_8_16_2(
  86 ; CHECK-NEXT: store i64 8,
     ; Computes the unsigned minimum of (grid.size.x - group.id * group.size.x)
     ; and group.size.x, zero-extends it to i64 and stores it to %out. The kernel
     ; carries !0 and attribute group #0 ("uniform-work-group-size"="true"); the
     ; expected output is a direct store of the constant 8.
  87 define amdgpu_kernel void @use_local_size_x_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  88   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  89   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  90   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  91   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  92   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  93   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  94   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  95   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  96   %group.size.x.zext = zext i16 %group.size.x to i32
  97   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  98   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  99   %cmp = icmp ult i32 %sub, %group.size.x.zext
 100   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 101   %zext = zext i32 %select to i64
 102   store i64 %zext, i64 addrspace(1)* %out
 103   ret void
 104 }
 105
 106 ; CHECK-LABEL: @use_local_size_y_8_16_2(
 107 ; CHECK-NEXT: store i64 16,
     ; Y-dimension variant of the local-size pattern: the unsigned minimum of
     ; (grid.size.y - group.id * group.size.y) and group.size.y, zero-extended to
     ; i64 and stored to %out. The group size is the i16 at byte offset 6, the
     ; grid size the i32 at byte offset 16, and the id comes from workgroup.id.y.
     ; With !0 and attribute group #0 the expected output is a store of 16.
 108 define amdgpu_kernel void @use_local_size_y_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 109   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 110   %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
 111   %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
     ; align 2: byte offset 6 from the dispatch pointer is only 2-byte aligned,
     ; and claiming more alignment than the address has is undefined behavior.
 112   %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 2
 113   %gep.grid.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
 114   %gep.grid.size.y.bc = bitcast i8 addrspace(4)* %gep.grid.size.y to i32 addrspace(4)*
 115   %grid.size.y = load i32, i32 addrspace(4)* %gep.grid.size.y.bc, align 4
 116   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
 117   %group.size.y.zext = zext i16 %group.size.y to i32
 118   %group.id_x_group.size.y = mul i32 %group.id, %group.size.y.zext
 119   %sub = sub i32 %grid.size.y, %group.id_x_group.size.y
 120   %cmp = icmp ult i32 %sub, %group.size.y.zext
 121   %select = select i1 %cmp, i32 %sub, i32 %group.size.y.zext
 122   %zext = zext i32 %select to i64
 123   store i64 %zext, i64 addrspace(1)* %out
 124   ret void
 125 }
 126
 127 ; CHECK-LABEL: @use_local_size_z_8_16_2(
 128 ; CHECK-NEXT: store i64 2,
     ; Z-dimension variant of the local-size pattern: the unsigned minimum of
     ; (grid.size.z - group.id * group.size.z) and group.size.z, zero-extended to
     ; i64 and stored to %out. The group size is the i16 at byte offset 8, the
     ; grid size the i32 at byte offset 20, and the id comes from workgroup.id.z.
     ; With !0 and attribute group #0 the expected output is a store of 2.
 129 define amdgpu_kernel void @use_local_size_z_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 130   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 131   %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
 132   %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
 133   %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
 134   %gep.grid.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 20
 135   %gep.grid.size.z.bc = bitcast i8 addrspace(4)* %gep.grid.size.z to i32 addrspace(4)*
 136   %grid.size.z = load i32, i32 addrspace(4)* %gep.grid.size.z.bc, align 4
 137   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
 138   %group.size.z.zext = zext i16 %group.size.z to i32
 139   %group.id_x_group.size.z = mul i32 %group.id, %group.size.z.zext
 140   %sub = sub i32 %grid.size.z, %group.id_x_group.size.z
 141   %cmp = icmp ult i32 %sub, %group.size.z.zext
 142   %select = select i1 %cmp, i32 %sub, i32 %group.size.z.zext
 143   %zext = zext i32 %select to i64
 144   store i64 %zext, i64 addrspace(1)* %out
 145   ret void
 146 }
 147
 148 ; Simplification on select is invalid, but we can still eliminate the
 149 ; load of the group size.
 150
 151 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_group_id(
 152 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
 153 ; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
     ; Negative variant of @use_local_size_x_8_16_2: the group size and grid size
     ; are the x values (byte offsets 4 and 12), but the id comes from
     ; workgroup.id.y. The expected output keeps the workgroup.id.y call and a
     ; multiply by -8, so the group size load becomes the constant 8 while the
     ; id-dependent arithmetic remains.
 154 define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 155   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 156   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 157   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 158   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 159   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 160   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 161   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 162   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y() ; deliberately y, not x
 163   %group.size.x.zext = zext i16 %group.size.x to i32
 164   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 165   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 166   %cmp = icmp ult i32 %sub, %group.size.x.zext
 167   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 168   %zext = zext i32 %select to i64
 169   store i64 %zext, i64 addrspace(1)* %out
 170   ret void
 171 }
 172
 173 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_grid_size(
 174 ; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 175 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 176 ; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
     ; Negative variant of @use_local_size_x_8_16_2: the value named grid.size.x
     ; is loaded from byte offset 16 instead of 12. The expected output keeps that
     ; grid size load, the workgroup.id.x call and a multiply by -8, so only the
     ; group size load becomes the constant 8.
 177 define amdgpu_kernel void @local_size_x_8_16_2_wrong_grid_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 178   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 179   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 180   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 181   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 182   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16 ; deliberately 16, not 12
 183   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 184   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 185   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 186   %group.size.x.zext = zext i16 %group.size.x to i32
 187   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 188   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 189   %cmp = icmp ult i32 %sub, %group.size.x.zext
 190   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 191   %zext = zext i32 %select to i64
 192   store i64 %zext, i64 addrspace(1)* %out
 193   ret void
 194 }
 195
 196 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_cmp_type(
 197 ; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 198 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 199 ; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
 200 ; CHECK: %sub = add i32 %group.id_x_group.size.x.neg, %grid.size.x
 201 ; CHECK: %cmp = icmp slt i32 %sub, 8
 202 ; CHECK: %select = select i1 %cmp, i32 %sub, i32 8
     ; Negative variant of @use_local_size_x_8_16_2: the compare is signed (slt)
     ; instead of unsigned (ult). The expected output keeps the compare and the
     ; select, with the group size operand replaced by the constant 8.
 203 define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 204   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 205   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 206   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 207   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 208   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 209   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 210   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 211   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 212   %group.size.x.zext = zext i16 %group.size.x to i32
 213   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 214   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 215   %cmp = icmp slt i32 %sub, %group.size.x.zext ; deliberately signed
 216   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 217   %zext = zext i32 %select to i64
 218   store i64 %zext, i64 addrspace(1)* %out
 219   ret void
 220 }
 221
 222 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_select(
 223 ; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
 224 ; CHECK: %sub = add i32 %group.id_x_group.size.x.neg, %grid.size.x
 225 ; CHECK: %1 = icmp ugt i32 %sub, 8
 226 ; CHECK: %select = select i1 %1, i32 %sub, i32 8
 227 ; CHECK: %zext = zext i32 %select to i64
     ; Negative variant of @use_local_size_x_8_16_2: the two select arms are
     ; swapped, so the select yields the larger value rather than the smaller
     ; one. The expected output keeps a compare and a select, with the group size
     ; operand replaced by the constant 8.
 228 define amdgpu_kernel void @local_size_x_8_16_2_wrong_select(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 229   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 230   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 231   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 232   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 233   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 234   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 235   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 236   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 237   %group.size.x.zext = zext i16 %group.size.x to i32
 238   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 239   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 240   %cmp = icmp ult i32 %sub, %group.size.x.zext
 241   %select = select i1 %cmp, i32 %group.size.x.zext, i32 %sub ; arms deliberately swapped
 242   %zext = zext i32 %select to i64
 243   store i64 %zext, i64 addrspace(1)* %out
 244   ret void
 245 }
 246
 247 ; CHECK-LABEL: @use_local_size_x_8_16_2_wrong_grid_load_size(
 248 ; CHECK: %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
 249 ; CHECK: %grid.size.x.zext = zext i16 %grid.size.x to i32
 250 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 251 ; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
 252 ; CHECK: %sub = add i32 %group.id_x_group.size.x.neg, %grid.size.x.zext
     ; Negative variant of @use_local_size_x_8_16_2: the value at byte offset 12
     ; is loaded as i16 and zero-extended instead of being loaded as i32. The
     ; expected output keeps that i16 load, its zext and the subtraction, with
     ; only the group size replaced by the constant 8.
 253 define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 254   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 255   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 256   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 257   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 258   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 259   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i16 addrspace(4)*
 260   %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4 ; deliberately i16, not i32
 261   %grid.size.x.zext = zext i16 %grid.size.x to i32
 262   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 263   %group.size.x.zext = zext i16 %group.size.x to i32
 264   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 265   %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
 266   %cmp = icmp ult i32 %sub, %group.size.x.zext
 267   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 268   %zext = zext i32 %select to i64
 269   store i64 %zext, i64 addrspace(1)* %out
 270   ret void
 271 }
 272
 273 ; CHECK-LABEL: @func_group_size_x(
 274 ; CHECK-NEXT: ret i32 8
     ; Same i16 load at byte offset 4 as @load_group_size_x, but in a function
     ; defined without the amdgpu_kernel calling convention; the value is
     ; zero-extended and returned, and %out is unused. With !0 attached, the
     ; expected output is a direct return of 8.
 275 define i32 @func_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 276   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 277   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 278   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 279   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 280   %zext = zext i16 %group.size.x to i32
 281   ret i32 %zext
 282 }
 283
 284 ; CHECK-LABEL: @__ockl_get_local_size_reqd_size(
 285 ; CHECK: %group.size = phi i32 [ 2, %bb17 ], [ 16, %bb9 ], [ 8, %bb1 ], [ 1, %bb ]
     ; Non-kernel function that switches on %arg (0, 1, 2) to pick a dimension.
     ; Each case block loads a workgroup id, an i32 grid size and an i16 group
     ; size; the default edge from %bb supplies 0, 1 and 0 instead. The merge
     ; block then forms the unsigned minimum of (grid size - id * group size) and
     ; the group size, and returns it as i64. The function uses attribute group
     ; #1, which has no "uniform-work-group-size" entry. Only the group size phi
     ; is checked: its incoming loads become the constants 2, 16 and 8 from !0.
 286 define i64 @__ockl_get_local_size_reqd_size(i32 %arg) #1 !reqd_work_group_size !0 {
 287 bb:
 288   %tmp = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
 289   switch i32 %arg, label %bb25 [
 290     i32 0, label %bb1
 291     i32 1, label %bb9
 292     i32 2, label %bb17
 293   ]
 294
     ; %arg == 0: workgroup.id.x, i32 at byte offset 12, i16 at byte offset 4.
 295 bb1:                                              ; preds = %bb
 296   %tmp2 = tail call i32 @llvm.amdgcn.workgroup.id.x()
 297   %tmp3 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 12
 298   %tmp4 = bitcast i8 addrspace(4)* %tmp3 to i32 addrspace(4)*
 299   %tmp5 = load i32, i32 addrspace(4)* %tmp4, align 4
 300   %tmp6 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 4
 301   %tmp7 = bitcast i8 addrspace(4)* %tmp6 to i16 addrspace(4)*
 302   %tmp8 = load i16, i16 addrspace(4)* %tmp7, align 4
 303   br label %bb25
 304
     ; %arg == 1: workgroup.id.y, i32 at byte offset 16, i16 at byte offset 6.
 305 bb9:                                              ; preds = %bb
 306   %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.y()
 307   %tmp11 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 16
 308   %tmp12 = bitcast i8 addrspace(4)* %tmp11 to i32 addrspace(4)*
 309   %tmp13 = load i32, i32 addrspace(4)* %tmp12, align 8
 310   %tmp14 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 6
 311   %tmp15 = bitcast i8 addrspace(4)* %tmp14 to i16 addrspace(4)*
 312   %tmp16 = load i16, i16 addrspace(4)* %tmp15, align 2
 313   br label %bb25
 314
     ; %arg == 2: workgroup.id.z, i32 at byte offset 20, i16 at byte offset 8.
 315 bb17:                                             ; preds = %bb
 316   %tmp18 = tail call i32 @llvm.amdgcn.workgroup.id.z()
 317   %tmp19 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 20
 318   %tmp20 = bitcast i8 addrspace(4)* %tmp19 to i32 addrspace(4)*
 319   %tmp21 = load i32, i32 addrspace(4)* %tmp20, align 4
 320   %tmp22 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 8
 321   %tmp23 = bitcast i8 addrspace(4)* %tmp22 to i16 addrspace(4)*
 322   %tmp24 = load i16, i16 addrspace(4)* %tmp23, align 8
 323   br label %bb25
 324
     ; Merge: %tmp26 = grid size, %group.size = group size, %tmp28 = workgroup id.
 325 bb25:                                             ; preds = %bb17, %bb9, %bb1, %bb
 326   %tmp26 = phi i32 [ %tmp21, %bb17 ], [ %tmp13, %bb9 ], [ %tmp5, %bb1 ], [ 0, %bb ]
 327   %group.size = phi i16 [ %tmp24, %bb17 ], [ %tmp16, %bb9 ], [ %tmp8, %bb1 ], [ 1, %bb ]
 328   %tmp28 = phi i32 [ %tmp18, %bb17 ], [ %tmp10, %bb9 ], [ %tmp2, %bb1 ], [ 0, %bb ]
 329   %tmp29 = zext i16 %group.size to i32
 330   %tmp30 = mul i32 %tmp28, %tmp29
 331   %tmp31 = sub i32 %tmp26, %tmp30
 332   %tmp32 = icmp ult i32 %tmp31, %tmp29
 333   %tmp33 = select i1 %tmp32, i32 %tmp31, i32 %tmp29
 334   %tmp34 = zext i32 %tmp33 to i64
 335   ret i64 %tmp34
 336 }
 337
 338 ; CHECK-LABEL: @all_local_size(
 339 ; CHECK-NEXT: store volatile i64 8, i64 addrspace(1)* %out, align 4
 340 ; CHECK-NEXT: store volatile i64 16, i64 addrspace(1)* %out, align 4
 341 ; CHECK-NEXT: store volatile i64 2, i64 addrspace(1)* %out, align 4
     ; Runs the local-size pattern (unsigned minimum of grid size - id * group
     ; size and the group size, zero-extended to i64) for x, y and z in one
     ; kernel, all from a single dispatch.ptr call, then writes the three results
     ; to %out with volatile stores. With !0 and attribute group #0 the expected
     ; output is just the three volatile stores of 8, 16 and 2.
     ; %out carries only nocapture: the kernel writes through it, which a
     ; readnone pointer argument would not permit.
 342 define amdgpu_kernel void @all_local_size(i64 addrspace(1)* nocapture %out) #0 !reqd_work_group_size !0 {
 343   %tmp.i = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
     ; x: workgroup.id.x, i32 at byte offset 12, i16 at byte offset 4.
 344   %tmp2.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
 345   %tmp3.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 12
 346   %tmp4.i = bitcast i8 addrspace(4)* %tmp3.i to i32 addrspace(4)*
 347   %tmp5.i = load i32, i32 addrspace(4)* %tmp4.i, align 4
 348   %tmp6.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 4
 349   %tmp7.i = bitcast i8 addrspace(4)* %tmp6.i to i16 addrspace(4)*
 350   %tmp8.i = load i16, i16 addrspace(4)* %tmp7.i, align 4
 351   %tmp29.i = zext i16 %tmp8.i to i32
 352   %tmp30.i = mul i32 %tmp2.i, %tmp29.i
 353   %tmp31.i = sub i32 %tmp5.i, %tmp30.i
 354   %tmp32.i = icmp ult i32 %tmp31.i, %tmp29.i
 355   %tmp33.i = select i1 %tmp32.i, i32 %tmp31.i, i32 %tmp29.i
 356   %tmp34.i = zext i32 %tmp33.i to i64
     ; y: workgroup.id.y, i32 at byte offset 16, i16 at byte offset 6.
 357   %tmp10.i = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
 358   %tmp11.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 16
 359   %tmp12.i = bitcast i8 addrspace(4)* %tmp11.i to i32 addrspace(4)*
 360   %tmp13.i = load i32, i32 addrspace(4)* %tmp12.i, align 8
 361   %tmp14.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 6
 362   %tmp15.i = bitcast i8 addrspace(4)* %tmp14.i to i16 addrspace(4)*
 363   %tmp16.i = load i16, i16 addrspace(4)* %tmp15.i, align 2
 364   %tmp29.i9 = zext i16 %tmp16.i to i32
 365   %tmp30.i10 = mul i32 %tmp10.i, %tmp29.i9
 366   %tmp31.i11 = sub i32 %tmp13.i, %tmp30.i10
 367   %tmp32.i12 = icmp ult i32 %tmp31.i11, %tmp29.i9
 368   %tmp33.i13 = select i1 %tmp32.i12, i32 %tmp31.i11, i32 %tmp29.i9
 369   %tmp34.i14 = zext i32 %tmp33.i13 to i64
     ; z: workgroup.id.z, i32 at byte offset 20, i16 at byte offset 8.
 370   %tmp18.i = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
 371   %tmp19.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 20
 372   %tmp20.i = bitcast i8 addrspace(4)* %tmp19.i to i32 addrspace(4)*
 373   %tmp21.i = load i32, i32 addrspace(4)* %tmp20.i, align 4
 374   %tmp22.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 8
 375   %tmp23.i = bitcast i8 addrspace(4)* %tmp22.i to i16 addrspace(4)*
 376   %tmp24.i = load i16, i16 addrspace(4)* %tmp23.i, align 8
 377   %tmp29.i2 = zext i16 %tmp24.i to i32
 378   %tmp30.i3 = mul i32 %tmp18.i, %tmp29.i2
 379   %tmp31.i4 = sub i32 %tmp21.i, %tmp30.i3
 380   %tmp32.i5 = icmp ult i32 %tmp31.i4, %tmp29.i2
 381   %tmp33.i6 = select i1 %tmp32.i5, i32 %tmp31.i4, i32 %tmp29.i2
 382   %tmp34.i7 = zext i32 %tmp33.i6 to i64
     ; Volatile so that all three stores to the same address stay in the output.
 383   store volatile i64 %tmp34.i, i64 addrspace(1)* %out, align 4
 384   store volatile i64 %tmp34.i14, i64 addrspace(1)* %out, align 4
 385   store volatile i64 %tmp34.i7, i64 addrspace(1)* %out, align 4
 386   ret void
 387 }
 388
 389 ; TODO: Should be able to handle this, but not much reason to.
 390 ; CHECK-LABEL: @partial_load_group_size_x(
 391 ; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 392 ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 393 ; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 4
 394 ; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
     ; Loads only a single byte (i8) at byte offset 4 of the dispatch pointer
     ; rather than the full i16. The expected output keeps the call, the GEP, the
     ; load and the store, so nothing is folded; the only visible change is that
     ; the load alignment is raised from the 1 written below to 4.
 395 define amdgpu_kernel void @partial_load_group_size_x(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 396   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 397   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 398   %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
 399   store i8 %group.size.x.lo, i8 addrspace(1)* %out
 400   ret void
 401 }
 402
 403 ; CHECK-LABEL: @partial_load_group_size_x_explicit_callsite_align(
 404 ; CHECK-NEXT: %dispatch.ptr = tail call align 2 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 405 ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 406 ; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 2
 407 ; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
     ; Same single-byte load as @partial_load_group_size_x, but the dispatch.ptr
     ; call carries an explicit `align 2` return attribute at the call site. The
     ; expected output keeps every instruction and shows the load alignment
     ; raised from 1 to 2, matching the call-site attribute.
 408 define amdgpu_kernel void @partial_load_group_size_x_explicit_callsite_align(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 409   %dispatch.ptr = tail call align 2 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 410   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 411   %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
 412   store i8 %group.size.x.lo, i8 addrspace(1)* %out
 413   ret void
 414 }
 415
 416 ; TODO: Should be able to handle this
 417 ; CHECK-LABEL: @load_group_size_xy_i32(
 418 ; CHECK: %group.size.xy = load i32,
 419 ; CHECK: store i32 %group.size.xy
     ; Loads an i32 at byte offset 4 of the dispatch pointer, which covers the
     ; bytes read separately as i16 at offsets 4 and 6 by @load_group_size_x and
     ; @load_group_size_y. The expected output keeps the i32 load and stores its
     ; result, so this wider access is currently not folded.
 420 define amdgpu_kernel void @load_group_size_xy_i32(i32 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 421   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 422   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 423   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i32 addrspace(4)*
 424   %group.size.xy = load i32, i32 addrspace(4)* %gep.group.size.x.bc, align 4
 425   store i32 %group.size.xy, i32 addrspace(1)* %out
 426   ret void
 427 }
 428
 429 ; CHECK-LABEL: @load_group_size_x_y_multiple_dispatch_ptr(
 430 ; CHECK-NEXT: store volatile i16 8, i16 addrspace(1)* %out, align 2
 431 ; CHECK-NEXT: store volatile i16 16, i16 addrspace(1)* %out, align 2
     ; Calls dispatch.ptr twice and reads a different i16 through each result:
     ; byte offset 4 through the first call, byte offset 6 through the second.
     ; With !0 attached, the expected output is just the two volatile stores of
     ; 8 and 16, so both loads are folded and neither call remains.
 432 define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 433   %dispatch.ptr0 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 434   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr0, i64 4
 435   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 436   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 437   store volatile i16 %group.size.x, i16 addrspace(1)* %out
 438
 439   %dispatch.ptr1 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 440   %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr1, i64 6
 441   %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
     ; align 2: byte offset 6 from the dispatch pointer is only 2-byte aligned,
     ; and claiming more alignment than the address has is undefined behavior.
 442   %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 2
 443   store volatile i16 %group.size.y, i16 addrspace(1)* %out
 444
 445   ret void
 446 }
 447
 448 ; CHECK-LABEL: @use_local_size_x_uniform_work_group_size(
 449 ; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 450 ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 451 ; CHECK-NEXT: %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 452 ; CHECK-NEXT: %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 453 ; CHECK-NEXT: %zext = zext i16 %group.size.x to i64
 454 ; CHECK-NEXT: store i64 %zext, i64 addrspace(1)* %out, align 4
     ; Same instruction sequence as @use_local_size_x_8_16_2, but the kernel has
     ; no !reqd_work_group_size and uses attribute group #2
     ; ("uniform-work-group-size"="true"). The expected output keeps the group
     ; size load (no constant is available) but drops the grid size load, the
     ; workgroup id call, the compare and the select: the loaded i16 is
     ; zero-extended and stored directly.
 455 define amdgpu_kernel void @use_local_size_x_uniform_work_group_size(i64 addrspace(1)* %out) #2 {
 456   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 457   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 458   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 459   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 460   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 461   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 462   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 463   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 464   %group.size.x.zext = zext i16 %group.size.x to i32
 465   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 466   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 467   %cmp = icmp ult i32 %sub, %group.size.x.zext
 468   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 469   %zext = zext i32 %select to i64
 470   store i64 %zext, i64 addrspace(1)* %out
 471   ret void
 472 }
 473
 474 ; CHECK-LABEL: @use_local_size_x_uniform_work_group_size_false(
 475 ; CHECK: icmp ult
 476 ; CHECK: select
     ; Same instruction sequence as @use_local_size_x_uniform_work_group_size,
     ; but with attribute group #3 ("uniform-work-group-size"="false") and no
     ; !reqd_work_group_size. The expected output still contains the unsigned
     ; compare and the select, so the pattern is left in place.
 477 define amdgpu_kernel void @use_local_size_x_uniform_work_group_size_false(i64 addrspace(1)* %out) #3 {
 478   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 479   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 480   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 481   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 482   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 483   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 484   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 485   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 486   %group.size.x.zext = zext i16 %group.size.x to i32
 487   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 488   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 489   %cmp = icmp ult i32 %sub, %group.size.x.zext
 490   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 491   %zext = zext i32 %select to i64
 492   store i64 %zext, i64 addrspace(1)* %out
 493   ret void
 494 }
 495
 496 ; CHECK-LABEL: @no_use_dispatch_ptr(
 497 ; CHECK-NEXT: ret void
     ; Kernel with no attribute group and no metadata whose dispatch.ptr result
     ; is never used. The expected output is a bare `ret void` directly after the
     ; function label, i.e. the unused call is gone.
 498 define amdgpu_kernel void @no_use_dispatch_ptr() {
 499   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 500   ret void
 501 }
 502
 503 declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 ; base pointer that every test above offsets from
 504 declare i32 @llvm.amdgcn.workgroup.id.x() #1
 505 declare i32 @llvm.amdgcn.workgroup.id.y() #1
 506 declare i32 @llvm.amdgcn.workgroup.id.z() #1
 507
     ; Attribute groups. #0 and #2 have identical contents; in this file #0 is
     ; used by the functions that carry !reqd_work_group_size and #2 by
     ; @use_local_size_x_uniform_work_group_size, which has no such metadata.
     ; #1 is used by the intrinsic declarations and by
     ; @__ockl_get_local_size_reqd_size; #3 by the "false" variant.
 508 attributes #0 = { nounwind "uniform-work-group-size"="true" }
 509 attributes #1 = { nounwind readnone speculatable }
 510 attributes #2 = { nounwind "uniform-work-group-size"="true" }
 511 attributes #3 = { nounwind "uniform-work-group-size"="false" }
 512
     ; !reqd_work_group_size operands referenced by the functions above.
 513 !0 = !{i32 8, i32 16, i32 2} ; the three sizes the positive tests expect: 8, 16, 2
 514 !1 = !{i32 8, i32 16} ; only two elements; used by @invalid_reqd_work_group_size
 515 !2 = !{i64 8, i64 16, i64 2} ; same values as !0 with i64 elements
 516 !3 = !{i16 8, i16 16, i16 2} ; same values as !0 with i16 elements