test/CodeGen/AMDGPU/reqd-work-group-size.ll

   1 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-lower-kernel-attributes -instcombine %s | FileCheck -enable-var-scope %s
   2
   3 ; CHECK-LABEL: @invalid_reqd_work_group_size(
   4 ; CHECK: load i16,
     ; Negative test: !1 lists only two sizes, so the i16 group-size load is
     ; expected to remain in the output instead of being folded.
   5 define amdgpu_kernel void @invalid_reqd_work_group_size(i16 addrspace(1)* %out) #0 !reqd_work_group_size !1 {
   6   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
   7   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
   8   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
   9   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  10   store i16 %group.size.x, i16 addrspace(1)* %out
  11   ret void
  12 }
  13
  14 ; CHECK-LABEL: @volatile_load_group_size_x(
  15 ; CHECK: load volatile i16,
     ; Negative test: the group-size load is volatile, so it is expected to be
     ; kept even though !0 supplies all three sizes.
  16 define amdgpu_kernel void @volatile_load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  17   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  18   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  19   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  20   %group.size.x = load volatile i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  21   store i16 %group.size.x, i16 addrspace(1)* %out
  22   ret void
  23 }
  24
  25 ; CHECK-LABEL: @load_group_size_x(
  26 ; CHECK-NEXT: store i16 8,
     ; The i16 load at byte offset 4 of the dispatch pointer is expected to fold
     ; to 8, the first operand of !0.
  27 define amdgpu_kernel void @load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  28   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  29   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  30   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  31   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  32   store i16 %group.size.x, i16 addrspace(1)* %out
  33   ret void
  34 }
  35
  36 ; CHECK-LABEL: @load_group_size_y(
  37 ; CHECK-NEXT: store i16 16,
     ; The i16 load at byte offset 6 of the dispatch pointer is expected to fold
     ; to 16, the second operand of !0.
  38 define amdgpu_kernel void @load_group_size_y(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  39   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  40   %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
  41   %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
  42   %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
  43   store i16 %group.size.y, i16 addrspace(1)* %out
  44   ret void
  45 }
  46
  47 ; CHECK-LABEL: @load_group_size_z(
  48 ; CHECK-NEXT: store i16 2,
     ; The i16 load at byte offset 8 of the dispatch pointer is expected to fold
     ; to 2, the third operand of !0.
  49 define amdgpu_kernel void @load_group_size_z(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  50   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  51   %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
  52   %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
  53   %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
  54   store i16 %group.size.z, i16 addrspace(1)* %out
  55   ret void
  56 }
  57
  58 ; Metadata (!2) uses i64 operands instead of i32; the x load is still expected to fold to 8.
  59 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i64(
  60 ; CHECK-NEXT: store i16 8,
  61 define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i64(i16 addrspace(1)* %out) #0 !reqd_work_group_size !2 {
  62   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  63   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  64   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  65   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  66   store i16 %group.size.x, i16 addrspace(1)* %out
  67   ret void
  68 }
  69
  70 ; Metadata (!3) uses i16 operands instead of i32; the x load is still expected to fold to 8.
  71 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i16(
  72 ; CHECK-NEXT: store i16 8,
  73 define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i16(i16 addrspace(1)* %out) #0 !reqd_work_group_size !3 {
  74   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  75   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  76   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  77   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  78   store i16 %group.size.x, i16 addrspace(1)* %out
  79   ret void
  80 }
  81
  82 ; CHECK-LABEL: @use_local_size_x_8_16_2(
  83 ; CHECK-NEXT: store i64 8,
     ; Computes the unsigned minimum of (grid.size.x - group.id.x * group.size.x)
     ; and group.size.x via icmp ult + select. With !0 present the whole
     ; expression is expected to collapse to the constant 8.
  84 define amdgpu_kernel void @use_local_size_x_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
  85   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  86   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
  87   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
  88   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
  89   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
  90   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
  91   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
  92   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  93   %group.size.x.zext = zext i16 %group.size.x to i32
  94   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
  95   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
  96   %cmp = icmp ult i32 %sub, %group.size.x.zext
  97   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
  98   %zext = zext i32 %select to i64
  99   store i64 %zext, i64 addrspace(1)* %out
 100   ret void
 101 }
 102
 103 ; CHECK-LABEL: @use_local_size_y_8_16_2(
 104 ; CHECK-NEXT: store i64 16,
     ; y-dimension variant of the local-size computation: group size at byte
     ; offset 6, grid size at byte offset 16, workgroup.id.y. Expected to
     ; collapse to the constant 16.
 105 define amdgpu_kernel void @use_local_size_y_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 106   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 107   %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
 108   %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
 109   %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
 110   %gep.grid.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
 111   %gep.grid.size.y.bc = bitcast i8 addrspace(4)* %gep.grid.size.y to i32 addrspace(4)*
 112   %grid.size.y = load i32, i32 addrspace(4)* %gep.grid.size.y.bc, align 4
 113   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
 114   %group.size.y.zext = zext i16 %group.size.y to i32
 115   %group.id_x_group.size.y = mul i32 %group.id, %group.size.y.zext
 116   %sub = sub i32 %grid.size.y, %group.id_x_group.size.y
 117   %cmp = icmp ult i32 %sub, %group.size.y.zext
 118   %select = select i1 %cmp, i32 %sub, i32 %group.size.y.zext
 119   %zext = zext i32 %select to i64
 120   store i64 %zext, i64 addrspace(1)* %out
 121   ret void
 122 }
 123
 124 ; CHECK-LABEL: @use_local_size_z_8_16_2(
 125 ; CHECK-NEXT: store i64 2,
     ; z-dimension variant of the local-size computation: group size at byte
     ; offset 8, grid size at byte offset 20, workgroup.id.z. Expected to
     ; collapse to the constant 2.
 126 define amdgpu_kernel void @use_local_size_z_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 127   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 128   %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
 129   %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
 130   %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
 131   %gep.grid.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 20
 132   %gep.grid.size.z.bc = bitcast i8 addrspace(4)* %gep.grid.size.z to i32 addrspace(4)*
 133   %grid.size.z = load i32, i32 addrspace(4)* %gep.grid.size.z.bc, align 4
 134   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
 135   %group.size.z.zext = zext i16 %group.size.z to i32
 136   %group.id_x_group.size.z = mul i32 %group.id, %group.size.z.zext
 137   %sub = sub i32 %grid.size.z, %group.id_x_group.size.z
 138   %cmp = icmp ult i32 %sub, %group.size.z.zext
 139   %select = select i1 %cmp, i32 %sub, i32 %group.size.z.zext
 140   %zext = zext i32 %select to i64
 141   store i64 %zext, i64 addrspace(1)* %out
 142   ret void
 143 }
 144
 145 ; Simplification on select is invalid, but we can still eliminate the
 146 ; load of the group size.
 147
 148 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_group_id(
 149 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
 150 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
     ; The multiply by the folded group size 8 is expected to appear as a
     ; shift left by 3.
 151 define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 152   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 153   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 154   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 155   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 156   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 157   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 158   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
       ; Mismatch under test: the y workgroup id is combined with x sizes.
 159   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
 160   %group.size.x.zext = zext i16 %group.size.x to i32
 161   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 162   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 163   %cmp = icmp ult i32 %sub, %group.size.x.zext
 164   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 165   %zext = zext i32 %select to i64
 166   store i64 %zext, i64 addrspace(1)* %out
 167   ret void
 168 }
 169
 170 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_grid_size(
 171 ; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 172 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 173 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
 174 ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
     ; Negative test for the select fold: the grid-size load uses byte offset 16
     ; (the offset @use_local_size_y_8_16_2 pairs with workgroup.id.y) instead
     ; of 12. The grid load and the subtraction are expected to remain; only the
     ; group size is replaced by 8 (mul becomes shl by 3).
 175 define amdgpu_kernel void @local_size_x_8_16_2_wrong_grid_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 176   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 177   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 178   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 179   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
       ; Mismatch under test: offset 16 rather than 12.
 180   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
 181   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 182   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 183   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 184   %group.size.x.zext = zext i16 %group.size.x to i32
 185   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 186   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 187   %cmp = icmp ult i32 %sub, %group.size.x.zext
 188   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 189   %zext = zext i32 %select to i64
 190   store i64 %zext, i64 addrspace(1)* %out
 191   ret void
 192 }
 193
 194 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_cmp_type(
 195 ; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 196 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 197 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
 198 ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 199 ; CHECK: %cmp = icmp slt i32 %sub, 8
 200 ; CHECK: %select = select i1 %cmp, i32 %sub, i32 8
     ; Negative test for the select fold: the compare is signed (slt) where the
     ; foldable tests use ult. The compare and select are expected to remain,
     ; with the group size replaced by the constant 8.
 201 define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 202   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 203   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 204   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 205   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 206   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 207   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 208   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 209   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 210   %group.size.x.zext = zext i16 %group.size.x to i32
 211   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 212   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
       ; Mismatch under test: signed predicate.
 213   %cmp = icmp slt i32 %sub, %group.size.x.zext
 214   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 215   %zext = zext i32 %select to i64
 216   store i64 %zext, i64 addrspace(1)* %out
 217   ret void
 218 }
 219
 220 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_select(
 221 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
 222 ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 223 ; CHECK: %1 = icmp ugt i32 %sub, 8
 224 ; CHECK: %select = select i1 %1, i32 %sub, i32 8
 225 ; CHECK: %zext = zext i32 %select to i64
     ; Negative test for the select fold: the select's true/false operands are
     ; swapped relative to the foldable tests. A compare and select are expected
     ; to remain, with the group size replaced by the constant 8.
 226 define amdgpu_kernel void @local_size_x_8_16_2_wrong_select(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 227   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 228   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 229   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 230   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 231   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 232   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 233   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 234   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 235   %group.size.x.zext = zext i16 %group.size.x to i32
 236   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 237   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 238   %cmp = icmp ult i32 %sub, %group.size.x.zext
       ; Mismatch under test: operands are (group size, remainder), not (remainder, group size).
 239   %select = select i1 %cmp, i32 %group.size.x.zext, i32 %sub
 240   %zext = zext i32 %select to i64
 241   store i64 %zext, i64 addrspace(1)* %out
 242   ret void
 243 }
 244
 245 ; CHECK-LABEL: @use_local_size_x_8_16_2_wrong_grid_load_size(
 246 ; CHECK: %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
 247 ; CHECK: %grid.size.x.zext = zext i16 %grid.size.x to i32
 248 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 249 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
 250 ; CHECK: %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
     ; Negative test for the select fold: the grid size is loaded as i16 and
     ; zero-extended instead of being loaded as i32. The grid load and the
     ; subtraction are expected to remain; only the group size is replaced by 8
     ; (mul becomes shl by 3).
 251 define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 252   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 253   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 254   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 255   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 256   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 257   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i16 addrspace(4)*
 258   %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
 259   %grid.size.x.zext = zext i16 %grid.size.x to i32
 260   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 261   %group.size.x.zext = zext i16 %group.size.x to i32
 262   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 263   %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
 264   %cmp = icmp ult i32 %sub, %group.size.x.zext
 265   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 266   %zext = zext i32 %select to i64
 267   store i64 %zext, i64 addrspace(1)* %out
 268   ret void
 269 }
 270
 271 ; CHECK-LABEL: @func_group_size_x(
 272 ; CHECK-NEXT: ret i32 8
     ; Same x-size load, but in a function defined without the amdgpu_kernel
     ; calling convention that still carries !reqd_work_group_size. The load is
     ; expected to fold, leaving a constant return of 8. %out is unused.
 273 define i32 @func_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 274   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 275   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 276   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 277   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 278   %zext = zext i16 %group.size.x to i32
 279   ret i32 %zext
 280 }
 281
 282 ; CHECK-LABEL: @__ockl_get_local_size_reqd_size(
 283 ; CHECK: %group.size = phi i32 [ 2, %bb17 ], [ 16, %bb9 ], [ 8, %bb1 ], [ 1, %bb ]
     ; Local-size computation with the dimension chosen at run time by %arg.
     ; Each switch arm loads the workgroup id, grid size and group size for one
     ; dimension; the values meet in phis in %bb25. The three group-size loads
     ; are expected to fold, so the %group.size phi carries the constants
     ; 2 / 16 / 8 plus the default 1 for any other %arg.
 284 define i64 @__ockl_get_local_size_reqd_size(i32 %arg) #1 !reqd_work_group_size !0 {
 285 bb:
 286   %tmp = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
 287   switch i32 %arg, label %bb25 [
 288     i32 0, label %bb1
 289     i32 1, label %bb9
 290     i32 2, label %bb17
 291   ]
 292
       ; Dimension 0: workgroup.id.x, grid size at offset 12, group size at offset 4.
 293 bb1:                                              ; preds = %bb
 294   %tmp2 = tail call i32 @llvm.amdgcn.workgroup.id.x()
 295   %tmp3 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 12
 296   %tmp4 = bitcast i8 addrspace(4)* %tmp3 to i32 addrspace(4)*
 297   %tmp5 = load i32, i32 addrspace(4)* %tmp4, align 4
 298   %tmp6 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 4
 299   %tmp7 = bitcast i8 addrspace(4)* %tmp6 to i16 addrspace(4)*
 300   %tmp8 = load i16, i16 addrspace(4)* %tmp7, align 4
 301   br label %bb25
 302
       ; Dimension 1: workgroup.id.y, grid size at offset 16, group size at offset 6.
 303 bb9:                                              ; preds = %bb
 304   %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.y()
 305   %tmp11 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 16
 306   %tmp12 = bitcast i8 addrspace(4)* %tmp11 to i32 addrspace(4)*
 307   %tmp13 = load i32, i32 addrspace(4)* %tmp12, align 8
 308   %tmp14 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 6
 309   %tmp15 = bitcast i8 addrspace(4)* %tmp14 to i16 addrspace(4)*
 310   %tmp16 = load i16, i16 addrspace(4)* %tmp15, align 2
 311   br label %bb25
 312
       ; Dimension 2: workgroup.id.z, grid size at offset 20, group size at offset 8.
 313 bb17:                                             ; preds = %bb
 314   %tmp18 = tail call i32 @llvm.amdgcn.workgroup.id.z()
 315   %tmp19 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 20
 316   %tmp20 = bitcast i8 addrspace(4)* %tmp19 to i32 addrspace(4)*
 317   %tmp21 = load i32, i32 addrspace(4)* %tmp20, align 4
 318   %tmp22 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 8
 319   %tmp23 = bitcast i8 addrspace(4)* %tmp22 to i16 addrspace(4)*
 320   %tmp24 = load i16, i16 addrspace(4)* %tmp23, align 8
 321   br label %bb25
 322
       ; Join point: unsigned minimum of (grid size - id * group size) and the
       ; group size. The default edge from %bb supplies 0 / 1 / 0.
 323 bb25:                                             ; preds = %bb17, %bb9, %bb1, %bb
 324   %tmp26 = phi i32 [ %tmp21, %bb17 ], [ %tmp13, %bb9 ], [ %tmp5, %bb1 ], [ 0, %bb ]
 325   %group.size = phi i16 [ %tmp24, %bb17 ], [ %tmp16, %bb9 ], [ %tmp8, %bb1 ], [ 1, %bb ]
 326   %tmp28 = phi i32 [ %tmp18, %bb17 ], [ %tmp10, %bb9 ], [ %tmp2, %bb1 ], [ 0, %bb ]
 327   %tmp29 = zext i16 %group.size to i32
 328   %tmp30 = mul i32 %tmp28, %tmp29
 329   %tmp31 = sub i32 %tmp26, %tmp30
 330   %tmp32 = icmp ult i32 %tmp31, %tmp29
 331   %tmp33 = select i1 %tmp32, i32 %tmp31, i32 %tmp29
 332   %tmp34 = zext i32 %tmp33 to i64
 333   ret i64 %tmp34
 334 }
 335
 336 ; CHECK-LABEL: @all_local_size(
 337 ; CHECK-NEXT: store volatile i64 8, i64 addrspace(1)* %out, align 4
 338 ; CHECK-NEXT: store volatile i64 16, i64 addrspace(1)* %out, align 4
 339 ; CHECK-NEXT: store volatile i64 2, i64 addrspace(1)* %out, align 4
     ; The x, y and z local-size computations all appear in one kernel and share
     ; a single dispatch pointer. Each is expected to collapse to its constant
     ; from !0, leaving only the three volatile stores.
 340 define amdgpu_kernel void @all_local_size(i64 addrspace(1)* nocapture readnone %out) #0 !reqd_work_group_size !0 {
 341   %tmp.i = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
       ; x dimension: grid size at offset 12, group size at offset 4.
 342   %tmp2.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
 343   %tmp3.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 12
 344   %tmp4.i = bitcast i8 addrspace(4)* %tmp3.i to i32 addrspace(4)*
 345   %tmp5.i = load i32, i32 addrspace(4)* %tmp4.i, align 4
 346   %tmp6.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 4
 347   %tmp7.i = bitcast i8 addrspace(4)* %tmp6.i to i16 addrspace(4)*
 348   %tmp8.i = load i16, i16 addrspace(4)* %tmp7.i, align 4
 349   %tmp29.i = zext i16 %tmp8.i to i32
 350   %tmp30.i = mul i32 %tmp2.i, %tmp29.i
 351   %tmp31.i = sub i32 %tmp5.i, %tmp30.i
 352   %tmp32.i = icmp ult i32 %tmp31.i, %tmp29.i
 353   %tmp33.i = select i1 %tmp32.i, i32 %tmp31.i, i32 %tmp29.i
 354   %tmp34.i = zext i32 %tmp33.i to i64
       ; y dimension: grid size at offset 16, group size at offset 6.
 355   %tmp10.i = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
 356   %tmp11.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 16
 357   %tmp12.i = bitcast i8 addrspace(4)* %tmp11.i to i32 addrspace(4)*
 358   %tmp13.i = load i32, i32 addrspace(4)* %tmp12.i, align 8
 359   %tmp14.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 6
 360   %tmp15.i = bitcast i8 addrspace(4)* %tmp14.i to i16 addrspace(4)*
 361   %tmp16.i = load i16, i16 addrspace(4)* %tmp15.i, align 2
 362   %tmp29.i9 = zext i16 %tmp16.i to i32
 363   %tmp30.i10 = mul i32 %tmp10.i, %tmp29.i9
 364   %tmp31.i11 = sub i32 %tmp13.i, %tmp30.i10
 365   %tmp32.i12 = icmp ult i32 %tmp31.i11, %tmp29.i9
 366   %tmp33.i13 = select i1 %tmp32.i12, i32 %tmp31.i11, i32 %tmp29.i9
 367   %tmp34.i14 = zext i32 %tmp33.i13 to i64
       ; z dimension: grid size at offset 20, group size at offset 8.
 368   %tmp18.i = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
 369   %tmp19.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 20
 370   %tmp20.i = bitcast i8 addrspace(4)* %tmp19.i to i32 addrspace(4)*
 371   %tmp21.i = load i32, i32 addrspace(4)* %tmp20.i, align 4
 372   %tmp22.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 8
 373   %tmp23.i = bitcast i8 addrspace(4)* %tmp22.i to i16 addrspace(4)*
 374   %tmp24.i = load i16, i16 addrspace(4)* %tmp23.i, align 8
 375   %tmp29.i2 = zext i16 %tmp24.i to i32
 376   %tmp30.i3 = mul i32 %tmp18.i, %tmp29.i2
 377   %tmp31.i4 = sub i32 %tmp21.i, %tmp30.i3
 378   %tmp32.i5 = icmp ult i32 %tmp31.i4, %tmp29.i2
 379   %tmp33.i6 = select i1 %tmp32.i5, i32 %tmp31.i4, i32 %tmp29.i2
 380   %tmp34.i7 = zext i32 %tmp33.i6 to i64
       ; Volatile stores keep all three results observable in x, y, z order.
 381   store volatile i64 %tmp34.i, i64 addrspace(1)* %out, align 4
 382   store volatile i64 %tmp34.i14, i64 addrspace(1)* %out, align 4
 383   store volatile i64 %tmp34.i7, i64 addrspace(1)* %out, align 4
 384   ret void
 385 }
 386
 387 ; TODO: Should be able to handle this, but not much reason to.
 388 ; CHECK-LABEL: @partial_load_group_size_x(
 389 ; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 390 ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 391 ; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
 392 ; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
     ; Negative test: only a single byte is loaded at offset 4 rather than the
     ; i16 the foldable tests load, so the function is expected to come through
     ; unchanged.
 393 define amdgpu_kernel void @partial_load_group_size_x(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 394   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 395   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 396   %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
 397   store i8 %group.size.x.lo, i8 addrspace(1)* %out
 398   ret void
 399 }
 400
 401 ; TODO: Should be able to handle this
 402 ; CHECK-LABEL: @load_group_size_xy_i32(
 403 ; CHECK: %group.size.xy = load i32,
 404 ; CHECK: store i32 %group.size.xy
     ; Negative test: one i32 load at offset 4 spans both the x (offset 4) and
     ; y (offset 6) i16 sizes. The load and the store of its result are expected
     ; to remain.
 405 define amdgpu_kernel void @load_group_size_xy_i32(i32 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
 406   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 407   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 408   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i32 addrspace(4)*
 409   %group.size.xy = load i32, i32 addrspace(4)* %gep.group.size.x.bc, align 4
 410   store i32 %group.size.xy, i32 addrspace(1)* %out
 411   ret void
 412 }
 413
 414 ; CHECK-LABEL: @load_group_size_x_y_multiple_dispatch_ptr(
 415 ; CHECK-NEXT: store volatile i16 8, i16 addrspace(1)* %out, align 2
 416 ; CHECK-NEXT: store volatile i16 16, i16 addrspace(1)* %out, align 2
     ; The kernel calls the dispatch-pointer intrinsic twice. Loads reached
     ; through either call are expected to fold (x to 8, y to 16).
 417 define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
       ; First dispatch pointer: x size at offset 4.
 418   %dispatch.ptr0 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 419   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr0, i64 4
 420   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 421   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 422   store volatile i16 %group.size.x, i16 addrspace(1)* %out
 423
       ; Second, independent dispatch pointer: y size at offset 6.
 424   %dispatch.ptr1 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 425   %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr1, i64 6
 426   %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
 427   %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
 428   store volatile i16 %group.size.y, i16 addrspace(1)* %out
 429
 430   ret void
 431 }
 432
 433 ; CHECK-LABEL: @use_local_size_x_uniform_work_group_size(
 434 ; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 435 ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 436 ; CHECK-NEXT: %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 437 ; CHECK-NEXT: %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 438 ; CHECK-NEXT: %zext = zext i16 %group.size.x to i64
 439 ; CHECK-NEXT: store i64 %zext, i64 addrspace(1)* %out, align 4
     ; No !reqd_work_group_size here, only "uniform-work-group-size"="true" (#2).
     ; The group-size load is expected to stay (no constant is known), while the
     ; grid-size load, multiply, subtract, compare and select are expected to
     ; disappear so that the loaded group size is stored directly.
 440 define amdgpu_kernel void @use_local_size_x_uniform_work_group_size(i64 addrspace(1)* %out) #2 {
 441   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 442   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 443   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 444   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 445   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 446   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 447   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 448   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 449   %group.size.x.zext = zext i16 %group.size.x to i32
 450   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 451   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 452   %cmp = icmp ult i32 %sub, %group.size.x.zext
 453   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 454   %zext = zext i32 %select to i64
 455   store i64 %zext, i64 addrspace(1)* %out
 456   ret void
 457 }
 458
 459 ; CHECK-LABEL: @use_local_size_x_uniform_work_group_size_false(
 460 ; CHECK: icmp ult
 461 ; CHECK: select
     ; Negative counterpart of @use_local_size_x_uniform_work_group_size: #3 sets
     ; "uniform-work-group-size"="false", so the compare and select are expected
     ; to remain in the output.
 462 define amdgpu_kernel void @use_local_size_x_uniform_work_group_size_false(i64 addrspace(1)* %out) #3 {
 463   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 464   %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
 465   %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
 466   %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
 467   %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
 468   %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
 469   %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
 470   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 471   %group.size.x.zext = zext i16 %group.size.x to i32
 472   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
 473   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 474   %cmp = icmp ult i32 %sub, %group.size.x.zext
 475   %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
 476   %zext = zext i32 %select to i64
 477   store i64 %zext, i64 addrspace(1)* %out
 478   ret void
 479 }
 480
 481 ; CHECK-LABEL: @no_use_dispatch_ptr(
 482 ; CHECK-NEXT: ret void
     ; Edge case: the dispatch pointer is requested but has no users, and the
     ; kernel has neither attributes nor size metadata. The output is expected
     ; to contain only the return.
 483 define amdgpu_kernel void @no_use_dispatch_ptr() {
 484   %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
 485   ret void
 486 }
 487
 488 declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
 489 declare i32 @llvm.amdgcn.workgroup.id.x() #1
 490 declare i32 @llvm.amdgcn.workgroup.id.y() #1
 491 declare i32 @llvm.amdgcn.workgroup.id.z() #1
 492
     ; Attribute groups. #0 and #2 are textually identical; #3 differs from them
     ; only in setting "uniform-work-group-size" to "false". #1 is the group
     ; attached to the intrinsic declarations above.
 493 attributes #0 = { nounwind "uniform-work-group-size"="true" }
 494 attributes #1 = { nounwind readnone speculatable }
 495 attributes #2 = { nounwind "uniform-work-group-size"="true" }
 496 attributes #3 = { nounwind "uniform-work-group-size"="false" }
 497
     ; Size tuples referenced through !reqd_work_group_size:
     ;   !0 - three i32 operands (8, 16, 2); used by most tests
     ;   !1 - only two operands; used by @invalid_reqd_work_group_size
     ;   !2 - same values as !0 with i64 operands
     ;   !3 - same values as !0 with i16 operands
 498 !0 = !{i32 8, i32 16, i32 2}
 499 !1 = !{i32 8, i32 16}
 500 !2 = !{i64 8, i64 16, i64 2}
 501 !3 = !{i16 8, i16 16, i16 2}