llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll

   1 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
   2
   3 ; Both of these kernels have the same value for
   4 ; amdgpu-flat-work-group-size, except one explicitly sets it. This is
   5 ; a program-visible property which should always take precedence over
   6 ; the amdgpu-waves-per-eu optimization hint.
   7 ;
   8 ; The flat work group size range is incompatible with the
   9 ; amdgpu-waves-per-eu value, so the flat work group size should take
  10 ; precedence, implying a requirement to support workgroups of size
  11 ; 1024 (which exceeds the available LDS amount).
  12
  13 ; CHECK-NOT: @no_flat_workgroup_size.stack
  14 ; CHECK-NOT: @explicit_default_workgroup_size.stack
  15
  16 ; CHECK-LABEL: @no_flat_workgroup_size(
  17 ; CHECK: alloca [5 x i32]
  18 ; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
  19 define amdgpu_kernel void @no_flat_workgroup_size(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 { ; uses attribute group #0 (waves-per-eu only); the test expects %stack to remain an alloca after amdgpu-promote-alloca
  20 entry:
  21   %stack = alloca [5 x i32], align 4, addrspace(5) ; 5 x i32 array in addrspace(5): the alloca under test
  22   %0 = load i32, ptr addrspace(1) %in, align 4 ; in[0], used below as a runtime index into %stack
  23   %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 ; &stack[in[0]]
  24   store i32 4, ptr addrspace(5) %arrayidx1, align 4 ; stack[in[0]] = 4; the test expects this store to still target addrspace(5)
  25   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 ; &in[1]
  26   %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 ; in[1], second runtime index
  27   %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 ; &stack[in[1]]
  28   store i32 5, ptr addrspace(5) %arrayidx3, align 4 ; stack[in[1]] = 5
  29   %2 = load i32, ptr addrspace(5) %stack, align 4 ; read stack[0] through the array base pointer
  30   store i32 %2, ptr addrspace(1) %out, align 4 ; out[0] = stack[0]
  31   %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 ; &stack[1] (constant index)
  32   %3 = load i32, ptr addrspace(5) %arrayidx12 ; read stack[1]
  33   %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 ; &out[1]
  34   store i32 %3, ptr addrspace(1) %arrayidx13 ; out[1] = stack[1]
  35   ret void
  36 }
  37
  38 ; CHECK-LABEL: @explicit_default_workgroup_size(
  39 ; CHECK: alloca [5 x i32]
  40 ; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
  41 define amdgpu_kernel void @explicit_default_workgroup_size(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 { ; same body as @no_flat_workgroup_size, but uses attribute group #1, which also sets amdgpu-flat-work-group-size explicitly; %stack is again expected to remain an alloca
  42 entry:
  43   %stack = alloca [5 x i32], align 4, addrspace(5) ; 5 x i32 array in addrspace(5): the alloca under test
  44   %0 = load i32, ptr addrspace(1) %in, align 4 ; in[0], used below as a runtime index into %stack
  45   %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 ; &stack[in[0]]
  46   store i32 4, ptr addrspace(5) %arrayidx1, align 4 ; stack[in[0]] = 4; the test expects this store to still target addrspace(5)
  47   %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 ; &in[1]
  48   %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 ; in[1], second runtime index
  49   %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 ; &stack[in[1]]
  50   store i32 5, ptr addrspace(5) %arrayidx3, align 4 ; stack[in[1]] = 5
  51   %2 = load i32, ptr addrspace(5) %stack, align 4 ; read stack[0] through the array base pointer
  52   store i32 %2, ptr addrspace(1) %out, align 4 ; out[0] = stack[0]
  53   %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 ; &stack[1] (constant index)
  54   %3 = load i32, ptr addrspace(5) %arrayidx12 ; read stack[1]
  55   %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 ; &out[1]
  56   store i32 %3, ptr addrspace(1) %arrayidx13 ; out[1] = stack[1]
  57   ret void
  58 }
  59
  60 attributes #0 = { "amdgpu-waves-per-eu"="1,1" } ; used by @no_flat_workgroup_size: waves-per-eu hint only, flat work group size left unspecified
  61 attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" } ; used by @explicit_default_workgroup_size: same hint plus an explicit flat work group size range of 1,1024