llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll

   1 ; RUN: llc -march=amdgcn -mcpu=gfx802  -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
   2 ; RUN: llc -march=amdgcn -mcpu=gfx900  -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
   3 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
   4
   5 ; GCN-LABEL: barrier_vmcnt_global:
   6 ; GFX8:         flat_load_dword
   7 ; GFX9_10:      global_load_dword
   8 ; GFX8:         s_waitcnt vmcnt(0){{$}}
   9 ; GFX9_10:      s_waitcnt vmcnt(0){{$}}
  10 ; GCN-NEXT:     s_barrier
  11 define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) {
  12 entry:  ; addrspace(1) load, then barrier, then store of the loaded value
  13   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  14   %tid.ext = zext i32 %tid to i64
  15   %tid.hi = shl nuw nsw i64 %tid.ext, 32  ; tid << 32
  16   %load.ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid.ext
  17   %loaded = load i32, i32 addrspace(1)* %load.ptr, align 4  ; arg[tid], issued ahead of the barrier
  18   fence syncscope("singlethread") release
  19   tail call void @llvm.amdgcn.s.barrier()
  20   fence syncscope("singlethread") acquire
  21   %next.hi = add nuw nsw i64 %tid.hi, 4294967296  ; + (1 << 32)
  22   %next.idx = lshr exact i64 %next.hi, 32  ; tid + 1
  23   %store.ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %next.idx
  24   store i32 %loaded, i32 addrspace(1)* %store.ptr, align 4  ; arg[tid + 1] = loaded value
  25   ret void
  26 }
  27
  28 ; GCN-LABEL: barrier_vscnt_global:
  29 ; GFX8:       flat_store_dword
  30 ; GFX9_10:    global_store_dword
  31 ; GFX8:       s_waitcnt vmcnt(0){{$}}
  32 ; GFX9:       s_waitcnt vmcnt(0){{$}}
  33 ; GFX10:      s_waitcnt_vscnt null, 0x0
  34 ; GCN-NEXT:   s_barrier
  35 define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) {
  36 bb:  ; addrspace(1) store, then barrier, then a second store; this kernel has no load
  37   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  38   %tmp1 = zext i32 %tmp to i64
  39   %tmp2 = shl nuw nsw i64 %tmp1, 32  ; tid << 32
  40   %tmp3 = add nuw nsw i64 %tmp2, 8589934592  ; + (2 << 32)
  41   %tmp4 = lshr exact i64 %tmp3, 32  ; tid + 2
  42   %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
  43   store i32 0, i32 addrspace(1)* %tmp5, align 4  ; arg[tid + 2] = 0, issued ahead of the barrier
  44   fence syncscope("singlethread") release
  45   tail call void @llvm.amdgcn.s.barrier()
  46   fence syncscope("singlethread") acquire
  47   %tmp6 = add nuw nsw i64 %tmp2, 4294967296  ; + (1 << 32)
  48   %tmp7 = lshr exact i64 %tmp6, 32  ; tid + 1
  49   %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp7
  50   store i32 1, i32 addrspace(1)* %tmp8, align 4  ; arg[tid + 1] = 1, after the barrier
  51   ret void
  52 }
  53
  54 ; GCN-LABEL: barrier_vmcnt_vscnt_global:
  55 ; GFX8:         flat_load_dword
  56 ; GFX9_10:      global_load_dword
  57 ; GFX8:         s_waitcnt vmcnt(0){{$}}
  58 ; GFX9_10:      s_waitcnt vmcnt(0){{$}}
  59 ; GFX10:        s_waitcnt_vscnt null, 0x0
  60 ; GCN-NEXT:     s_barrier
  61 define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) {
  62 entry:  ; addrspace(1) store and load, then barrier, then store of the loaded value
  63   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  64   %tid.ext = zext i32 %tid to i64
  65   %tid.hi = shl nuw nsw i64 %tid.ext, 32  ; tid << 32
  66   %plus2.hi = add nuw nsw i64 %tid.hi, 8589934592  ; + (2 << 32)
  67   %plus2.idx = lshr exact i64 %plus2.hi, 32  ; tid + 2
  68   %zero.ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %plus2.idx
  69   store i32 0, i32 addrspace(1)* %zero.ptr, align 4  ; arg[tid + 2] = 0, issued ahead of the barrier
  70   %load.ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid.ext
  71   %loaded = load i32, i32 addrspace(1)* %load.ptr, align 4  ; arg[tid], issued ahead of the barrier
  72   fence syncscope("singlethread") release
  73   tail call void @llvm.amdgcn.s.barrier()
  74   fence syncscope("singlethread") acquire
  75   %plus1.hi = add nuw nsw i64 %tid.hi, 4294967296  ; + (1 << 32)
  76   %plus1.idx = lshr exact i64 %plus1.hi, 32  ; tid + 1
  77   %out.ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %plus1.idx
  78   store i32 %loaded, i32 addrspace(1)* %out.ptr, align 4  ; arg[tid + 1] = loaded value
  79   ret void
  80 }
  81
  82 ; GCN-LABEL: barrier_vmcnt_flat:
  83 ; GCN:      flat_load_dword
  84 ; GCN:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
  85 ; GCN-NEXT: s_barrier
  86 define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) {
  87 entry:  ; same shape as the addrspace(1) load/barrier/store kernel, but through an unqualified i32* pointer
  88   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  89   %tid.ext = zext i32 %tid to i64
  90   %tid.hi = shl nuw nsw i64 %tid.ext, 32  ; tid << 32
  91   %load.ptr = getelementptr inbounds i32, i32* %arg, i64 %tid.ext
  92   %loaded = load i32, i32* %load.ptr, align 4  ; arg[tid], issued ahead of the barrier
  93   fence syncscope("singlethread") release
  94   tail call void @llvm.amdgcn.s.barrier()
  95   fence syncscope("singlethread") acquire
  96   %next.hi = add nuw nsw i64 %tid.hi, 4294967296  ; + (1 << 32)
  97   %next.idx = lshr exact i64 %next.hi, 32  ; tid + 1
  98   %store.ptr = getelementptr inbounds i32, i32* %arg, i64 %next.idx
  99   store i32 %loaded, i32* %store.ptr, align 4  ; arg[tid + 1] = loaded value
 100   ret void
 101 }
 102
 103 ; GCN-LABEL: barrier_vscnt_flat:
 104 ; GCN:         flat_store_dword
 105 ; GFX8_9:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 106 ; GFX10:       s_waitcnt lgkmcnt(0){{$}}
 107 ; GFX10:       s_waitcnt_vscnt null, 0x0
 108 ; GCN-NEXT:    s_barrier
 109 define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) {
 110 bb:  ; store through an unqualified i32* pointer, then barrier, then a second store; this kernel has no load
 111   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
 112   %tmp1 = zext i32 %tmp to i64
 113   %tmp2 = shl nuw nsw i64 %tmp1, 32  ; tid << 32
 114   %tmp3 = add nuw nsw i64 %tmp2, 8589934592  ; + (2 << 32)
 115   %tmp4 = lshr exact i64 %tmp3, 32  ; tid + 2
 116   %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
 117   store i32 0, i32* %tmp5, align 4  ; arg[tid + 2] = 0, issued ahead of the barrier
 118   fence syncscope("singlethread") release
 119   tail call void @llvm.amdgcn.s.barrier()
 120   fence syncscope("singlethread") acquire
 121   %tmp6 = add nuw nsw i64 %tmp2, 4294967296  ; + (1 << 32)
 122   %tmp7 = lshr exact i64 %tmp6, 32  ; tid + 1
 123   %tmp8 = getelementptr inbounds i32, i32* %arg, i64 %tmp7
 124   store i32 1, i32* %tmp8, align 4  ; arg[tid + 1] = 1, after the barrier
 125   ret void
 126 }
 127
 128 ; GCN-LABEL: barrier_vmcnt_vscnt_flat:
 129 ; GCN:        flat_load_dword
 130 ; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 131 ; GFX10:      s_waitcnt_vscnt null, 0x0
 132 ; GCN-NEXT:   s_barrier
 133 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) {
 134 entry:  ; store and load through an unqualified i32* pointer, then barrier, then store of the loaded value
 135   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
 136   %tid.ext = zext i32 %tid to i64
 137   %tid.hi = shl nuw nsw i64 %tid.ext, 32  ; tid << 32
 138   %plus2.hi = add nuw nsw i64 %tid.hi, 8589934592  ; + (2 << 32)
 139   %plus2.idx = lshr exact i64 %plus2.hi, 32  ; tid + 2
 140   %zero.ptr = getelementptr inbounds i32, i32* %arg, i64 %plus2.idx
 141   store i32 0, i32* %zero.ptr, align 4  ; arg[tid + 2] = 0, issued ahead of the barrier
 142   %load.ptr = getelementptr inbounds i32, i32* %arg, i64 %tid.ext
 143   %loaded = load i32, i32* %load.ptr, align 4  ; arg[tid], issued ahead of the barrier
 144   fence syncscope("singlethread") release
 145   tail call void @llvm.amdgcn.s.barrier()
 146   fence syncscope("singlethread") acquire
 147   %plus1.hi = add nuw nsw i64 %tid.hi, 4294967296  ; + (1 << 32)
 148   %plus1.idx = lshr exact i64 %plus1.hi, 32  ; tid + 1
 149   %out.ptr = getelementptr inbounds i32, i32* %arg, i64 %plus1.idx
 150   store i32 %loaded, i32* %out.ptr, align 4  ; arg[tid + 1] = loaded value
 151   ret void
 152 }
 153
 154 ; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
 155 ; GCN:        flat_load_dword
 156 ; GFX8_9:     s_waitcnt lgkmcnt(0){{$}}
 157 ; GFX8_9:     s_waitcnt vmcnt(0){{$}}
 158 ; GFX10:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 159 ; GFX10:      s_waitcnt_vscnt null, 0x0
 160 ; GCN-NEXT:   s_barrier
 161 define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
 162 entry:  ; store and load through i32*, then barrier, then store; the fences here use syncscope("workgroup")
 163   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
 164   %tid.ext = zext i32 %tid to i64
 165   %tid.hi = shl nuw nsw i64 %tid.ext, 32  ; tid << 32
 166   %plus2.hi = add nuw nsw i64 %tid.hi, 8589934592  ; + (2 << 32)
 167   %plus2.idx = lshr exact i64 %plus2.hi, 32  ; tid + 2
 168   %zero.ptr = getelementptr inbounds i32, i32* %arg, i64 %plus2.idx
 169   store i32 0, i32* %zero.ptr, align 4  ; arg[tid + 2] = 0, issued ahead of the barrier
 170   %load.ptr = getelementptr inbounds i32, i32* %arg, i64 %tid.ext
 171   %loaded = load i32, i32* %load.ptr, align 4  ; arg[tid], issued ahead of the barrier
 172   fence syncscope("workgroup") release
 173   tail call void @llvm.amdgcn.s.barrier()
 174   fence syncscope("workgroup") acquire
 175   %plus1.hi = add nuw nsw i64 %tid.hi, 4294967296  ; + (1 << 32)
 176   %plus1.idx = lshr exact i64 %plus1.hi, 32  ; tid + 1
 177   %out.ptr = getelementptr inbounds i32, i32* %arg, i64 %plus1.idx
 178   store i32 %loaded, i32* %out.ptr, align 4  ; arg[tid + 1] = loaded value
 179   ret void
 180 }
 181
 182 ; GCN-LABEL: load_vmcnt_global:
 183 ; GFX8:     flat_load_dword
 184 ; GFX9_10:  global_load_dword
 185 ; GFX8:     s_waitcnt vmcnt(0){{$}}
 186 ; GFX9_10:  s_waitcnt vmcnt(0){{$}}
 187 ; GCN-NEXT: {{global|flat}}_store_dword
 188 define amdgpu_kernel void @load_vmcnt_global(i32 addrspace(1)* %arg) {
 189 entry:  ; addrspace(1) load feeding a store directly; no barrier or fence in this kernel
 190   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
 191   %tid.ext = zext i32 %tid to i64
 192   %tid.hi = shl nuw nsw i64 %tid.ext, 32  ; tid << 32
 193   %load.ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid.ext
 194   %loaded = load i32, i32 addrspace(1)* %load.ptr, align 4  ; arg[tid]
 195   %next.hi = add nuw nsw i64 %tid.hi, 4294967296  ; + (1 << 32)
 196   %next.idx = lshr exact i64 %next.hi, 32  ; tid + 1
 197   %store.ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %next.idx
 198   store i32 %loaded, i32 addrspace(1)* %store.ptr, align 4  ; arg[tid + 1] = loaded value
 199   ret void
 200 }
 201
 202 ; GCN-LABEL: load_vmcnt_flat:
 203 ; GCN:      flat_load_dword
 204 ; GCN-NOT:  vscnt
 205 ; GCN:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 206 ; GCN-NEXT: {{global|flat}}_store_dword
 207 define amdgpu_kernel void @load_vmcnt_flat(i32* %arg) {
 208 entry:  ; load through an unqualified i32* pointer feeding a store directly; no barrier or fence in this kernel
 209   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
 210   %tid.ext = zext i32 %tid to i64
 211   %tid.hi = shl nuw nsw i64 %tid.ext, 32  ; tid << 32
 212   %load.ptr = getelementptr inbounds i32, i32* %arg, i64 %tid.ext
 213   %loaded = load i32, i32* %load.ptr, align 4  ; arg[tid]
 214   %next.hi = add nuw nsw i64 %tid.hi, 4294967296  ; + (1 << 32)
 215   %next.idx = lshr exact i64 %next.hi, 32  ; tid + 1
 216   %store.ptr = getelementptr inbounds i32, i32* %arg, i64 %next.idx
 217   store i32 %loaded, i32* %store.ptr, align 4  ; arg[tid + 1] = loaded value
 218   ret void
 219 }
 220
 221 ; GCN-LABEL: store_vscnt_private:
 222 ; GCN:         buffer_store_dword
 223 ; GFX8_9:      s_waitcnt vmcnt(0)
 224 ; GFX10:       s_waitcnt_vscnt null, 0x0
 225 ; GCN-NEXT:    s_setpc_b64
 226 define void @store_vscnt_private(i32 addrspace(5)* %p) {  ; non-kernel function: one addrspace(5) store, then return
 227   store i32 0, i32 addrspace(5)* %p  ; the checks expect a wait between this store and s_setpc_b64
 228   ret void
 229 }
 230
 231 ; GCN-LABEL: store_vscnt_global:
 232 ; GFX8:        flat_store_dword
 233 ; GFX9_10:     global_store_dword
 234 ; GFX8_9:      s_waitcnt vmcnt(0)
 235 ; GFX10:       s_waitcnt_vscnt null, 0x0
 236 ; GCN-NEXT:    s_setpc_b64
 237 define void @store_vscnt_global(i32 addrspace(1)* %p) {  ; non-kernel function: one addrspace(1) store, then return
 238   store i32 0, i32 addrspace(1)* %p  ; the checks expect a wait between this store and s_setpc_b64
 239   ret void
 240 }
 241
 242 ; GCN-LABEL: store_vscnt_flat:
 243 ; GCN:         flat_store_dword
 244 ; GFX8_9:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 245 ; GFX10:       s_waitcnt lgkmcnt(0){{$}}
 246 ; GFX10:       s_waitcnt_vscnt null, 0x0
 247 ; GCN-NEXT:    s_setpc_b64
 248 define void @store_vscnt_flat(i32* %p) {  ; non-kernel function: one store through an unqualified i32* pointer, then return
 249   store i32 0, i32* %p  ; the checks expect the waits listed above between this store and s_setpc_b64
 250   ret void
 251 }
 252
 253 ; GCN-LABEL: function_prologue:
 254 ; GCN:        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
 255 ; GFX10:      s_waitcnt_vscnt null, 0x0
 256 ; GCN-NEXT:   s_setpc_b64
 257 define void @function_prologue() {  ; empty non-kernel function: the checks cover only the waits emitted before the return
 258   ret void
 259 }
 260
 261 declare void @llvm.amdgcn.s.barrier()
 262 declare i32 @llvm.amdgcn.workitem.id.x()