1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
; Test that the VGPR spiller correctly switches to SGPR offsets when the
; instruction offset field would overflow, and that it accounts for memory
; swizzling.
; GCN-LABEL: test_inst_offset_kernel
; Spill lands at scratch offset 4092, the last value the checks below show
; being encoded as a plain immediate offset (no SGPR offset register).
define amdgpu_kernel void @test_inst_offset_kernel() {
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in a VGPR and must be spilled
  ; (the RUN lines restrict the allocator to 8 registers).
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr
; GCN-LABEL: test_sgpr_offset_kernel
; Here the spill lands at offset 4096, one past what the immediate offset
; field holds (see test_inst_offset_kernel), so the checks require the
; offset to be materialized in an SGPR first.
define amdgpu_kernel void @test_sgpr_offset_kernel() {
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; MUBUF: s_mov_b32 s4, 0x40000
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 s2, 0x1000
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in a VGPR and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr
; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
; pointer to temporarily update, so we just crash.

; GCN-LABEL: test_sgpr_offset_function_scavenge_fail
; Attribute #2 caps this function at 14 SGPRs / 8 VGPRs, and the inline asm
; keeps eight values live in SGPRs across the spill point, so no SGPR can be
; scavenged to hold the spill offset. The checks pin the fallback: for MUBUF
; the offset is added to s32 and then subtracted back; for flat scratch a
; temporary SGPR receives s32 + 0x1000.
define void @test_sgpr_offset_function_scavenge_fail() #2 {
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

  ; Produce eight SGPR-resident values that stay live across the spill.
  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7

  ; 0x40000 / 64 = 4096 (for wave64)
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; MUBUF: s_add_i32 s32, s32, 0x40000
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
  ; MUBUF: s_add_i32 s32, s32, 0xfffc0000
  ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1000
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)

  ; Re-occupy all eight SGPRs for the reload side as well.
  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

  ; Clobber v0-v7 so %a cannot stay in a VGPR across this point.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0

  ; MUBUF: s_add_i32 s32, s32, 0x40000
  ; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
  ; MUBUF: s_add_i32 s32, s32, 0xfffc0000
  ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1000
  ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload

  ; Force %a to spill with no free SGPRs
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
; GCN-LABEL: test_sgpr_offset_subregs_kernel
; NOTE(review): despite the "sgpr_offset" name, the checks below pin the
; spill of both subregs to plain immediate offsets (4088/4092) in the
; buffer case; the name looks swapped with test_inst_offset_subregs_kernel
; -- confirm against history before renaming.
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]] ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in VGPRs and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
; GCN-LABEL: test_inst_offset_subregs_kernel
; NOTE(review): despite the "inst_offset" name, the checks below require
; the offset to be materialized in an SGPR (s_mov_b32 s4, 0x3ff00); the
; name looks swapped with test_sgpr_offset_subregs_kernel -- confirm
; against history before renaming.
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; MUBUF: s_mov_b32 s4, 0x3ff00
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]] ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in VGPRs and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
; GCN-LABEL: test_inst_offset_function
; Non-kernel variant of test_inst_offset_kernel: offset 4092 is still
; encoded as an immediate, applied relative to an SGPR base register.
define void @test_inst_offset_function() {
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in a VGPR and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr
; GCN-LABEL: test_sgpr_offset_function
; Non-kernel variant of test_sgpr_offset_kernel: offset 4096 does not fit
; in the instruction, so the checks require it to be added to s32 into a
; scratch SGPR first.
define void @test_sgpr_offset_function() {
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; MUBUF: s_add_i32 s4, s32, 0x40000
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; FLATSCR: s_add_i32 s0, s32, 0x1000
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in a VGPR and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr
; GCN-LABEL: test_sgpr_offset_subregs_function
; NOTE(review): as with the kernel variant, the "sgpr_offset" name does not
; match the comments/checks below (the spill fits in the immediate offset
; field) -- confirm against history before renaming.
define void @test_sgpr_offset_subregs_function() {
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4088 ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in VGPRs and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
; GCN-LABEL: test_inst_offset_subregs_function
; NOTE(review): as with the kernel variant, the "inst_offset" name does not
; match the comments/checks below (the offset is materialized in an SGPR)
; -- confirm against history before renaming.
define void @test_inst_offset_subregs_function() {
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; MUBUF: s_add_i32 s4, s32, 0x3ff00
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4092 ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Clobber v0-v7 so %a cannot stay in VGPRs and must be spilled.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
attributes #0 = { nounwind }
; NOTE(review): #1 is not referenced by any function visible in this chunk;
; it may be used by code elsewhere in the file -- verify before removing.
attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
; #2 (used by test_sgpr_offset_function_scavenge_fail) caps the register
; budget so the spiller cannot scavenge a free SGPR for the offset.
attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }