; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s
; Test that the VGPR spiller correctly switches to SGPR offsets when the
; instruction offset field would overflow, and that it accounts for memory
; swizzling.
; CHECK-LABEL: test_inst_offset_kernel
define amdgpu_kernel void @test_inst_offset_kernel() {
entry:
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}
; CHECK-LABEL: test_sgpr_offset_kernel
define amdgpu_kernel void @test_sgpr_offset_kernel() {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; CHECK: s_add_u32 s6, s7, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}
; CHECK-LABEL: test_sgpr_offset_kernel_scavenge_fail
define amdgpu_kernel void @test_sgpr_offset_kernel_scavenge_fail() #1 {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

  ; 0x40000 / 64 = 4096 (for wave64)
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Pin 8 SGPR values so the scavenger has no free SGPR for the offset
  ; (#1 limits the function to 17 SGPRs).
  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0

  ; With no scavengeable SGPR, the offset is added to an in-use SGPR and
  ; subtracted again after the reload.
  ; CHECK: s_add_u32 s7, s7, 0x40000
  ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s7 ; 4-byte Folded Reload
  ; CHECK: s_sub_u32 s7, s7, 0x40000

  ; Force %a to spill with no free SGPRs
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)

  ret void
}
; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail
define void @test_sgpr_offset_function_scavenge_fail() #2 {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

  ; Pin 8 SGPR values so the scavenger has no free SGPR for the offset
  ; (#2 limits the function to 14 SGPRs).
  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7

  ; 0x40000 / 64 = 4096 (for wave64)
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; With no scavengeable SGPR, the offset is added to s32 and subtracted
  ; again around the spill.
  ; CHECK: s_add_u32 s32, s32, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
  ; CHECK: s_sub_u32 s32, s32, 0x40000
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)

  ; Re-pin all SGPRs before the reload as well.
  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0

  ; CHECK: s_add_u32 s32, s32, 0x40000
  ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
  ; CHECK: s_sub_u32 s32, s32, 0x40000

  ; Force %a to spill with no free SGPRs
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)

  ret void
}
; CHECK-LABEL: test_sgpr_offset_subregs_kernel
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
entry:
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}
; CHECK-LABEL: test_inst_offset_subregs_kernel
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
entry:
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; CHECK: s_add_u32 s6, s7, 0x3ff00
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}
; CHECK-LABEL: test_inst_offset_function
define void @test_inst_offset_function() {
entry:
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}
; CHECK-LABEL: test_sgpr_offset_function
define void @test_sgpr_offset_function() {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; CHECK: s_add_u32 s4, s32, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}
; CHECK-LABEL: test_sgpr_offset_subregs_function
define void @test_sgpr_offset_subregs_function() {
entry:
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}
; CHECK-LABEL: test_inst_offset_subregs_function
define void @test_inst_offset_subregs_function() {
entry:
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; CHECK: s_add_u32 s4, s32, 0x3ff00
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill: clobber every VGPR the test is limited to.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}
attributes #0 = { nounwind }
; Restrict register budgets so the scavenge-fail tests really run out of SGPRs.
attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }