llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefix=MUBUF %s
   3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=FLATSCR %s
   4
   5 ; Test that the VGPR spiller correctly switches to SGPR offsets when the
   6 ; instruction offset field would overflow, and that it accounts for memory
   7 ; swizzling.
   8
   9 define amdgpu_kernel void @test_inst_offset_kernel() {
  10 ; MUBUF-LABEL: test_inst_offset_kernel:
  11 ; MUBUF:       ; %bb.0: ; %entry
  12 ; MUBUF-NEXT:    s_add_u32 s0, s0, s15
  13 ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
  14 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 glc
  15 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
  16 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
  17 ; MUBUF-NEXT:    ;;#ASMSTART
  18 ; MUBUF-NEXT:    ;;#ASMEND
  19 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
  20 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
  21 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
  22 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
  23 ; MUBUF-NEXT:    s_endpgm
  24 ;
  25 ; FLATSCR-LABEL: test_inst_offset_kernel:
  26 ; FLATSCR:       ; %bb.0: ; %entry
  27 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s6, s11
  28 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
  29 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
  30 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
  31 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
  32 ; FLATSCR-NEXT:    s_movk_i32 s0, 0xff8
  33 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
  34 ; FLATSCR-NEXT:    ;;#ASMSTART
  35 ; FLATSCR-NEXT:    ;;#ASMEND
  36 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
  37 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
  38 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
  39 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 offset:4
  40 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
  41 ; FLATSCR-NEXT:    s_endpgm
  42 entry:
  43   ; The alloca occupies 4088 bytes of scratch, so the 4-byte spill of %a lands
  44   ; at offset 4088 and still fits in the MUBUF instruction offset field.
  45   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  46
  47   %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
  48
  49
  50   %a = load volatile i32, ptr addrspace(5) %aptr
  51
  52   ; Clobber v0-v7 to force %a to spill.
  53   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
  54
  55   %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
  56   store volatile i32 %a, ptr addrspace(5) %outptr
  57
  58   ret void
  59 }
  60
  61 define amdgpu_kernel void @test_sgpr_offset_kernel() {
  62 ; MUBUF-LABEL: test_sgpr_offset_kernel:
  63 ; MUBUF:       ; %bb.0: ; %entry
  64 ; MUBUF-NEXT:    s_add_u32 s0, s0, s15
  65 ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
  66 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
  67 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
  68 ; MUBUF-NEXT:    s_mov_b32 s4, 0x40000
  69 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
  70 ; MUBUF-NEXT:    ;;#ASMSTART
  71 ; MUBUF-NEXT:    ;;#ASMEND
  72 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
  73 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
  74 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
  75 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
  76 ; MUBUF-NEXT:    s_endpgm
  77 ;
  78 ; FLATSCR-LABEL: test_sgpr_offset_kernel:
  79 ; FLATSCR:       ; %bb.0: ; %entry
  80 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s6, s11
  81 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
  82 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
  83 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:8 glc
  84 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
  85 ; FLATSCR-NEXT:    s_movk_i32 s0, 0x1000
  86 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
  87 ; FLATSCR-NEXT:    ;;#ASMSTART
  88 ; FLATSCR-NEXT:    ;;#ASMEND
  89 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
  90 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
  91 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
  92 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 offset:8
  93 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
  94 ; FLATSCR-NEXT:    s_endpgm
  95 entry:
  96   ; The 4092-byte alloca starts at offset 4, so 4096 bytes of scratch are occupied:
  97   ; the spill offset of %a does not fit in the instruction and has to live in an SGPR.
  98   %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  99
 100   %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 101   ; MUBUF SGPR offset of the spill: 0x40000 / 64 = 4096 (for wave64)
 102   %a = load volatile i32, ptr addrspace(5) %aptr
 103   ; Clobber v0-v7 to force %a to spill.
 104   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 105
 106   %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 107   store volatile i32 %a, ptr addrspace(5) %outptr
 108
 109   ret void
 110 }
 111
 112 define void @test_sgpr_offset_function_scavenge_fail_func() #2 {
 113 ; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_func:
 114 ; MUBUF:       ; %bb.0: ; %entry
 115 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 116 ; MUBUF-NEXT:    ;;#ASMSTART
 117 ; MUBUF-NEXT:    ;;#ASMEND
 118 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
 119 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 120 ; MUBUF-NEXT:    s_add_i32 s10, s32, 0x40100
 121 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill
 122 ; MUBUF-NEXT:    ;;#ASMSTART
 123 ; MUBUF-NEXT:    ;;#ASMEND
 124 ; MUBUF-NEXT:    ;;#ASMSTART
 125 ; MUBUF-NEXT:    ;;#ASMEND
 126 ; MUBUF-NEXT:    ;;#ASMSTART
 127 ; MUBUF-NEXT:    ;;#ASMEND
 128 ; MUBUF-NEXT:    s_add_i32 s10, s32, 0x40100
 129 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload
 130 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 131 ; MUBUF-NEXT:    ;;#ASMSTART
 132 ; MUBUF-NEXT:    ;;#ASMEND
 133 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 134 ;
 135 ; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_func:
 136 ; FLATSCR:       ; %bb.0: ; %entry
 137 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 138 ; FLATSCR-NEXT:    ;;#ASMSTART
 139 ; FLATSCR-NEXT:    ;;#ASMEND
 140 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:8 glc
 141 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 142 ; FLATSCR-NEXT:    s_add_i32 s8, s32, 0x1004
 143 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s8 ; 4-byte Folded Spill
 144 ; FLATSCR-NEXT:    ;;#ASMSTART
 145 ; FLATSCR-NEXT:    ;;#ASMEND
 146 ; FLATSCR-NEXT:    ;;#ASMSTART
 147 ; FLATSCR-NEXT:    ;;#ASMEND
 148 ; FLATSCR-NEXT:    ;;#ASMSTART
 149 ; FLATSCR-NEXT:    ;;#ASMEND
 150 ; FLATSCR-NEXT:    s_add_i32 s8, s32, 0x1004
 151 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s8 ; 4-byte Folded Reload
 152 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 153 ; FLATSCR-NEXT:    ;;#ASMSTART
 154 ; FLATSCR-NEXT:    ;;#ASMEND
 155 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 156 entry:
 157   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
 158   ; fit in the instruction, and has to live in the SGPR offset.
 159   %alloca = alloca i8, i32 4096, align 4, addrspace(5)
 160
 161   %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 162
 163   %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
 164   %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
 165   %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
 166   %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
 167   %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
 168   %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
 169   %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
 170   %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
 171   %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
 172
 173   ; MUBUF SGPR offset of the spill: 0x40100 / 64 = 4100 (for wave64)
 174   %a = load volatile i32, ptr addrspace(5) %aptr
 175   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
 176
 177   %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
 178   %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
 179   %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
 180   %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
 181   %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
 182   %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
 183   %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
 184   %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
 185   %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
 186
 187   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
 188    ; Force %a to spill with no free SGPRs
 189   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
 190   ret void
 191 }
 192
 193 define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 {
 194 ; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_kernel:
 195 ; MUBUF:       ; %bb.0: ; %entry
 196 ; MUBUF-NEXT:    s_add_u32 s0, s0, s15
 197 ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
 198 ; MUBUF-NEXT:    ;;#ASMSTART
 199 ; MUBUF-NEXT:    ;;#ASMEND
 200 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
 201 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 202 ; MUBUF-NEXT:    s_mov_b32 s10, 0x40100
 203 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill
 204 ; MUBUF-NEXT:    ;;#ASMSTART
 205 ; MUBUF-NEXT:    ;;#ASMEND
 206 ; MUBUF-NEXT:    ;;#ASMSTART
 207 ; MUBUF-NEXT:    ;;#ASMEND
 208 ; MUBUF-NEXT:    ;;#ASMSTART
 209 ; MUBUF-NEXT:    ;;#ASMEND
 210 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload
 211 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 212 ; MUBUF-NEXT:    ;;#ASMSTART
 213 ; MUBUF-NEXT:    ;;#ASMEND
 214 ; MUBUF-NEXT:    s_endpgm
 215 ;
 216 ; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_kernel:
 217 ; FLATSCR:       ; %bb.0: ; %entry
 218 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s6, s11
 219 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 220 ; FLATSCR-NEXT:    s_mov_b32 s8, 0
 221 ; FLATSCR-NEXT:    ;;#ASMSTART
 222 ; FLATSCR-NEXT:    ;;#ASMEND
 223 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s8 offset:8 glc
 224 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 225 ; FLATSCR-NEXT:    s_movk_i32 s8, 0x1004
 226 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s8 ; 4-byte Folded Spill
 227 ; FLATSCR-NEXT:    ;;#ASMSTART
 228 ; FLATSCR-NEXT:    ;;#ASMEND
 229 ; FLATSCR-NEXT:    ;;#ASMSTART
 230 ; FLATSCR-NEXT:    ;;#ASMEND
 231 ; FLATSCR-NEXT:    ;;#ASMSTART
 232 ; FLATSCR-NEXT:    ;;#ASMEND
 233 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s8 ; 4-byte Folded Reload
 234 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 235 ; FLATSCR-NEXT:    ;;#ASMSTART
 236 ; FLATSCR-NEXT:    ;;#ASMEND
 237 ; FLATSCR-NEXT:    s_endpgm
 238 entry:
 239   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
 240   ; fit in the instruction, and has to live in the SGPR offset.
 241   %alloca = alloca i8, i32 4096, align 4, addrspace(5)
 242
 243   %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 244
 245   %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
 246   %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
 247   %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
 248   %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
 249   %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
 250   %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
 251   %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
 252   %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
 253   %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
 254
 255   ; MUBUF SGPR offset of the spill: 0x40100 / 64 = 4100 (for wave64)
 256   %a = load volatile i32, ptr addrspace(5) %aptr
 257   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
 258
 259   %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
 260   %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
 261   %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
 262   %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
 263   %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
 264   %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
 265   %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
 266   %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
 267   %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
 268
 269   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
 270    ; Force %a to spill with no free SGPRs
 271   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
 272   ret void
 273 }
 274
 275 define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
 276 ; MUBUF-LABEL: test_sgpr_offset_subregs_kernel:
 277 ; MUBUF:       ; %bb.0: ; %entry
 278 ; MUBUF-NEXT:    s_add_u32 s0, s0, s15
 279 ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
 280 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
 281 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 282 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:12 glc
 283 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 284 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Spill
 285 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 286 ; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
 287 ; MUBUF-NEXT:    ;;#ASMSTART
 288 ; MUBUF-NEXT:    ;;#ASMEND
 289 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 glc
 290 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 291 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Reload
 292 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
 293 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 294 ; MUBUF-NEXT:    ;;#ASMSTART
 295 ; MUBUF-NEXT:    ; v[0:1]
 296 ; MUBUF-NEXT:    ;;#ASMEND
 297 ; MUBUF-NEXT:    s_endpgm
 298 ;
 299 ; FLATSCR-LABEL: test_sgpr_offset_subregs_kernel:
 300 ; FLATSCR:       ; %bb.0: ; %entry
 301 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s6, s11
 302 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 303 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
 304 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc
 305 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 306 ; FLATSCR-NEXT:    s_movk_i32 s0, 0xff4
 307 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
 308 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
 309 ; FLATSCR-NEXT:    ;;#ASMSTART
 310 ; FLATSCR-NEXT:    ;;#ASMEND
 311 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
 312 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 313 ; FLATSCR-NEXT:    s_movk_i32 s0, 0xff4
 314 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
 315 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 316 ; FLATSCR-NEXT:    ;;#ASMSTART
 317 ; FLATSCR-NEXT:    ; v[0:1]
 318 ; FLATSCR-NEXT:    ;;#ASMEND
 319 ; FLATSCR-NEXT:    s_endpgm
 320 entry:
 321   ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a
 322   ; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in
 323   ; the instruction offset field.
 324   %alloca = alloca i8, i32 4084, align 4, addrspace(5)
 325   %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
 326   %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
 327
 328   ; Clobber v0-v7 to force %a to spill.
 329   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 330
 331   ; Ensure the alloca sticks around.
 332   %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 333   %b = load volatile i32, ptr addrspace(5) %bptr
 334
 335   ; Ensure the spill is of the full super-reg.
 336   call void asm sideeffect "; $0", "r"(<2 x i32> %a)
 337
 338   ret void
 339 }
 340
 341 define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
 342 ; MUBUF-LABEL: test_inst_offset_subregs_kernel:
 343 ; MUBUF:       ; %bb.0: ; %entry
 344 ; MUBUF-NEXT:    s_add_u32 s0, s0, s15
 345 ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
 346 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
 347 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 348 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16 glc
 349 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 350 ; MUBUF-NEXT:    s_mov_b32 s4, 0x3ff00
 351 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
 352 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 353 ; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
 354 ; MUBUF-NEXT:    ;;#ASMSTART
 355 ; MUBUF-NEXT:    ;;#ASMEND
 356 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
 357 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 358 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
 359 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
 360 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 361 ; MUBUF-NEXT:    ;;#ASMSTART
 362 ; MUBUF-NEXT:    ; v[0:1]
 363 ; MUBUF-NEXT:    ;;#ASMEND
 364 ; MUBUF-NEXT:    s_endpgm
 365 ;
 366 ; FLATSCR-LABEL: test_inst_offset_subregs_kernel:
 367 ; FLATSCR:       ; %bb.0: ; %entry
 368 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s6, s11
 369 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 370 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
 371 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc
 372 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 373 ; FLATSCR-NEXT:    s_movk_i32 s0, 0xffc
 374 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
 375 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
 376 ; FLATSCR-NEXT:    ;;#ASMSTART
 377 ; FLATSCR-NEXT:    ;;#ASMEND
 378 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 offset:8 glc
 379 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 380 ; FLATSCR-NEXT:    s_movk_i32 s0, 0xffc
 381 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
 382 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 383 ; FLATSCR-NEXT:    ;;#ASMSTART
 384 ; FLATSCR-NEXT:    ; v[0:1]
 385 ; FLATSCR-NEXT:    ;;#ASMEND
 386 ; FLATSCR-NEXT:    s_endpgm
 387 entry:
 388   ; Occupy 4092 bytes of scratch (the 4088-byte alloca starts at offset 4), so
 389   ; that the spill of the last subreg of %a does not fit below offset 4096
 390   ; (4092 + 8 - 4 = 4096), and has to live in the SGPR offset.
 391   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
 392
 393   ; MUBUF SGPR offset of the spill: 0x3ff00 / 64 = 4092 (for wave64)
 394   %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
 395   %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
 396
 397   ; Clobber v0-v7 to force %a to spill.
 398   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 399
 400   ; Ensure the alloca sticks around.
 401   %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 402   %b = load volatile i32, ptr addrspace(5) %bptr
 403
 404   ; Ensure the spill is of the full super-reg.
 405   call void asm sideeffect "; $0", "r"(<2 x i32> %a)
 406
 407   ret void
 408 }
 409
 410 define void @test_inst_offset_function() {
 411 ; MUBUF-LABEL: test_inst_offset_function:
 412 ; MUBUF:       ; %bb.0: ; %entry
 413 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 414 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
 415 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 416 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
 417 ; MUBUF-NEXT:    ;;#ASMSTART
 418 ; MUBUF-NEXT:    ;;#ASMEND
 419 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
 420 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 421 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
 422 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 423 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 424 ;
 425 ; FLATSCR-LABEL: test_inst_offset_function:
 426 ; FLATSCR:       ; %bb.0: ; %entry
 427 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 428 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
 429 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 430 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4088 ; 4-byte Folded Spill
 431 ; FLATSCR-NEXT:    ;;#ASMSTART
 432 ; FLATSCR-NEXT:    ;;#ASMEND
 433 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4088 ; 4-byte Folded Reload
 434 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 435 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4
 436 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 437 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 438 entry:
 439   ; Occupy enough bytes of scratch, so the offset of the spill of %a
 440   ; just fits in the instruction offset field when the emergency stack
 441   ; slot is added. It's hard to hit the actual limit since we're also
 442   ; going to insert the emergency stack slot for large frames.
 443   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
 444
 445   %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 446
 447
 448   %a = load volatile i32, ptr addrspace(5) %aptr
 449
 450   ; Clobber v0-v7 to force %a to spill.
 451   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 452
 453   %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 454   store volatile i32 %a, ptr addrspace(5) %outptr
 455
 456   ret void
 457 }
 458
 459 define void @test_sgpr_offset_function() {
 460 ; MUBUF-LABEL: test_sgpr_offset_function:
 461 ; MUBUF:       ; %bb.0: ; %entry
 462 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 463 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
 464 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 465 ; MUBUF-NEXT:    s_add_i32 s4, s32, 0x40100
 466 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
 467 ; MUBUF-NEXT:    ;;#ASMSTART
 468 ; MUBUF-NEXT:    ;;#ASMEND
 469 ; MUBUF-NEXT:    s_add_i32 s4, s32, 0x40100
 470 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
 471 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 472 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
 473 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 474 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 475 ;
 476 ; FLATSCR-LABEL: test_sgpr_offset_function:
 477 ; FLATSCR:       ; %bb.0: ; %entry
 478 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 479 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:8 glc
 480 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 481 ; FLATSCR-NEXT:    s_add_i32 s0, s32, 0x1004
 482 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
 483 ; FLATSCR-NEXT:    ;;#ASMSTART
 484 ; FLATSCR-NEXT:    ;;#ASMEND
 485 ; FLATSCR-NEXT:    s_add_i32 s0, s32, 0x1004
 486 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
 487 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 488 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:8
 489 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 490 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 491 entry:
 492   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
 493   ; fit in the instruction, and has to live in the SGPR offset.
 494   %alloca = alloca i8, i32 4096, align 4, addrspace(5)
 495
 496   %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 497   ; MUBUF SGPR offset of the spill: 0x40100 / 64 = 4100 (for wave64)
 498   %a = load volatile i32, ptr addrspace(5) %aptr
 499
 500   ; Clobber v0-v7 to force %a to spill.
 501   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 502
 503   %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 504   store volatile i32 %a, ptr addrspace(5) %outptr
 505
 506   ret void
 507 }
 508
 509 define void @test_sgpr_offset_subregs_function() {
 510 ; MUBUF-LABEL: test_sgpr_offset_subregs_function:
 511 ; MUBUF:       ; %bb.0: ; %entry
 512 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 513 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
 514 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 515 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 glc
 516 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 517 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Spill
 518 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 519 ; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
 520 ; MUBUF-NEXT:    ;;#ASMSTART
 521 ; MUBUF-NEXT:    ;;#ASMEND
 522 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
 523 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 524 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Reload
 525 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
 526 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 527 ; MUBUF-NEXT:    ;;#ASMSTART
 528 ; MUBUF-NEXT:    ; v[0:1]
 529 ; MUBUF-NEXT:    ;;#ASMEND
 530 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 531 ;
 532 ; FLATSCR-LABEL: test_sgpr_offset_subregs_function:
 533 ; FLATSCR:       ; %bb.0: ; %entry
 534 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 535 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:8 glc
 536 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 537 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:4084 ; 8-byte Folded Spill
 538 ; FLATSCR-NEXT:    ;;#ASMSTART
 539 ; FLATSCR-NEXT:    ;;#ASMEND
 540 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
 541 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 542 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:4084 ; 8-byte Folded Reload
 543 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 544 ; FLATSCR-NEXT:    ;;#ASMSTART
 545 ; FLATSCR-NEXT:    ; v[0:1]
 546 ; FLATSCR-NEXT:    ;;#ASMEND
 547 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 548 entry:
 549   ; We want to test the spill of the last subreg of %a is the highest
 550   ; valid value for the immediate offset. We enable the emergency
 551   ; stack slot for large frames, so it's hard to get the frame layout
 552   ; exactly as we want to test it.
 553   ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a
 554   ; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in
 555   ; the instruction offset field.
 556   %alloca = alloca i8, i32 4084, align 4, addrspace(5)
 557   %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
 558   %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
 559
 560   ; Clobber v0-v7 to force %a to spill.
 561   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 562
 563   ; Ensure the alloca sticks around.
 564   %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 565   %b = load volatile i32, ptr addrspace(5) %bptr
 566
 567   ; Ensure the spill is of the full super-reg.
 568   call void asm sideeffect "; $0", "r"(<2 x i32> %a)
 569
 570   ret void
 571 }
 572
 573 define void @test_inst_offset_subregs_function() {
 574 ; MUBUF-LABEL: test_inst_offset_subregs_function:
 575 ; MUBUF:       ; %bb.0: ; %entry
 576 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 577 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 glc
 578 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 579 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 glc
 580 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 581 ; MUBUF-NEXT:    s_add_i32 s4, s32, 0x3ff00
 582 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
 583 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 584 ; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
 585 ; MUBUF-NEXT:    ;;#ASMSTART
 586 ; MUBUF-NEXT:    ;;#ASMEND
 587 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
 588 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 589 ; MUBUF-NEXT:    s_add_i32 s4, s32, 0x3ff00
 590 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
 591 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
 592 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 593 ; MUBUF-NEXT:    ;;#ASMSTART
 594 ; MUBUF-NEXT:    ; v[0:1]
 595 ; MUBUF-NEXT:    ;;#ASMEND
 596 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 597 ;
 598 ; FLATSCR-LABEL: test_inst_offset_subregs_function:
 599 ; FLATSCR:       ; %bb.0: ; %entry
 600 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 601 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:12 glc
 602 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 603 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:4092 ; 8-byte Folded Spill
 604 ; FLATSCR-NEXT:    ;;#ASMSTART
 605 ; FLATSCR-NEXT:    ;;#ASMEND
 606 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:8 glc
 607 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 608 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:4092 ; 8-byte Folded Reload
 609 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 610 ; FLATSCR-NEXT:    ;;#ASMSTART
 611 ; FLATSCR-NEXT:    ; v[0:1]
 612 ; FLATSCR-NEXT:    ;;#ASMEND
 613 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 614 entry:
 615   ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
 616   ; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live
 617   ; in the SGPR offset.
 618   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
 619
 620   ; 0x3ff00 / 64 = 4092 (for wave64)
 621   %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
 622   %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
 623
 624   ; Clobber v0-v7 to force %a to spill.
 625   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 626
 627   ; Ensure the alloca sticks around.
 628   %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 629   %b = load volatile i32, ptr addrspace(5) %bptr
 630
 631   ; Ensure the spill is of the full super-reg.
 632   call void asm sideeffect "; $0", "r"(<2 x i32> %a)
 633
 634   ret void
 635 }
 636
 637 attributes #0 = { nounwind }
 638 attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
 639 attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" }
 640 attributes #3 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" }