llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll

   1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
   2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
   3
   4 ; Test that the VGPR spiller correctly switches to SGPR offsets when the
   5 ; instruction offset field would overflow, and that it accounts for memory
   6 ; swizzling.
   7
   8 ; GCN-LABEL: test_inst_offset_kernel
   9 define amdgpu_kernel void @test_inst_offset_kernel() {
  10 entry:
  11   ; Occupy 4092 bytes of scratch (4088-byte alloca + 4 bytes not allocated here; TODO confirm their origin),
  12   ; so the spill of %a lands at offset 4092 and just fits in the instruction offset field.
  13   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  14   %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  15
  16   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  17   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
  18   ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill
  19   %a = load volatile i32, i32 addrspace(5)* %aptr
  20
  21   ; Force %a to spill: the empty asm clobbers v0-v7 while %a is live.
  22   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
  23
  24   %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  25   store volatile i32 %a, i32 addrspace(5)* %outptr
  26
  27   ret void
  28 }
  29
  30 ; GCN-LABEL: test_sgpr_offset_kernel
  31 define amdgpu_kernel void @test_sgpr_offset_kernel() {
  32 entry:
  33   ; Occupy 4096 bytes of scratch (4092-byte alloca + 4 bytes not allocated here; TODO confirm their origin),
  34   ; so the offset of the spill of %a no longer fits in the instruction offset field and must be placed in an SGPR.
  35   %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  36   %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  37
  38   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  39   ; 0x40000 / 64 = 4096 (for wave64); the flat-scratch form below uses the unscaled 0x1000.
  40   ; MUBUF:   s_mov_b32 s4, 0x40000
  41   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  42   ; FLATSCR: s_movk_i32 s2, 0x1000
  43   ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill
  44   %a = load volatile i32, i32 addrspace(5)* %aptr
  45
  46   ; Force %a to spill: the empty asm clobbers v0-v7 while %a is live.
  47   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
  48
  49   %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  50   store volatile i32 %a, i32 addrspace(5)* %outptr
  51
  52   ret void
  53 }
  54
  55 ; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
  56 ; pointer to temporarily update, so we just crash.
  57
  58 ; GCN-LABEL: test_sgpr_offset_function_scavenge_fail_func
  59 define void @test_sgpr_offset_function_scavenge_fail_func() #2 {
  60 entry:
  61   ; Occupy 4096 bytes of scratch with the alloca; the checks below expect the spill of %a at 0x1004,
  62   ; which does not fit in the instruction offset field.
  63   %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  64   %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  65
  66   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  67
  68   %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  69   %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
  70   %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
  71   %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
  72   %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
  73   %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
  74   %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
  75   %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
  76   %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
  77
  78   ; The checks below expect the unscaled byte offset 0x1004 (4100): in a VGPR with 'offen' for the buffer form, added to s32 for flat scratch.
  79   %a = load volatile i32, i32 addrspace(5)* %aptr
  80
  81   ; MUBUF:   v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004
  82   ; MUBUF-NEXT: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], s32 offen ; 4-byte Folded Spill
  83
  84 ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004
  85   ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
  86   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
  87
  88   %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  89   %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
  90   %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
  91   %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
  92   %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
  93   %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
  94   %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
  95   %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
  96   %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
  97
  98   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
  99
 100   ; MUBUF:   v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004
 101   ; MUBUF-NEXT: buffer_load_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], s32 offen ; 4-byte Folded Reload
 102   ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004
 103   ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload
 104
 105    ; Force %a to spill with no free SGPRs: the asm consumes eight SGPR values plus %a.
 106   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
 107   ret void
 108 }
 109
 110 define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 {
 111 entry:
 112   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
 113   ; fit in the instruction, and has to live in the SGPR offset.
 114   %alloca = alloca i8, i32 4096, align 4, addrspace(5)
 115   %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 116
 117   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
 118
 119   %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
 120   %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
 121   %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
 122   %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
 123   %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
 124   %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
 125   %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
 126   %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
 127   %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
 128
 129   ; 0x40000 / 64 = 4096 (for wave64)
 130   %a = load volatile i32, i32 addrspace(5)* %aptr
 131
 132   ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004
 133   ; MUBUF: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], 0 offen ; 4-byte Folded Spill
 134
 135   ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0x1004
 136   ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
 137   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
 138
 139   %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
 140   %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
 141   %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
 142   %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
 143   %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
 144   %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
 145   %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
 146   %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
 147   %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
 148
 149   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
 150
 151   ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004
 152   ; MUBUF: buffer_load_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], 0 offen ; 4-byte Folded Reload
 153   ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0x1004
 154   ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload
 155
 156    ; Force %a to spill with no free SGPRs
 157   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
 158   ret void
 159 }
 160
 161 ; GCN-LABEL: test_sgpr_offset_subregs_kernel
 162 define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
 163 entry:
 164   ; Occupy 4088 bytes of scratch (4084-byte alloca + 4 bytes not allocated here; TODO confirm their origin),
 165   ; so that the spill of the last subreg of %a still fits below offset 4096 (4088 + 8 - 4 = 4092)
 166   ; and can be placed in the instruction offset field.
 167   %alloca = alloca i8, i32 4084, align 4, addrspace(5)
 168   %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 169   %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 170
 171   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
 172   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
 173   ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8
 174   ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]]          ; 8-byte Folded Spill
 175   %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
 176   %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
 177
 178   ; Force %a to spill: the empty asm clobbers v0-v7 while %a is live.
 179   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 180
 181   ; Ensure the alloca sticks around by reading from it with a volatile load.
 182   %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
 183   %b = load volatile i32, i32 addrspace(5)* %bptr
 184
 185   ; Ensure the spill is of the full super-reg by passing the whole <2 x i32> to the asm.
 186   call void asm sideeffect "; $0", "r"(<2 x i32> %a)
 187
 188   ret void
 189 }
 190
 191 ; GCN-LABEL: test_inst_offset_subregs_kernel
 192 define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
 193 entry:
 194   ; Occupy 4092 bytes of scratch (4088-byte alloca + 4 bytes not allocated here; TODO confirm their origin),
 195   ; so that the spill of the last subreg of %a does not fit below offset 4096 (4092 + 8 - 4 = 4096)
 196   ; and has to live in the SGPR offset.
 197   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
 198   %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 199   %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 200
 201   ; 0x3ff00 / 64 = 4092 (for wave64); the second dword adds offset:4, and flat scratch uses the unscaled 0xffc.
 202   ; MUBUF:   s_mov_b32 s4, 0x3ff00
 203   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
 204   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
 205   ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc
 206   ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]]          ; 8-byte Folded Spill
 207   %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
 208   %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
 209
 210   ; Force %a to spill: the empty asm clobbers v0-v7 while %a is live.
 211   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 212
 213   ; Ensure the alloca sticks around by reading from it with a volatile load.
 214   %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
 215   %b = load volatile i32, i32 addrspace(5)* %bptr
 216
 217   ; Ensure the spill is of the full super-reg by passing the whole <2 x i32> to the asm.
 218   call void asm sideeffect "; $0", "r"(<2 x i32> %a)
 219
 220   ret void
 221 }
 222
 223 ; GCN-LABEL: test_inst_offset_function
 224 define void @test_inst_offset_function() {
 225 entry:
 226   ; Occupy enough bytes of scratch (a 4088-byte alloca), so the offset of the spill of %a
 227   ; (4088 in the checks below) just fits in the instruction offset field when the emergency stack
 228   ; slot is added. It's hard to hit the actual limit since we're also
 229   ; going to insert the emergency stack slot for large frames.
 230   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
 231   %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 232
 233   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
 234   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
 235   ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
 236   %a = load volatile i32, i32 addrspace(5)* %aptr
 237
 238   ; Force %a to spill: the empty asm clobbers v0-v7 while %a is live.
 239   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 240
 241   %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
 242   store volatile i32 %a, i32 addrspace(5)* %outptr
 243
 244   ret void
 245 }
 246
 247 ; GCN-LABEL: test_sgpr_offset_function
 248 define void @test_sgpr_offset_function() {
 249 entry:
 250   ; Occupy 4096 bytes of scratch with the alloca, so the offset of the spill of %a (0x1004 = 4100 in the
 251   ; checks below) does not fit in the instruction, and has to live in the SGPR offset.
 252   %alloca = alloca i8, i32 4096, align 4, addrspace(5)
 253   %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 254
 255   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
 256   ; 0x40100 / 64 = 4100 (for wave64)
 257   ; MUBUF:   s_add_i32 s4, s32, 0x40100
 258   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
 259   ; FLATSCR: s_add_i32 s0, s32, 0x1004
 260   ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill
 261   %a = load volatile i32, i32 addrspace(5)* %aptr
 262
 263   ; Force %a to spill: the empty asm clobbers v0-v7 while %a is live.
 264   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 265
 266   %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
 267   store volatile i32 %a, i32 addrspace(5)* %outptr
 268
 269   ret void
 270 }
 271
 272 ; GCN-LABEL: test_sgpr_offset_subregs_function
 273 define void @test_sgpr_offset_subregs_function() {
 274 entry:
 275   ; We want to test the spill of the last subreg of %a is the highest
 276   ; valid value for the immediate offset. We enable the emergency
 277   ; stack slot for large frames, so it's hard to get the frame layout
 278   ; exactly as we want to test it.
 279   ;
 280   ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a
 281   ; still fits below offset 4096 (4084 + 8 - 4 = 4088, matching the checks below), and can be placed in
 282   ; the instruction offset field.
 283   %alloca = alloca i8, i32 4084, align 4, addrspace(5)
 284   %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 285   %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 286
 287   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4084 ; 4-byte Folded Spill
 288   ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
 289   ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4084 ; 8-byte Folded Spill
 290   %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
 291   %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
 292
 293   ; Force %a to spill: the empty asm clobbers v0-v7 while %a is live.
 294   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 295
 296   ; Ensure the alloca sticks around by reading from it with a volatile load.
 297   %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
 298   %b = load volatile i32, i32 addrspace(5)* %bptr
 299
 300   ; Ensure the spill is of the full super-reg by passing the whole <2 x i32> to the asm.
 301   call void asm sideeffect "; $0", "r"(<2 x i32> %a)
 302
 303   ret void
 304 }
 305
 306 ; GCN-LABEL: test_inst_offset_subregs_function
 307 define void @test_inst_offset_subregs_function() {
 308 entry:
 309   ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
 310   ; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live
 311   ; in the SGPR offset.
 312   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
 313   %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 314   %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 315
 316   ; 0x3ff00 / 64 = 4092 (for wave64)
 317   ; MUBUF: s_add_i32 s4, s32, 0x3ff00
 318   ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
 319   ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
 320   ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4092 ; 8-byte Folded Spill
 321   %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
 322   %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
 323
 324   ; Force %a to spill: the empty asm clobbers v0-v7 while %a is live.
 325   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 326
 327   ; Ensure the alloca sticks around by reading from it with a volatile load.
 328   %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
 329   %b = load volatile i32, i32 addrspace(5)* %bptr
 330
 331   ; Ensure the spill is of the full super-reg by passing the whole <2 x i32> to the asm.
 332   call void asm sideeffect "; $0", "r"(<2 x i32> %a)
 333
 334   ret void
 335 }
 336
 337 attributes #0 = { nounwind } ; applied to the v0-v7 clobbering asm calls in the scavenge_fail tests
 338 attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" } ; no user in the visible part of this file
 339 attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" } ; used by test_sgpr_offset_function_scavenge_fail_func
 340 attributes #3 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } ; used by test_sgpr_offset_function_scavenge_fail_kernel