llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll

   1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
   2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
   3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s
   4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s
   5
   6 ; Tests for indirect addressing on SI, which is implemented using dynamic
   7 ; indexing of vectors.
   8
   9 ; GCN-LABEL: {{^}}extract_w_offset:
  10 ; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
  11 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
  12 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
  13 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
  14 ; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
  15 ; GCN-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
  16
  17 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
  18 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
  19
  20 ; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
  21 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
  22 ; IDXMODE-NEXT: s_set_gpr_idx_off
  23 define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) { ; Dynamic extract from a constant <16 x float>, index = %in + 1; result stored to %out.
  24 entry:
  25   %idx = add i32 %in, 1 ; constant +1 added to the kernel-argument index
  26   %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %idx ; lane i of the constant holds the value i + 1
  27   store float %elt, ptr addrspace(1) %out
  28   ret void
  29 }
  30
  31 ; XXX: Could do v_or_b32 directly
  32 ; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector:
  33 ; GCN-DAG: s_or_b32
  34 ; GCN-DAG: s_or_b32
  35 ; GCN-DAG: s_or_b32
  36 ; GCN-DAG: s_or_b32
  37 ; MOVREL: s_mov_b32 m0
  38 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
  39 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
  40 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
  41 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
  42
  43
  44 ; MOVREL: v_movrels_b32_e32
  45
  46 ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(SRC0){{$}}
  47 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
  48 ; IDXMODE-NEXT: s_set_gpr_idx_off
  49 define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) { ; Dynamic extract (index = %in + 1) from a vector computed out of a kernel argument rather than a constant.
  50 entry:
  51   %idx = add i32 %in, 1
  52   %vec = or <16 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16> ; lane-wise OR of the argument vector with constants 1..16
  53   %elt = extractelement <16 x i32> %vec, i32 %idx
  54   store i32 %elt, ptr addrspace(1) %out
  55   ret void
  56 }
  57
  58 ; GCN-LABEL: {{^}}extract_wo_offset:
  59 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
  60 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
  61 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
  62 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
  63 ; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
  64
  65 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
  66 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
  67
  68 ; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
  69 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
  70 ; IDXMODE-NEXT: s_set_gpr_idx_off
  71 define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) { ; Dynamic extract from a constant <16 x float> using %in directly as the index (no added offset).
  72 entry:
  73   %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %in
  74   store float %elt, ptr addrspace(1) %out
  75   ret void
  76 }
  77
  78 ; GCN-LABEL: {{^}}extract_neg_offset_sgpr:
  79 ; The offset depends on the register that holds the first element of the vector.
  80 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
  81 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
  82
  83 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
  84 ; IDXMODE: v_mov_b32_e32 v14, 15
  85 ; IDXMODE: v_mov_b32_e32 v15, 16
  86 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
  87 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
  88 ; IDXMODE-NEXT: s_set_gpr_idx_off
  89 define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %offset) { ; Dynamic extract from a constant <16 x i32> where the index is a kernel argument plus a negative constant.
  90 entry:
  91   %index = add i32 %offset, -512 ; negative constant offset applied to the index
  92   %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index ; NOTE(review): the constant goes 0,1,2,3,5..16 (no 4) - the checks above expect 15 and 16 in v14/v15, so keep as is
  93   store i32 %value, ptr addrspace(1) %out
  94   ret void
  95 }
  96
  97 ; GCN-LABEL: {{^}}extract_neg_offset_sgpr_loaded:
  98 ; The offset depends on the register that holds the first element of the vector.
  99 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
 100 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
 101
 102 ; IDXMODE-DAG: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
 103 ; IDXMODE-DAG: v_mov_b32_e32 v0,
 104 ; IDXMODE: v_mov_b32_e32 v1,
 105 ; IDXMODE: v_mov_b32_e32 v2,
 106 ; IDXMODE: v_mov_b32_e32 v3,
 107 ; IDXMODE: v_mov_b32_e32 v4,
 108 ; IDXMODE: v_mov_b32_e32 v5,
 109 ; IDXMODE: v_mov_b32_e32 v6,
 110 ; IDXMODE: v_mov_b32_e32 v7,
 111 ; IDXMODE: v_mov_b32_e32 v8,
 112 ; IDXMODE: v_mov_b32_e32 v9,
 113 ; IDXMODE: v_mov_b32_e32 v10,
 114 ; IDXMODE: v_mov_b32_e32 v11,
 115 ; IDXMODE: v_mov_b32_e32 v12,
 116 ; IDXMODE: v_mov_b32_e32 v13,
 117 ; IDXMODE: v_mov_b32_e32 v14,
 118 ; IDXMODE: v_mov_b32_e32 v15,
 119 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
 120 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 121 ; IDXMODE-NEXT: s_set_gpr_idx_off
 122 define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) { ; Same negative-offset extract as @extract_neg_offset_sgpr, but the vector comes from kernel arguments instead of a constant.
 123 entry:
 124   %index = add i32 %offset, -512 ; negative constant offset applied to the index
 125   %or = or <16 x i32> %vec0, %vec1 ; vector being indexed is the lane-wise OR of the two argument vectors
 126   %value = extractelement <16 x i32> %or, i32 %index
 127   store i32 %value, ptr addrspace(1) %out
 128   ret void
 129 }
 130
 131 ; GCN-LABEL: {{^}}extract_neg_offset_vgpr:
 132 ; The offset depends on the register that holds the first element of the vector.
 133
 134 ; GCN: v_cmp_eq_u32_e32
 135 ; GCN-COUNT-14: v_cndmask_b32
 136 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 16
 137 ; GCN: buffer_store_dword [[RESULT]]
 138 define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) { ; Negative-offset dynamic extract where the index is derived from the work-item id instead of a kernel argument.
 139 entry:
 140   %id = call i32 @llvm.amdgcn.workitem.id.x() #1 ; index source: work-item id in x
 141   %index = add i32 %id, -512 ; negative constant offset applied to the index
 142   %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index ; same constant as @extract_neg_offset_sgpr (0,1,2,3,5..16); the last select is checked against the literal 16
 143   store i32 %value, ptr addrspace(1) %out
 144   ret void
 145 }
 146
 147 ; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
 148 ; undefined behavior, but shouldn't crash compiler
 149 define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; Extract with an undef index; only the function label is checked, so this is a does-not-crash test.
 150 entry:
 151   %ld = load volatile <4 x i32>, ptr addrspace(1) %in ; volatile load of the vector to index
 152   %value = extractelement <4 x i32> %ld, i32 undef ; the undef index is the point of the test
 153   store i32 %value, ptr addrspace(1) %out
 154   ret void
 155 }
 156
 157 ; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
 158 ; undefined behavior, but shouldn't crash compiler
 159 define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; Insert with an undef index into a loaded vector; only the function label is checked, so this is a does-not-crash test.
 160 entry:
 161   %ld = load <4 x i32>, ptr addrspace(1) %in ; plain (non-volatile) load, unlike @extract_undef_offset_sgpr
 162   %value = insertelement <4 x i32> %ld, i32 5, i32 undef ; constant 5 inserted at an undef index
 163   store <4 x i32> %value, ptr addrspace(1) %out
 164   ret void
 165 }
 166
 167 ; GCN-LABEL: {{^}}insert_w_offset:
 168 ; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
 169 ; MOVREL-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
 170 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
 171 ; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0
 172 ; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
 173 ; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
 174 ; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
 175 ; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000
 176 ; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000
 177
 178 ; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]]
 179 ; MOVREL: buffer_store_dwordx4 v[[[ELT0]]:[[ELT3]]]
 180 define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; Dynamic insert of 17.0 into a constant <16 x float> at index %in + 1; the whole vector is stored.
 181 entry:
 182   %add = add i32 %in, 1 ; constant +1 added to the kernel-argument index
 183   %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add ; 17.0 is the inserted element (0x41880000 in the checks above)
 184   store <16 x float> %ins, ptr addrspace(1) %out
 185   ret void
 186 }
 187
 188 ; GCN-LABEL: {{^}}insert_unsigned_base_plus_offset:
 189 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
 190 ; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
 191 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
 192 ; GCN-DAG: s_and_b32 [[BASE:s[0-9]+]], [[IN]], 0xffff
 193
 194 ; MOVREL: s_mov_b32 m0, [[BASE]]
 195 ; MOVREL: v_movreld_b32_e32 [[ELT1]], v{{[0-9]+}}
 196
 197 ; IDXMODE: s_set_gpr_idx_on [[BASE]], gpr_idx(DST)
 198 ; IDXMODE-NEXT: v_mov_b32_e32 [[ELT1]], v{{[0-9]+}}
 199 ; IDXMODE-NEXT: s_set_gpr_idx_off
 200 define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) { ; Dynamic insert where the index is a zero-extended i16 plus 1.
 201 entry:
 202   %base = zext i16 %in to i32 ; zero-extension: %base is in [0, 65535]
 203   %add = add i32 %base, 1 ; the checks above expect no separate add; the indexed move is based at the register holding 2.0 (lane 1)
 204   %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
 205   store <16 x float> %ins, ptr addrspace(1) %out
 206   ret void
 207 }
 208
 209 ; GCN-LABEL: {{^}}insert_signed_base_plus_offset:
 210 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
 211 ; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
 212 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
 213
 214 ; GCN-DAG: s_sext_i32_i16 [[BASE:s[0-9]+]], [[IN]]
 215 ; GCN-DAG: s_add_i32 [[BASE_PLUS_OFFSET:s[0-9]+]], [[BASE]], 1
 216
 217 ; MOVREL: s_mov_b32 m0, [[BASE_PLUS_OFFSET]]
 218 ; MOVREL: v_movreld_b32_e32 [[ELT0]], v{{[0-9]+}}
 219
 220 ; IDXMODE: s_set_gpr_idx_on [[BASE_PLUS_OFFSET]], gpr_idx(DST)
 221 ; IDXMODE-NEXT: v_mov_b32_e32 [[ELT0]], v{{[0-9]+}}
 222 ; IDXMODE-NEXT: s_set_gpr_idx_off
 223 define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) { ; Signed counterpart of @insert_unsigned_base_plus_offset: the index is a sign-extended i16 plus 1.
 224 entry:
 225   %base = sext i16 %in to i32 ; sign-extension: %base may be negative
 226   %add = add i32 %base, 1 ; the checks above expect this add to stay explicit, with the indexed move based at the register holding 1.0 (lane 0)
 227   %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
 228   store <16 x float> %ins, ptr addrspace(1) %out
 229   ret void
 230 }
 231
 232
 233 ; GCN-LABEL: {{^}}insert_wo_offset:
 234 ; GCN: s_load_dword [[IN:s[0-9]+]]
 235
 236 ; MOVREL: s_mov_b32 m0, [[IN]]
 237 ; MOVREL: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
 238
 239 ; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(DST)
 240 ; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}}
 241 ; IDXMODE-NEXT: s_set_gpr_idx_off
 242
 243 ; GCN: buffer_store_dwordx4 v[[[ELT0]]:
 244 define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; Dynamic insert of 17.0 into a constant <16 x float> using %in directly as the index (no added offset).
 245 entry:
 246   %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
 247   store <16 x float> %ins, ptr addrspace(1) %out
 248   ret void
 249 }
 250
 251 ; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
 252 ; The offset depends on the register that holds the first element of the vector.
 253 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
 254 ; MOVREL: v_movreld_b32_e32 v0, 16
 255
 256 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
 257 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
 258 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 16
 259 ; IDXMODE-NEXT: s_set_gpr_idx_off
 260 define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %offset) { ; Dynamic insert of 16 into a constant <16 x i32> at index %offset - 512. NOTE(review): %in is not referenced in the body.
 261 entry:
 262   %index = add i32 %offset, -512 ; negative constant offset applied to the index
 263   %value = insertelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 16, i32 %index ; lane i of the constant holds the value i; 16 is the inserted element
 264   store <16 x i32> %value, ptr addrspace(1) %out
 265   ret void
 266 }
 267
 268 ; The vector indexed into is originally loaded into an SGPR rather
 269 ; than built with a reg_sequence
 270
 271 ; GCN-LABEL: {{^}}insert_neg_offset_sgpr_loadreg:
 272 ; The offset depends on the register that holds the first element of the vector.
 273 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
 274 ; MOVREL: v_movreld_b32_e32 v0, 5
 275
 276 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
 277 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
 278 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
 279 ; IDXMODE-NEXT: s_set_gpr_idx_off
 280 define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ptr addrspace(1) %out, <16 x i32> %vec, i32 %offset) { ; Like @insert_neg_offset_sgpr, but the vector is the kernel argument %vec instead of a constant. NOTE(review): %in is not referenced in the body.
 281 entry:
 282   %index = add i32 %offset, -512 ; negative constant offset applied to the index
 283   %value = insertelement <16 x i32> %vec, i32 5, i32 %index ; 5 is the inserted element
 284   store <16 x i32> %value, ptr addrspace(1) %out
 285   ret void
 286 }
 287
 288 ; GCN-LABEL: {{^}}insert_neg_offset_vgpr:
 289 ; The offset depends on the register that holds the first element of the vector.
 290
 291 ; GCN: v_cmp_eq_u32_e32
 292 ; GCN-COUNT-16: v_cndmask_b32
 293 ; GCN-COUNT-4:  buffer_store_dwordx4
 294 define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; Negative-offset dynamic insert where the index is derived from the work-item id. NOTE(review): %in is not referenced in the body.
 295 entry:
 296   %id = call i32 @llvm.amdgcn.workitem.id.x() #1 ; index source: work-item id in x
 297   %index = add i32 %id, -512 ; negative constant offset applied to the index
 298   %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index ; 33 is the inserted element
 299   store <16 x i32> %value, ptr addrspace(1) %out
 300   ret void
 301 }
 302
 303 ; GCN-LABEL: {{^}}insert_neg_inline_offset_vgpr:
 304
 305 ; GCN: v_cmp_eq_u32_e32
 306 ; GCN-COUNT-16: v_cndmask_b32
 307 ; GCN-COUNT-4:  buffer_store_dwordx4
 308 define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; Variant of @insert_neg_offset_vgpr with offset -16 instead of -512. NOTE(review): %in is not referenced in the body.
 309 entry:
 310   %id = call i32 @llvm.amdgcn.workitem.id.x() #1 ; index source: work-item id in x
 311   %index = add i32 %id, -16 ; presumably chosen because -16 fits an inline immediate (per the test name) - confirm
 312   %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index ; 500 is the inserted element
 313   store <16 x i32> %value, ptr addrspace(1) %out
 314   ret void
 315 }
 316
 317 ; When the block is split to insert the loop, make sure any other
 318 ; places that need to be expanded in the same block are also handled.
 319
 320 ; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
 321
 322 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
 323 ; GCN: v_cmp_eq_u32
 324 ; GCN: v_cndmask_b32_e64 [[RESULT0:v[0-9]+]], 16,
 325 ; GCN: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 16,
 326
 327 ; GCN: buffer_store_dword [[RESULT0]]
 328 ; GCN: buffer_store_dword [[RESULT1]]
 329 define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 { ; Two dynamic extracts with loaded (per-work-item) indices in one basic block, with an inline-asm def between them.
 330 entry:
 331   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
 332   %id.ext = zext i32 %id to i64
 333   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext ; address of element %id of %in
 334   %idx0 = load volatile i32, ptr addrspace(1) %gep ; first index is loaded from memory per work-item
 335   %idx1 = add i32 %idx0, 1 ; second index = first index + 1
 336   %val0 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx0
 337   %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" () ; asm result is pinned to s4 and is only used in bb1, so it stays live across the second extract
 338   %val1 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx1 ; same constant vector as %val0
 339   store volatile i32 %val0, ptr addrspace(1) %out0
 340   store volatile i32 %val1, ptr addrspace(1) %out0 ; NOTE(review): both stores target %out0 and %out1 is never referenced - confirm this is intended
 341   %cmp = icmp eq i32 %id, 0
 342   br i1 %cmp, label %bb1, label %bb2
 343
 344 bb1:
 345   store volatile i32 %live.out.reg, ptr addrspace(1) undef ; only use of the asm result
 346   br label %bb2
 347
 348 bb2:
 349   ret void
 350 }
 351
 352 ; Moved subtest for insert_vgpr_offset_multiple_in_block to separate file to
 353 ; avoid very different schedule induced issues with gfx9.
 354 ; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
 355
 356
 357 ; GCN-LABEL: {{^}}insert_adjacent_blocks:
 358 define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 { ; Two sibling blocks each perform an undef-index insert and are merged by a phi; only the function label is checked.
 359 bb:
 360   %tmp = icmp eq i32 %arg, 0
 361   br i1 %tmp, label %bb1, label %bb4
 362
 363 bb1:                                              ; preds = %bb
 364   %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef
 365   %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
 366   call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
 367   br label %bb7
 368
 369 bb4:                                              ; preds = %bb
 370   %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef
 371   %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
 372   call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
 373   br label %bb7
 374
 375 bb7:                                              ; preds = %bb4, %bb1
 376   %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
 377   store volatile <4 x float> %tmp8, ptr addrspace(1) undef
 378   ret void
 379 }
 380
 381 ; FIXME: Should be able to fold zero input to movreld to inline imm?
 382
 383 ; GCN-LABEL: {{^}}multi_same_block:
 384
 385 ; GCN: s_load_dword [[ARG:s[0-9]+]]
 386
 387 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
 388 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
 389 ; MOVREL: s_waitcnt
 390 ; MOVREL: s_add_i32 m0, [[ARG]], -16
 391 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
 392 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
 393 ; MOVREL: s_mov_b32 m0, -1
 394
 395
 396 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
 397 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
 398 ; IDXMODE: s_waitcnt
 399 ; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16
 400 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0
 401 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0
 402 ; IDXMODE: s_set_gpr_idx_off
 403
 404 ; GCN: ds_write_b32
 405 ; GCN: ds_write_b32
 406 ; GCN: s_endpgm
 407 define amdgpu_kernel void @multi_same_block(i32 %arg) #0 { ; Two dynamic inserts into different <9 x float> constants in one block, both indexed by %arg - 16.
 408 bb:
 409   %tmp1 = add i32 %arg, -16
 410   %tmp2 = insertelement <9 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01, float 2.300000e+01, float 2.400000e+01, float 2.500000e+01>, float 4.000000e+00, i32 %tmp1 ; first insert: 4.0 into the constant 17.0..25.0
 411   %tmp3 = add i32 %arg, -16 ; same expression as %tmp1
 412   %tmp4 = insertelement <9 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000, float 0x40371999A0000000, float 0x40381999A0000000, float 0x40391999A0000000>, float -4.0, i32 %tmp3 ; second insert: -4.0 into a different constant vector
 413   %tmp5 = bitcast <9 x float> %tmp2 to <9 x i32>
 414   %tmp6 = extractelement <9 x i32> %tmp5, i32 1 ; fixed lane 1 of the first result
 415   %tmp7 = bitcast <9 x float> %tmp4 to <9 x i32>
 416   %tmp8 = extractelement <9 x i32> %tmp7, i32 5 ; fixed lane 5 of the second result
 417   store volatile i32 %tmp6, ptr addrspace(3) undef, align 4
 418   store volatile i32 %tmp8, ptr addrspace(3) undef, align 4
 419   ret void
 420 }
 421
 422 ; Offset puts the access outside of superregister boundaries, so clamp to 1st element.
 423 ; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
 424 ; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]
 425 ; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]]
 426 ; GCN-DAG: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15
 427
 428 ; MOVREL: s_mov_b32 m0, [[IDX]]
 429 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
 430
 431 ; IDXMODE: s_set_gpr_idx_on [[IDX]], gpr_idx(SRC0)
 432 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
 433 ; IDXMODE: s_set_gpr_idx_off
 434
 435 ; GCN: buffer_store_dword [[EXTRACT]]
 436 define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { ; Dynamic extract from a loaded <16 x i32> at index %idx + 15.
 437 entry:
 438   %ld = load volatile <16 x i32>, ptr addrspace(1) %in
 439   %offset = add i32 %idx, 15 ; 15 is the last lane of the 16-element vector
 440   %value = extractelement <16 x i32> %ld, i32 %offset
 441   store i32 %value, ptr addrspace(1) %out
 442   ret void
 443 }
 444
 445 ; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
 446 ; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]]
 447 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
 448 ; GCN: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16
 449
 450 ; MOVREL: s_mov_b32 m0, [[ADD_IDX]]
 451 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
 452
 453 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0)
 454 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
 455 ; IDXMODE: s_set_gpr_idx_off
 456
 457 ; GCN: buffer_store_dword [[EXTRACT]]
 458 define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { ; Dynamic extract from a loaded <16 x i32> at index %idx + 16.
 459 entry:
 460   %ld = load volatile <16 x i32>, ptr addrspace(1) %in
 461   %offset = add i32 %idx, 16 ; 16 is one past the last lane of the 16-element vector
 462   %value = extractelement <16 x i32> %ld, i32 %offset
 463   store i32 %value, ptr addrspace(1) %out
 464   ret void
 465 }
 466
 467 ; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
 468 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
 469 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
 470
 471 ; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
 472 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 473
 474 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(SRC0)
 475 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 476 ; IDXMODE: s_set_gpr_idx_off
 477 define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx.in) { ; Dynamic extract whose index is formed with shl + or rather than add.
 478 entry:
 479   %ld = load volatile <16 x i32>, ptr addrspace(1) %in
 480   %idx.shl = shl i32 %idx.in, 2 ; low two bits are zero after the shift
 481   %idx = or i32 %idx.shl, 1 ; OR with 1 therefore equals +1; the checks above index with the shifted value directly
 482   %value = extractelement <16 x i32> %ld, i32 %idx
 483   store i32 %value, ptr addrspace(1) %out
 484   ret void
 485 }
 486
 487 ; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
 488 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
 489 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
 490
 491 ; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
 492 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 493
 494 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(DST)
 495 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 496 ; IDXMODE: s_set_gpr_idx_off
 497 define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind { ; Dynamic insert of 5.0 into argument vector %a with a shl + or index.
 498   %idx.shl = shl i32 %idx.in, 2 ; low two bits are zero after the shift
 499   %idx = or i32 %idx.shl, 1 ; OR with 1 therefore equals +1; the checks above index with the shifted value directly
 500   %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
 501   store <16 x float> %vecins, ptr addrspace(1) %out, align 64
 502   ret void
 503 }
 504
 505 ; GCN-LABEL: {{^}}broken_phi_bb:
 506 ; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8
 507
 508 ; GCN: {{.LBB[0-9]+_[0-9]+}}:
 509 ; GCN: [[BB2:.LBB[0-9]+_[0-9]+]]:
 510 ; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
 511 ; GCN: buffer_load_dword
 512
 513 ; GCN: [[REGLOOP:.LBB[0-9]+_[0-9]+]]:
 514 ; MOVREL: v_movreld_b32_e32
 515
 516 ; IDXMODE: s_set_gpr_idx_on
 517 ; IDXMODE: v_mov_b32_e32
 518 ; IDXMODE: s_set_gpr_idx_off
 519
 520 ; GCN: s_cbranch_execnz [[REGLOOP]]
 521
 522 ; GCN: {{^; %bb.[0-9]}}:
 523 ; GCN: s_mov_b64 exec,
 524 ; GCN: s_cbranch_execnz [[BB2]]
 525
 526 define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 { ; Dynamic inserts with a loaded index inside a loop whose phi is fed by the insert result.
 527 bb:
 528   br label %bb2
 529
 530 bb2:                                              ; preds = %bb4, %bb
 531   %tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ]
 532   %tmp3 = icmp slt i32 %tmp, %arg
 533   br i1 %tmp3, label %bb4, label %bb8
 534
 535 bb4:                                              ; preds = %bb2
 536   %vgpr = load volatile i32, ptr addrspace(1) undef ; insert index is loaded from memory
 537   %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr ; undef value inserted into an undef vector
 538   %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr ; second insert at the same index, now with %arg1
 539   %tmp7 = extractelement <16 x i32> %tmp6, i32 0 ; fixed lane 0 feeds the loop phi %tmp
 540   br label %bb2
 541
 542 bb8:                                              ; preds = %bb2
 543   ret void
 544 }
 545
 546 declare i32 @llvm.amdgcn.workitem.id.x() #1 ; supplies the per-work-item index in the *_vgpr tests
 547 declare void @llvm.amdgcn.s.barrier() #2 ; NOTE(review): no call to this intrinsic is visible in this file - possibly left over from the subtest moved to the pregfx9 file; confirm before removing
 548
 549 attributes #0 = { nounwind }
 550 attributes #1 = { nounwind readnone }
 551 attributes #2 = { nounwind convergent }