1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s
6 ; Tests for indirect addressing on SI, which is implemented using dynamic
9 ; GCN-LABEL: {{^}}extract_w_offset:
10 ; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
11 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
12 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
13 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
14 ; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
15 ; GCN-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
17 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
18 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
20 ; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
21 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
22 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Dynamic extractelement from a constant <16 x float>. %idx is defined on a
; line elided from this chunk; the CHECK lines above expect it to be %in + 1
; (s_add_i32 [[IN]], [[IN0]], 1). The ret/closing brace are also elided.
23 define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
26 %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %idx
27 store float %elt, float addrspace(1)* %out
31 ; XXX: Could do v_or_b32 directly
32 ; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector:
37 ; MOVREL: s_mov_b32 m0
38 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
39 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
40 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
41 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
44 ; MOVREL: v_movrels_b32_e32
46 ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(SRC0){{$}}
47 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
48 ; IDXMODE-NEXT: s_set_gpr_idx_off
; OR an SGPR-loaded vector argument with constants, then dynamically extract
; one i32. %idx comes from a line elided in this view (presumably %in + 1,
; matching the labeled test name -- confirm against the full file).
49 define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <16 x i32> %or.val) {
52 %vec = or <16 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
53 %elt = extractelement <16 x i32> %vec, i32 %idx
54 store i32 %elt, i32 addrspace(1)* %out
58 ; GCN-LABEL: {{^}}extract_wo_offset:
59 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
60 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
61 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
62 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
63 ; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
65 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
66 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
68 ; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
69 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
70 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Same as extract_w_offset but the index %in is used directly, with no
; added offset (no s_add_i32 expected in the checks above).
71 define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
73 %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %in
74 store float %elt, float addrspace(1)* %out
78 ; GCN-LABEL: {{^}}extract_neg_offset_sgpr:
79 ; The offset depends on the register that holds the first element of the vector.
80 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
81 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
83 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
84 ; IDXMODE: v_mov_b32_e32 v14, 15
85 ; IDXMODE: v_mov_b32_e32 v15, 16
86 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
87 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
88 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Uniform (SGPR) index with a large negative bias: index = %offset - 512.
; The negative offset is folded into m0 / the idx-mode register per the
; CHECK lines above (0xfe00 == -512 as a 16-bit immediate).
89 define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
91 %index = add i32 %offset, -512
92 %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
93 store i32 %value, i32 addrspace(1)* %out
97 ; GCN-LABEL: {{^}}extract_neg_offset_sgpr_loaded:
98 ; The offset depends on the register that holds the first element of the vector.
99 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
100 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
102 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
103 ; IDXMODE: v_mov_b32_e32 v0,
104 ; IDXMODE: v_mov_b32_e32 v1,
105 ; IDXMODE: v_mov_b32_e32 v2,
106 ; IDXMODE: v_mov_b32_e32 v3,
107 ; IDXMODE: v_mov_b32_e32 v4,
108 ; IDXMODE: v_mov_b32_e32 v5,
109 ; IDXMODE: v_mov_b32_e32 v6,
110 ; IDXMODE: v_mov_b32_e32 v7,
111 ; IDXMODE: v_mov_b32_e32 v8,
112 ; IDXMODE: v_mov_b32_e32 v9,
113 ; IDXMODE: v_mov_b32_e32 v10,
114 ; IDXMODE: v_mov_b32_e32 v11,
115 ; IDXMODE: v_mov_b32_e32 v12,
116 ; IDXMODE: v_mov_b32_e32 v13,
117 ; IDXMODE: v_mov_b32_e32 v14,
118 ; IDXMODE: v_mov_b32_e32 v15,
119 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
120 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
121 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Like extract_neg_offset_sgpr, but the indexed vector is computed (or of two
; kernel-argument vectors) rather than a constant, so all 16 lanes must be
; materialized into v0..v15 before the indexed mov (checks above).
122 define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
124 %index = add i32 %offset, -512
125 %or = or <16 x i32> %vec0, %vec1
126 %value = extractelement <16 x i32> %or, i32 %index
127 store i32 %value, i32 addrspace(1)* %out
131 ; GCN-LABEL: {{^}}extract_neg_offset_vgpr:
132 ; The offset depends on the register that holds the first element of the vector.
134 ; FIXME: The waitcnt for the argument load can go after the loop
135 ; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
136 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
137 ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}}
138 ; GCN: s_and_saveexec_b64 vcc, vcc
140 ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0
141 ; MOVREL: v_movrels_b32_e32 [[RESULT:v[0-9]+]], v1
143 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00
144 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0)
145 ; IDXMODE: v_mov_b32_e32 [[RESULT:v[0-9]+]], v1
146 ; IDXMODE: s_set_gpr_idx_off
148 ; GCN: s_cbranch_execnz
150 ; GCN: buffer_store_dword [[RESULT]]
; Divergent (VGPR, per-workitem) index: the backend must emit a waterfall
; loop (v_readfirstlane / s_and_saveexec / s_cbranch_execnz in the checks
; above) to perform the indexed read one uniform index at a time.
151 define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
153 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
154 %index = add i32 %id, -512
155 %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
156 store i32 %value, i32 addrspace(1)* %out
160 ; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
161 ; undefined behavior, but shouldn't crash compiler
; extractelement with an undef index: the result is undefined, so no output
; is checked -- this only verifies the compiler does not crash.
162 define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
164 %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
165 %value = extractelement <4 x i32> %ld, i32 undef
166 store i32 %value, i32 addrspace(1)* %out
170 ; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
171 ; undefined behavior, but shouldn't crash compiler
; insertelement with an undef index: undefined behavior, compile-only
; no-crash test (no instruction checks).
172 define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
174 %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
175 %value = insertelement <4 x i32> %ld, i32 5, i32 undef
176 store <4 x i32> %value, <4 x i32> addrspace(1)* %out
180 ; GCN-LABEL: {{^}}insert_w_offset:
181 ; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
182 ; MOVREL-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
183 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
184 ; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0
185 ; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
186 ; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
187 ; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
188 ; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000
189 ; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000
191 ; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]]
192 ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
; Dynamic insertelement of 17.0 into a constant <16 x float> at index
; %in + 1; the whole result vector is stored, so the checks above verify
; buffer_store_dwordx4 of the updated register tuple.
193 define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) {
195 %add = add i32 %in, 1
196 %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
197 store <16 x float> %ins, <16 x float> addrspace(1)* %out
201 ; GCN-LABEL: {{^}}insert_unsigned_base_plus_offset:
202 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
203 ; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
204 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
205 ; GCN-DAG: s_and_b32 [[BASE:s[0-9]+]], [[IN]], 0xffff
207 ; MOVREL: s_mov_b32 m0, [[BASE]]
208 ; MOVREL: v_movreld_b32_e32 [[ELT1]], v{{[0-9]+}}
210 ; IDXMODE: s_set_gpr_idx_on [[BASE]], gpr_idx(DST)
211 ; IDXMODE-NEXT: v_mov_b32_e32 [[ELT1]], v{{[0-9]+}}
212 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Index is zext(i16) + 1: the zero-extension lets the constant +1 be folded
; into the indexed-write base, so only an s_and of the 16-bit base is
; expected (see the CHECK lines above).
213 define amdgpu_kernel void @insert_unsigned_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) {
215 %base = zext i16 %in to i32
216 %add = add i32 %base, 1
217 %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
218 store <16 x float> %ins, <16 x float> addrspace(1)* %out
222 ; GCN-LABEL: {{^}}insert_signed_base_plus_offset:
223 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
224 ; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
225 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
227 ; GCN-DAG: s_sext_i32_i16 [[BASE:s[0-9]+]], [[IN]]
228 ; GCN-DAG: s_add_i32 [[BASE_PLUS_OFFSET:s[0-9]+]], [[BASE]], 1
230 ; MOVREL: s_mov_b32 m0, [[BASE_PLUS_OFFSET]]
231 ; MOVREL: v_movreld_b32_e32 [[ELT0]], v{{[0-9]+}}
233 ; IDXMODE: s_set_gpr_idx_on [[BASE_PLUS_OFFSET]], gpr_idx(DST)
234 ; IDXMODE-NEXT: v_mov_b32_e32 [[ELT0]], v{{[0-9]+}}
235 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Index is sext(i16) + 1: unlike the zext variant above, the sign-extended
; base could be negative, so an explicit s_add_i32 of the +1 is expected
; before the indexed write (see the CHECK lines above).
236 define amdgpu_kernel void @insert_signed_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) {
238 %base = sext i16 %in to i32
239 %add = add i32 %base, 1
240 %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
241 store <16 x float> %ins, <16 x float> addrspace(1)* %out
246 ; GCN-LABEL: {{^}}insert_wo_offset:
247 ; GCN: s_load_dword [[IN:s[0-9]+]]
249 ; MOVREL: s_mov_b32 m0, [[IN]]
250 ; MOVREL: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
252 ; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(DST)
253 ; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}}
254 ; IDXMODE-NEXT: s_set_gpr_idx_off
256 ; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
; Dynamic insert at the raw index %in (no added offset); m0 / the gpr-idx
; register is loaded directly from the kernel argument.
257 define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
259 %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
260 store <16 x float> %ins, <16 x float> addrspace(1)* %out
264 ; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
265 ; The offset depends on the register that holds the first element of the vector.
266 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
267 ; MOVREL: v_movreld_b32_e32 v0, 16
269 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
270 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
271 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 16
272 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Uniform index with negative bias (-512) into a constant vector built with
; a reg_sequence; the inserted value 16 fits in an inline immediate
; (v_movreld_b32_e32 v0, 16 in the checks above).
273 define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, i32 %offset) {
275 %index = add i32 %offset, -512
276 %value = insertelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 16, i32 %index
277 store <16 x i32> %value, <16 x i32> addrspace(1)* %out
281 ; The vector indexed into is originally loaded into an SGPR rather
282 ; than built with a reg_sequence
284 ; GCN-LABEL: {{^}}insert_neg_offset_sgpr_loadreg:
285 ; The offset depends on the register that holds the first element of the vector.
286 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
287 ; MOVREL: v_movreld_b32_e32 v0, 5
289 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
290 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
291 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
292 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Same negative-bias insert, but the destination vector is a kernel argument
; originally living in SGPRs (see the comment above the label) rather than a
; reg_sequence-built constant.
293 define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) {
295 %index = add i32 %offset, -512
296 %value = insertelement <16 x i32> %vec, i32 5, i32 %index
297 store <16 x i32> %value, <16 x i32> addrspace(1)* %out
301 ; GCN-LABEL: {{^}}insert_neg_offset_vgpr:
302 ; The offset depends on the register that holds the first element of the vector.
304 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], 1{{$}}
305 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
306 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
307 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
308 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}}
309 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}}
310 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}}
311 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}}
312 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}}
313 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}}
314 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}}
315 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}}
316 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}}
317 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}}
318 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}}
319 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}}
321 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
322 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
323 ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
324 ; GCN: s_and_saveexec_b64 vcc, vcc
326 ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
327 ; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 33
329 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
330 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
331 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 33
332 ; IDXMODE: s_set_gpr_idx_off
334 ; GCN: s_cbranch_execnz [[LOOPBB]]
335 ; GCN: s_mov_b64 exec, [[SAVEEXEC]]
337 ; GCN: buffer_store_dword
; Divergent (per-workitem) index for an indexed write: requires a waterfall
; loop over unique lane indices (readfirstlane / saveexec loop checked
; above), restoring exec afterwards.
338 define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
340 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
341 %index = add i32 %id, -512
342 %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index
343 store <16 x i32> %value, <16 x i32> addrspace(1)* %out
347 ; GCN-LABEL: {{^}}insert_neg_inline_offset_vgpr:
349 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], 1{{$}}
350 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
351 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
352 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
353 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 5{{$}}
354 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 6{{$}}
355 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 7{{$}}
356 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 8{{$}}
357 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 9{{$}}
358 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 10{{$}}
359 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 11{{$}}
360 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 12{{$}}
361 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 13{{$}}
362 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 14{{$}}
363 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 15{{$}}
364 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 16{{$}}
365 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}
367 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
369 ; The offset depends on the register that holds the first element of the vector.
370 ; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
372 ; MOVREL: s_add_i32 m0, [[READLANE]], -16
373 ; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], [[VAL]]
375 ; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[READLANE]], -16
376 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
377 ; IDXMODE: v_mov_b32_e32 [[VEC_ELT0]], [[VAL]]
378 ; IDXMODE: s_set_gpr_idx_off
380 ; GCN: s_cbranch_execnz
; Divergent index with a small negative bias (-16) that fits an inline
; immediate, and an inserted value (500 = 0x1f4) that does not -- hence the
; v_mov of 0x1f4 and the s_add_i32 ..., -16 in the checks above.
381 define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
383 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
384 %index = add i32 %id, -16
385 %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index
386 store <16 x i32> %value, <16 x i32> addrspace(1)* %out
390 ; When the block is split to insert the loop, make sure any other
391 ; places that need to be expanded in the same block are also handled.
393 ; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
395 ; FIXME: Why is vector copied in between?
397 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
398 ; GCN-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
399 ; GCN-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
400 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
401 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
403 ; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
405 ; GCN: s_waitcnt vmcnt(0)
406 ; PREGFX9: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]]
407 ; GFX9: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], 1, [[IDX0]]
410 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
411 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
412 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
413 ; GCN: s_and_saveexec_b64 vcc, vcc
415 ; MOVREL: s_mov_b32 m0, [[READLANE]]
416 ; MOVREL: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
418 ; IDXMODE: s_set_gpr_idx_on [[READLANE]], gpr_idx(SRC0)
419 ; IDXMODE: v_mov_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
420 ; IDXMODE: s_set_gpr_idx_off
422 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
423 ; GCN-NEXT: s_cbranch_execnz [[LOOP0]]
425 ; FIXME: Redundant copy
426 ; GCN: s_mov_b64 exec, [[MASK]]
428 ; GCN: v_mov_b32_e32 [[VEC_ELT0_2:v[0-9]+]], [[S_ELT0]]
430 ; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec
432 ; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
433 ; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX1]]
434 ; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX1]]
435 ; GCN: s_and_saveexec_b64 vcc, vcc
437 ; MOVREL: s_mov_b32 m0, [[READLANE]]
438 ; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT0_2]]
440 ; IDXMODE: s_set_gpr_idx_on [[READLANE]], gpr_idx(SRC0)
441 ; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT0_2]]
442 ; IDXMODE: s_set_gpr_idx_off
444 ; GCN-NEXT: s_xor_b64 exec, exec, vcc
445 ; GCN: s_cbranch_execnz [[LOOP1]]
447 ; GCN: buffer_store_dword [[MOVREL0]]
448 ; GCN: buffer_store_dword [[MOVREL1]]
; Two divergent-index extracts in one block (indices %idx0 and %idx0+1),
; each requiring its own waterfall loop, plus an inline-asm value pinned to
; s4 that must stay live across both loops. Exercises correct block
; splitting when the indirect-index loop is inserted mid-block. The
; bb1/bb2 labels and the function's closing lines are elided from this
; chunk; line 465 below is the store inside bb1.
449 define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
451 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
452 %id.ext = zext i32 %id to i64
453 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
454 %idx0 = load volatile i32, i32 addrspace(1)* %gep
455 %idx1 = add i32 %idx0, 1
456 %val0 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx0
457 %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
458 %val1 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx1
459 store volatile i32 %val0, i32 addrspace(1)* %out0
460 store volatile i32 %val1, i32 addrspace(1)* %out0
461 %cmp = icmp eq i32 %id, 0
462 br i1 %cmp, label %bb1, label %bb2
465 store volatile i32 %live.out.reg, i32 addrspace(1)* undef
472 ; Moved subtest for insert_vgpr_offset_multiple_in_block to separate file to
473 ; avoid very different schedule-induced issues with gfx9.
474 ; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
477 ; GCN-LABEL: {{^}}insert_adjacent_blocks:
; Undef-index inserts in two adjacent blocks feeding a phi: compile-only
; test that block splitting for indirect addressing handles adjacent
; blocks. The bb1:/bb4: labels and terminators are elided from this chunk;
; the inline asm uses keep the blocks from being optimized away.
478 define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
480 %tmp = icmp eq i32 %arg, 0
481 br i1 %tmp, label %bb1, label %bb4
484 %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
485 %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
486 call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
490 %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
491 %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
492 call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
495 bb7: ; preds = %bb4, %bb1
496 %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
497 store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
501 ; FIXME: Should be able to fold zero input to movreld to inline imm?
503 ; GCN-LABEL: {{^}}multi_same_block:
505 ; GCN: s_load_dword [[ARG:s[0-9]+]]
507 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
509 ; MOVREL: s_add_i32 m0, [[ARG]], -16
510 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
511 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
512 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
513 ; MOVREL: s_mov_b32 m0, -1
516 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
518 ; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16
519 ; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
520 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0
521 ; IDXMODE: s_set_gpr_idx_off
522 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
523 ; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST)
524 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0
525 ; IDXMODE: s_set_gpr_idx_off
; Two dynamic inserts with the same index (%arg - 16) in one block: checks
; that m0 / gpr-idx setup is done once per insert without re-splitting the
; block incorrectly. Results are read back via bitcast + constant-index
; extracts and stored to LDS (addrspace(3)).
530 define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
532 %tmp1 = add i32 %arg, -16
533 %tmp2 = insertelement <9 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01, float 2.300000e+01, float 2.400000e+01, float 2.500000e+01>, float 4.000000e+00, i32 %tmp1
534 %tmp3 = add i32 %arg, -16
535 %tmp4 = insertelement <9 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000, float 0x40371999A0000000, float 0x40381999A0000000, float 0x40391999A0000000>, float -4.0, i32 %tmp3
536 %tmp5 = bitcast <9 x float> %tmp2 to <9 x i32>
537 %tmp6 = extractelement <9 x i32> %tmp5, i32 1
538 %tmp7 = bitcast <9 x float> %tmp4 to <9 x i32>
539 %tmp8 = extractelement <9 x i32> %tmp7, i32 5
540 store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
541 store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
545 ; offset puts outside of superregister boundaries, so clamp to 1st element.
546 ; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
547 ; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]
548 ; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]]
549 ; GCN-DAG: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15
551 ; MOVREL: s_mov_b32 m0, [[IDX]]
552 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
554 ; IDXMODE: s_set_gpr_idx_on [[IDX]], gpr_idx(SRC0)
555 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
556 ; IDXMODE: s_set_gpr_idx_off
558 ; GCN: buffer_store_dword [[EXTRACT]]
; Largest constant offset (+15) that can still be in bounds for a 16-element
; vector; the +15 is folded into the scalar index computation (checks above).
559 define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
561 %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in
562 %offset = add i32 %idx, 15
563 %value = extractelement <16 x i32> %ld, i32 %offset
564 store i32 %value, i32 addrspace(1)* %out
568 ; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
569 ; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
570 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
571 ; GCN: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16
573 ; MOVREL: s_mov_b32 m0, [[ADD_IDX]]
574 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
576 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0)
577 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
578 ; IDXMODE: s_set_gpr_idx_off
580 ; GCN: buffer_store_dword [[EXTRACT]]
; Offset +16 is always out of bounds for a <16 x i32>; the result is
; undefined, but codegen still emits the indexed read (checks above) --
; mainly a no-crash / sane-codegen test.
581 define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
583 %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in
584 %offset = add i32 %idx, 16
585 %value = extractelement <16 x i32> %ld, i32 %offset
586 store i32 %value, i32 addrspace(1)* %out
590 ; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
591 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
592 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
593 ; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1
595 ; MOVREL: s_mov_b32 m0, [[IDX_FIN]]
596 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
598 ; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], gpr_idx(SRC0)
599 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
600 ; IDXMODE: s_set_gpr_idx_off
; Index formed as (idx << 2) | 1: verifies the shl+or index expression is
; computed with scalar ops (s_lshl_b32 / s_or_b32) before the indexed read.
601 define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) {
603 %ld = load volatile <16 x i32>, <16 x i32> addrspace(1)* %in
604 %idx.shl = shl i32 %idx.in, 2
605 %idx = or i32 %idx.shl, 1
606 %value = extractelement <16 x i32> %ld, i32 %idx
607 store i32 %value, i32 addrspace(1)* %out
611 ; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
612 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
613 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
614 ; GCN: s_or_b32 [[IDX_FIN:s[0-9]+]], [[IDX_SHL]], 1
616 ; MOVREL: s_mov_b32 m0, [[IDX_FIN]]
617 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
619 ; IDXMODE: s_set_gpr_idx_on [[IDX_FIN]], gpr_idx(DST)
620 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
621 ; IDXMODE: s_set_gpr_idx_off
; Insert variant of the shl+or index test above: same (idx << 2) | 1 index
; computation feeding an indexed write (gpr_idx(DST) / v_movreld).
622 define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {
623 %idx.shl = shl i32 %idx.in, 2
624 %idx = or i32 %idx.shl, 1
625 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
626 store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
630 ; GCN-LABEL: {{^}}broken_phi_bb:
631 ; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8
633 ; GCN: [[BB2:BB[0-9]+_[0-9]+]]:
634 ; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
635 ; GCN: buffer_load_dword
637 ; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
638 ; MOVREL: v_movreld_b32_e32
640 ; IDXMODE: s_set_gpr_idx_on
641 ; IDXMODE: v_mov_b32_e32
642 ; IDXMODE: s_set_gpr_idx_off
644 ; GCN: s_cbranch_execnz [[REGLOOP]]
646 ; GCN: {{^; %bb.[0-9]}}:
647 ; GCN: s_mov_b64 exec,
648 ; GCN: s_branch [[BB2]]
; Loop whose phi input comes from a block containing divergent-index
; inserts: regression test that splitting bb4 for the indirect-index
; waterfall loop keeps the phi in bb2 valid. The bb:, bb4:, bb8: labels,
; terminators, and closing brace are elided from this chunk.
650 define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
654 bb2: ; preds = %bb4, %bb
655 %tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ]
656 %tmp3 = icmp slt i32 %tmp, %arg
657 br i1 %tmp3, label %bb4, label %bb8
660 %vgpr = load volatile i32, i32 addrspace(1)* undef
661 %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr
662 %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr
663 %tmp7 = extractelement <16 x i32> %tmp6, i32 0
670 declare i32 @llvm.amdgcn.workitem.id.x() #1
671 declare void @llvm.amdgcn.s.barrier() #2
673 attributes #0 = { nounwind }
674 attributes #1 = { nounwind readnone }
675 attributes #2 = { nounwind convergent }