1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s
6 ; Tests for indirect addressing on SI, which is implemented using dynamic
; register indexing of vectors (M0/movrel on older targets, GPR index mode
; on targets with s_set_gpr_idx_*).
9 ; GCN-LABEL: {{^}}extract_w_offset:
10 ; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
11 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
12 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
13 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
14 ; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
15 ; GCN-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
17 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
18 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
20 ; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
21 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
22 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Dynamic extract from a constant <16 x float> vector with an index derived
; from the kernel argument. NOTE(review): the %idx computation is not visible
; in this chunk — presumably %in + 1, matching the s_add_i32 check above.
23 define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
26 %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %idx
27 store float %elt, ptr addrspace(1) %out
31 ; XXX: Could do v_or_b32 directly
32 ; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector:
37 ; MOVREL: s_mov_b32 m0
38 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
39 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
40 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
41 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
44 ; MOVREL: v_movrels_b32_e32
46 ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(SRC0){{$}}
47 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
48 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Same as extract_w_offset, but the indexed vector is produced by an SALU-side
; `or` of a kernel-argument vector, so the elements must be copied into VGPRs
; before the indexed read (the v_mov_b32 checks above).
; NOTE(review): the %idx computation is not visible in this chunk.
49 define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) {
52 %vec = or <16 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
53 %elt = extractelement <16 x i32> %vec, i32 %idx
54 store i32 %elt, ptr addrspace(1) %out
58 ; GCN-LABEL: {{^}}extract_wo_offset:
59 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
60 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
61 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
62 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
63 ; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
65 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
66 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
68 ; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
69 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
70 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Dynamic extract using the kernel argument directly as the index (no
; added offset), so no s_add_i32 is expected before the indexed read.
71 define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) {
73 %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %in
74 store float %elt, ptr addrspace(1) %out
78 ; GCN-LABEL: {{^}}extract_neg_offset_sgpr:
79 ; The offset depends on the register that holds the first element of the vector.
80 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
81 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
83 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
84 ; IDXMODE: v_mov_b32_e32 v14, 15
85 ; IDXMODE: v_mov_b32_e32 v15, 16
86 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
87 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
88 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Dynamic extract with a large negative constant offset (-512) folded into
; the SGPR index; checks the 0xfe00 addend in the m0 / gpr-idx setup above.
89 define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %offset) {
91 %index = add i32 %offset, -512
92 %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
93 store i32 %value, ptr addrspace(1) %out
97 ; GCN-LABEL: {{^}}extract_neg_offset_sgpr_loaded:
98 ; The offset depends on the register that holds the first element of the vector.
99 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
100 ; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
102 ; IDXMODE-DAG: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
103 ; IDXMODE-DAG: v_mov_b32_e32 v0,
104 ; IDXMODE: v_mov_b32_e32 v1,
105 ; IDXMODE: v_mov_b32_e32 v2,
106 ; IDXMODE: v_mov_b32_e32 v3,
107 ; IDXMODE: v_mov_b32_e32 v4,
108 ; IDXMODE: v_mov_b32_e32 v5,
109 ; IDXMODE: v_mov_b32_e32 v6,
110 ; IDXMODE: v_mov_b32_e32 v7,
111 ; IDXMODE: v_mov_b32_e32 v8,
112 ; IDXMODE: v_mov_b32_e32 v9,
113 ; IDXMODE: v_mov_b32_e32 v10,
114 ; IDXMODE: v_mov_b32_e32 v11,
115 ; IDXMODE: v_mov_b32_e32 v12,
116 ; IDXMODE: v_mov_b32_e32 v13,
117 ; IDXMODE: v_mov_b32_e32 v14,
118 ; IDXMODE: v_mov_b32_e32 v15,
119 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
120 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
121 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Like extract_neg_offset_sgpr, but the indexed vector is computed (or of two
; argument vectors) rather than constant, forcing per-element v_mov_b32 copies
; v0..v15 before the indexed read (the checks above).
122 define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
124 %index = add i32 %offset, -512
125 %or = or <16 x i32> %vec0, %vec1
126 %value = extractelement <16 x i32> %or, i32 %index
127 store i32 %value, ptr addrspace(1) %out
131 ; GCN-LABEL: {{^}}extract_neg_offset_vgpr:
132 ; The offset depends on the register that holds the first element of the vector.
134 ; GCN: v_cmp_eq_u32_e32
135 ; GCN-COUNT-14: v_cndmask_b32
136 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 16
137 ; GCN: buffer_store_dword [[RESULT]]
; Divergent (VGPR) index with a negative offset: expected to lower to a
; compare/cndmask select chain instead of movrel/gpr-idx, per the checks above.
138 define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
140 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
141 %index = add i32 %id, -512
142 %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
143 store i32 %value, ptr addrspace(1) %out
147 ; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
148 ; undefined behavior, but shouldn't crash compiler
; Extract with an undef index: UB at the IR level, but the compiler must not
; crash. No output checks — compile-only coverage.
149 define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
151 %ld = load volatile <4 x i32>, ptr addrspace(1) %in
152 %value = extractelement <4 x i32> %ld, i32 undef
153 store i32 %value, ptr addrspace(1) %out
157 ; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
158 ; undefined behavior, but shouldn't crash compiler
; Insert with an undef index into a loaded vector: UB, but must not crash
; the compiler. No output checks — compile-only coverage.
159 define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(ptr addrspace(1) %out, ptr addrspace(1) %in) {
161 %ld = load <4 x i32>, ptr addrspace(1) %in
162 %value = insertelement <4 x i32> %ld, i32 5, i32 undef
163 store <4 x i32> %value, ptr addrspace(1) %out
167 ; GCN-LABEL: {{^}}insert_w_offset:
168 ; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
169 ; MOVREL-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
170 ; MOVREL-DAG: s_mov_b32 m0, [[IN]]
171 ; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0
172 ; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
173 ; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
174 ; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
175 ; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000
176 ; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000
178 ; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]]
179 ; MOVREL: buffer_store_dwordx4 v[[[ELT0]]:[[ELT3]]]
; Dynamic insert of 17.0 at index %in + 1 into a constant <16 x float>
; vector; the whole result vector is stored, so buffer_store_dwordx4 of the
; first quad is checked above.
180 define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
182 %add = add i32 %in, 1
183 %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
184 store <16 x float> %ins, ptr addrspace(1) %out
188 ; GCN-LABEL: {{^}}insert_unsigned_base_plus_offset:
189 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
190 ; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
191 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
192 ; GCN-DAG: s_and_b32 [[BASE:s[0-9]+]], [[IN]], 0xffff
194 ; MOVREL: s_mov_b32 m0, [[BASE]]
195 ; MOVREL: v_movreld_b32_e32 [[ELT1]], v{{[0-9]+}}
197 ; IDXMODE: s_set_gpr_idx_on [[BASE]], gpr_idx(DST)
198 ; IDXMODE-NEXT: v_mov_b32_e32 [[ELT1]], v{{[0-9]+}}
199 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Dynamic insert where the index base is a zero-extended i16 (+1 constant
; offset); checks that the zext lowers to an s_and_b32 with 0xffff above.
200 define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
202 %base = zext i16 %in to i32
203 %add = add i32 %base, 1
204 %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
205 store <16 x float> %ins, ptr addrspace(1) %out
209 ; GCN-LABEL: {{^}}insert_signed_base_plus_offset:
210 ; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
211 ; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
212 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
214 ; GCN-DAG: s_sext_i32_i16 [[BASE:s[0-9]+]], [[IN]]
215 ; GCN-DAG: s_add_i32 [[BASE_PLUS_OFFSET:s[0-9]+]], [[BASE]], 1
217 ; MOVREL: s_mov_b32 m0, [[BASE_PLUS_OFFSET]]
218 ; MOVREL: v_movreld_b32_e32 [[ELT0]], v{{[0-9]+}}
220 ; IDXMODE: s_set_gpr_idx_on [[BASE_PLUS_OFFSET]], gpr_idx(DST)
221 ; IDXMODE-NEXT: v_mov_b32_e32 [[ELT0]], v{{[0-9]+}}
222 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Dynamic insert where the index base is a sign-extended i16 (+1 constant
; offset); unlike the unsigned case, the +1 cannot be folded away, so an
; explicit s_sext_i32_i16 + s_add_i32 pair is checked above.
223 define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
225 %base = sext i16 %in to i32
226 %add = add i32 %base, 1
227 %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
228 store <16 x float> %ins, ptr addrspace(1) %out
233 ; GCN-LABEL: {{^}}insert_wo_offset:
234 ; GCN: s_load_dword [[IN:s[0-9]+]]
236 ; MOVREL: s_mov_b32 m0, [[IN]]
237 ; MOVREL: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
239 ; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(DST)
240 ; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}}
241 ; IDXMODE-NEXT: s_set_gpr_idx_off
243 ; GCN: buffer_store_dwordx4 v[[[ELT0]]:
; Dynamic insert using the kernel argument directly as the index (no added
; offset).
244 define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) {
246 %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
247 store <16 x float> %ins, ptr addrspace(1) %out
251 ; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
252 ; The offset depends on the register that holds the first element of the vector.
253 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
254 ; MOVREL: v_movreld_b32_e32 v0, 16
256 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
257 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
258 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 16
259 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Dynamic insert with a large negative constant offset (-512) in the SGPR
; index; checks the 0xfe00 addend in the m0 / gpr-idx setup above.
; Note: %in is unused; %out receives the whole result vector.
260 define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %offset) {
262 %index = add i32 %offset, -512
263 %value = insertelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 16, i32 %index
264 store <16 x i32> %value, ptr addrspace(1) %out
268 ; The vector indexed into is originally loaded into an SGPR rather
269 ; than built with a reg_sequence
271 ; GCN-LABEL: {{^}}insert_neg_offset_sgpr_loadreg:
272 ; The offset depends on the register that holds the first element of the vector.
273 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
274 ; MOVREL: v_movreld_b32_e32 v0, 5
276 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
277 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
278 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
279 ; IDXMODE-NEXT: s_set_gpr_idx_off
; Like insert_neg_offset_sgpr, but the vector comes in as a kernel argument
; (originally loaded into SGPRs) instead of being built from a reg_sequence
; of constants — see the comment block above.
280 define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ptr addrspace(1) %out, <16 x i32> %vec, i32 %offset) {
282 %index = add i32 %offset, -512
283 %value = insertelement <16 x i32> %vec, i32 5, i32 %index
284 store <16 x i32> %value, ptr addrspace(1) %out
288 ; GCN-LABEL: {{^}}insert_neg_offset_vgpr:
289 ; The offset depends on the register that holds the first element of the vector.
291 ; GCN: v_cmp_eq_u32_e32
292 ; GCN-COUNT-16: v_cndmask_b32
293 ; GCN-COUNT-4: buffer_store_dwordx4
; Divergent (VGPR) index insert with a negative offset: expected to lower to
; a compare/cndmask chain and four dwordx4 stores, per the checks above.
294 define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
296 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
297 %index = add i32 %id, -512
298 %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index
299 store <16 x i32> %value, ptr addrspace(1) %out
303 ; GCN-LABEL: {{^}}insert_neg_inline_offset_vgpr:
305 ; GCN: v_cmp_eq_u32_e32
306 ; GCN-COUNT-16: v_cndmask_b32
307 ; GCN-COUNT-4: buffer_store_dwordx4
; Same as insert_neg_offset_vgpr but with an inline-immediate-sized negative
; offset (-16) and a non-inline inserted value (500).
308 define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
310 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
311 %index = add i32 %id, -16
312 %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index
313 store <16 x i32> %value, ptr addrspace(1) %out
317 ; When the block is split to insert the loop, make sure any other
318 ; places that need to be expanded in the same block are also handled.
320 ; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
322 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
324 ; GCN: v_cndmask_b32_e64 [[RESULT0:v[0-9]+]], 16,
325 ; GCN: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 16,
327 ; GCN: buffer_store_dword [[RESULT0]]
328 ; GCN: buffer_store_dword [[RESULT1]]
; Two divergent-index extracts in the same block (index and index+1), with an
; inline-asm value live across both, exercising correct block splitting when
; the waterfall loop is inserted — see the comment block above.
; NOTE(review): %out1 is a parameter but both stores visible here go to %out0;
; the tail of this function (bb1/bb2 control flow) is not visible in this chunk.
329 define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 {
331 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
332 %id.ext = zext i32 %id to i64
333 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
334 %idx0 = load volatile i32, ptr addrspace(1) %gep
335 %idx1 = add i32 %idx0, 1
336 %val0 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx0
337 %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
338 %val1 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx1
339 store volatile i32 %val0, ptr addrspace(1) %out0
340 store volatile i32 %val1, ptr addrspace(1) %out0
341 %cmp = icmp eq i32 %id, 0
342 br i1 %cmp, label %bb1, label %bb2
345 store volatile i32 %live.out.reg, ptr addrspace(1) undef
352 ; Moved subtest for insert_vgpr_offset_multiple_in_block to a separate file to
353 ; avoid very different schedule-induced issues with gfx9.
354 ; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
357 ; GCN-LABEL: {{^}}insert_adjacent_blocks:
; Diamond CFG where both branch arms perform an undef-index insert; checks the
; expansion works in adjacent blocks. The inline asm uses keep the values live
; so the blocks are not optimized away.
; NOTE(review): the bb1/bb4 branch targets and terminators are not fully
; visible in this chunk.
358 define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
360 %tmp = icmp eq i32 %arg, 0
361 br i1 %tmp, label %bb1, label %bb4
364 %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef
365 %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
366 call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
370 %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef
371 %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
372 call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
375 bb7: ; preds = %bb4, %bb1
376 %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
377 store volatile <4 x float> %tmp8, ptr addrspace(1) undef
381 ; FIXME: Should be able to fold zero input to movreld to inline imm?
383 ; GCN-LABEL: {{^}}multi_same_block:
385 ; GCN: s_load_dword [[ARG:s[0-9]+]]
387 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
388 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
390 ; MOVREL: s_add_i32 m0, [[ARG]], -16
391 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
392 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
393 ; MOVREL: s_mov_b32 m0, -1
396 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
397 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
399 ; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16
400 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0
401 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0
402 ; IDXMODE: s_set_gpr_idx_off
; Two dynamic inserts with the same index (%arg - 16) into two different
; <9 x float> constant vectors in one block, followed by constant-index
; extracts; checks index setup is done once and both movreld/gpr-idx writes
; are emitted — see the checks above.
407 define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
409 %tmp1 = add i32 %arg, -16
410 %tmp2 = insertelement <9 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01, float 2.300000e+01, float 2.400000e+01, float 2.500000e+01>, float 4.000000e+00, i32 %tmp1
411 %tmp3 = add i32 %arg, -16
412 %tmp4 = insertelement <9 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000, float 0x40371999A0000000, float 0x40381999A0000000, float 0x40391999A0000000>, float -4.0, i32 %tmp3
413 %tmp5 = bitcast <9 x float> %tmp2 to <9 x i32>
414 %tmp6 = extractelement <9 x i32> %tmp5, i32 1
415 %tmp7 = bitcast <9 x float> %tmp4 to <9 x i32>
416 %tmp8 = extractelement <9 x i32> %tmp7, i32 5
417 store volatile i32 %tmp6, ptr addrspace(3) undef, align 4
418 store volatile i32 %tmp8, ptr addrspace(3) undef, align 4
422 ; The offset puts the index outside of the superregister boundaries, so clamp to the 1st element.
423 ; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
424 ; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]
425 ; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]]
426 ; GCN-DAG: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15
428 ; MOVREL: s_mov_b32 m0, [[IDX]]
429 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
431 ; IDXMODE: s_set_gpr_idx_on [[IDX]], gpr_idx(SRC0)
432 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
433 ; IDXMODE: s_set_gpr_idx_off
435 ; GCN: buffer_store_dword [[EXTRACT]]
; Extract from a loaded vector with the largest still-in-bounds constant
; offset (+15); the indexed read starts from the first loaded element.
436 define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
438 %ld = load volatile <16 x i32>, ptr addrspace(1) %in
439 %offset = add i32 %idx, 15
440 %value = extractelement <16 x i32> %ld, i32 %offset
441 store i32 %value, ptr addrspace(1) %out
445 ; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
446 ; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]]
447 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
448 ; GCN: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16
450 ; MOVREL: s_mov_b32 m0, [[ADD_IDX]]
451 ; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
453 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0)
454 ; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
455 ; IDXMODE: s_set_gpr_idx_off
457 ; GCN: buffer_store_dword [[EXTRACT]]
; Extract with a constant offset (+16) that can push the index past the end
; of the <16 x i32> vector; the add is still emitted and the indexed read is
; checked above.
458 define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
460 %ld = load volatile <16 x i32>, ptr addrspace(1) %in
461 %offset = add i32 %idx, 16
462 %value = extractelement <16 x i32> %ld, i32 %offset
463 store i32 %value, ptr addrspace(1) %out
467 ; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
468 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
469 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
471 ; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
472 ; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
474 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(SRC0)
475 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
476 ; IDXMODE: s_set_gpr_idx_off
; Index formed as (%idx.in << 2) | 1; because the low bits are known zero
; after the shift, the `or` can fold into the index computation — only the
; s_lshl_b32 is checked above.
477 define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx.in) {
479 %ld = load volatile <16 x i32>, ptr addrspace(1) %in
480 %idx.shl = shl i32 %idx.in, 2
481 %idx = or i32 %idx.shl, 1
482 %value = extractelement <16 x i32> %ld, i32 %idx
483 store i32 %value, ptr addrspace(1) %out
487 ; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
488 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
489 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
491 ; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
492 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
494 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(DST)
495 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
496 ; IDXMODE: s_set_gpr_idx_off
; Insert counterpart of extractelement_v16i32_or_index: the shifted-or index
; feeds a dynamic insert (gpr_idx DST / movreld) instead of an extract.
497 define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind {
498 %idx.shl = shl i32 %idx.in, 2
499 %idx = or i32 %idx.shl, 1
500 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
501 store <16 x float> %vecins, ptr addrspace(1) %out, align 64
505 ; GCN-LABEL: {{^}}broken_phi_bb:
506 ; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8
508 ; GCN: {{.LBB[0-9]+_[0-9]+}}:
509 ; GCN: [[BB2:.LBB[0-9]+_[0-9]+]]:
510 ; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
511 ; GCN: buffer_load_dword
513 ; GCN: [[REGLOOP:.LBB[0-9]+_[0-9]+]]:
514 ; MOVREL: v_movreld_b32_e32
516 ; IDXMODE: s_set_gpr_idx_on
517 ; IDXMODE: v_mov_b32_e32
518 ; IDXMODE: s_set_gpr_idx_off
520 ; GCN: s_cbranch_execnz [[REGLOOP]]
522 ; GCN: {{^; %bb.[0-9]}}:
523 ; GCN: s_mov_b64 exec,
524 ; GCN: s_cbranch_execnz [[BB2]]
; Loop with a phi feeding divergent-index inserts; regression test for the
; indirect-addressing waterfall-loop expansion interacting with phis
; (see the control-flow checks above).
; NOTE(review): the entry block, bb4/bb8 labels, and terminators are not
; fully visible in this chunk.
526 define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
530 bb2: ; preds = %bb4, %bb
531 %tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ]
532 %tmp3 = icmp slt i32 %tmp, %arg
533 br i1 %tmp3, label %bb4, label %bb8
536 %vgpr = load volatile i32, ptr addrspace(1) undef
537 %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr
538 %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr
539 %tmp7 = extractelement <16 x i32> %tmp6, i32 0
546 declare i32 @llvm.amdgcn.workitem.id.x() #1
547 declare void @llvm.amdgcn.s.barrier() #2
549 attributes #0 = { nounwind }
550 attributes #1 = { nounwind readnone }
551 attributes #2 = { nounwind convergent }