1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2 ; RUN: opt -mtriple=amdgcn -mcpu=gfx906 -amdgpu-late-codegenprepare -S -o - %s | FileCheck --check-prefix=GFX906 %s
; <3 x i8> values loaded before the branch reach the store through a phi.
; The check lines expect each load to be widened to <4 x i8> with a
; shufflevector and bitcast to i32, the phi to be rebuilt on i32, and the
; result to be narrowed again (trunc to i24, then bitcast to <3 x i8>) right
; before the store.
4 define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
5 ; GFX906-LABEL: define amdgpu_kernel void @v3i8_liveout(
6 ; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
8 ; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
9 ; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
10 ; GFX906-NEXT: [[VEC1:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP1]], align 4
11 ; GFX906-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[VEC1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
12 ; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
13 ; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
14 ; GFX906-NEXT: [[VEC2:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP2]], align 4
15 ; GFX906-NEXT: [[TMP1:%.*]] = shufflevector <3 x i8> [[VEC2]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
16 ; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[TMP1]] to i32
17 ; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
18 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
20 ; GFX906-NEXT: br label [[BB_2]]
22 ; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
23 ; GFX906-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP5_TC]] to i24
24 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
25 ; GFX906-NEXT: store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
26 ; GFX906-NEXT: ret void
29 %idx = call i32 @llvm.amdgcn.workitem.id.x()
30 %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
31 %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
32 %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
33 %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
34 %cmp = icmp ult i32 %idx, 15
35 br i1 %cmp, label %bb.1, label %bb.2
40 %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
41 store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; <4 x i8> variant of the live-out phi pattern. The vector already fills 32
; bits, so the check lines expect a plain bitcast of each load to i32, an i32
; phi, and a bitcast back to <4 x i8> before the store (no shufflevector).
45 define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
46 ; GFX906-LABEL: define amdgpu_kernel void @v4i8_liveout(
47 ; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
49 ; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
50 ; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
51 ; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
52 ; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
53 ; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
54 ; GFX906-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
55 ; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
56 ; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
57 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
59 ; GFX906-NEXT: br label [[BB_2]]
61 ; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
62 ; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
63 ; GFX906-NEXT: store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
64 ; GFX906-NEXT: ret void
67 %idx = call i32 @llvm.amdgcn.workitem.id.x()
68 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
69 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
70 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
71 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
72 %cmp = icmp ult i32 %idx, 15
73 br i1 %cmp, label %bb.1, label %bb.2
78 %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
79 store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; <5 x i8> variant of the live-out phi pattern. The check lines expect each
; load to be widened to <8 x i8> with a shufflevector and bitcast to
; <2 x i32>, the phi to be rebuilt on <2 x i32>, and the result to be bitcast
; to <8 x i8> and shuffled down to elements 0-4 before the store.
83 define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
84 ; GFX906-LABEL: define amdgpu_kernel void @v5i8_liveout(
85 ; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
87 ; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
88 ; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
89 ; GFX906-NEXT: [[VEC1:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP1]], align 8
90 ; GFX906-NEXT: [[TMP0:%.*]] = shufflevector <5 x i8> [[VEC1]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
91 ; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
92 ; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
93 ; GFX906-NEXT: [[VEC2:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP2]], align 8
94 ; GFX906-NEXT: [[TMP1:%.*]] = shufflevector <5 x i8> [[VEC2]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
95 ; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
96 ; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
97 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
99 ; GFX906-NEXT: br label [[BB_2]]
101 ; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
102 ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
103 ; GFX906-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
104 ; GFX906-NEXT: store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
105 ; GFX906-NEXT: ret void
108 %idx = call i32 @llvm.amdgcn.workitem.id.x()
109 %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
110 %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
111 %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
112 %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
113 %cmp = icmp ult i32 %idx, 15
114 br i1 %cmp, label %bb.1, label %bb.2
119 %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
120 store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; <8 x i8> variant of the live-out phi pattern. The check lines expect a
; direct bitcast of each load to <2 x i32>, a <2 x i32> phi, and a bitcast
; back to <8 x i8> before the store (no shufflevector).
124 define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
125 ; GFX906-LABEL: define amdgpu_kernel void @v8i8_liveout(
126 ; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
127 ; GFX906-NEXT: entry:
128 ; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
129 ; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
130 ; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
131 ; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
132 ; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
133 ; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
134 ; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
135 ; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
136 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
138 ; GFX906-NEXT: br label [[BB_2]]
140 ; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
141 ; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
142 ; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
143 ; GFX906-NEXT: ret void
146 %idx = call i32 @llvm.amdgcn.workitem.id.x()
147 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
148 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
149 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
150 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
151 %cmp = icmp ult i32 %idx, 15
152 br i1 %cmp, label %bb.1, label %bb.2
157 %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
158 store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; The switch names %return.sink.split as the target of both case 1 and case 2,
; so the %tmp5 phi carries two identical [ %vec1, %entry ] entries. The check
; lines expect the rebuilt i32 phi to keep both duplicate entries, each using
; the same bitcast of %vec1.
162 define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
163 ; GFX906-LABEL: define amdgpu_kernel void @repeat_successor(
164 ; GFX906-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
165 ; GFX906-NEXT: entry:
166 ; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
167 ; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
168 ; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
169 ; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
170 ; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
171 ; GFX906-NEXT: [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
172 ; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
173 ; GFX906-NEXT: switch i32 [[IN]], label [[RETURN:%.*]] [
174 ; GFX906-NEXT: i32 1, label [[RETURN_SINK_SPLIT:%.*]]
175 ; GFX906-NEXT: i32 2, label [[RETURN_SINK_SPLIT]]
176 ; GFX906-NEXT: i32 3, label [[SW_BB5:%.*]]
179 ; GFX906-NEXT: br label [[RETURN_SINK_SPLIT]]
180 ; GFX906: return.sink.split:
181 ; GFX906-NEXT: [[TMP5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ]
182 ; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
183 ; GFX906-NEXT: store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
184 ; GFX906-NEXT: ret void
186 ; GFX906-NEXT: ret void
189 %idx = call i32 @llvm.amdgcn.workitem.id.x()
190 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
191 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
192 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
193 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
194 switch i32 %in, label %return [
195 i32 1, label %return.sink.split
196 i32 2, label %return.sink.split
201 br label %return.sink.split
204 %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
205 store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; Chained phis: %tmp7 takes %tmp5, which is itself a phi, as one of its
; incoming values. The check lines expect both phis to be rebuilt on
; <2 x i32>, with the second phi reading the first converted phi directly
; rather than its bitcast-back <8 x i8> copy. Each store gets its own bitcast
; back to <8 x i8>.
212 define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
213 ; GFX906-LABEL: define amdgpu_kernel void @v8i8_phi_chain(
214 ; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST0:%.*]], ptr addrspace(1) captures(none) [[DST1:%.*]]) #[[ATTR0]] {
215 ; GFX906-NEXT: entry:
216 ; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
217 ; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
218 ; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
219 ; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
220 ; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
221 ; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
222 ; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
223 ; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
224 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
226 ; GFX906-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
227 ; GFX906-NEXT: br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]]
229 ; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
230 ; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
231 ; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST0]], align 4
232 ; GFX906-NEXT: br label [[BB_3]]
234 ; GFX906-NEXT: [[TMP7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[TMP5_TC]], [[BB_2]] ]
235 ; GFX906-NEXT: [[TMP7_TC_BC:%.*]] = bitcast <2 x i32> [[TMP7_TC]] to <8 x i8>
236 ; GFX906-NEXT: store <8 x i8> [[TMP7_TC_BC]], ptr addrspace(1) [[DST1]], align 4
237 ; GFX906-NEXT: ret void
240 %idx = call i32 @llvm.amdgcn.workitem.id.x()
241 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
242 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
243 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
244 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
245 %cmp = icmp ult i32 %idx, 15
246 br i1 %cmp, label %bb.1, label %bb.2
248 %cmp2 = icmp ult i32 %idx, 7
249 br i1 %cmp2, label %bb.2, label %bb.3
252 %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
253 store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
257 %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
258 store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
; %vec1 is used twice outside the block that loads it: directly by a store to
; %dst0 and as an incoming value of the %tmp5 phi. %vec2 reaches that phi from
; two different predecessors. The check lines expect a single <2 x i32>
; bitcast per load, a bitcast of %vec1 back to <8 x i8> next to the direct
; store, and a <2 x i32> phi that is bitcast back before the store to %dst1.
262 define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
263 ; GFX906-LABEL: define amdgpu_kernel void @v8i8_multi_block(
264 ; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST0:%.*]], ptr addrspace(1) captures(none) [[DST1:%.*]]) #[[ATTR0]] {
265 ; GFX906-NEXT: entry:
266 ; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
267 ; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
268 ; GFX906-NEXT: [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
269 ; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
270 ; GFX906-NEXT: [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
271 ; GFX906-NEXT: [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
272 ; GFX906-NEXT: [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
273 ; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
274 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_3:%.*]]
276 ; GFX906-NEXT: [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
277 ; GFX906-NEXT: br i1 [[CMP2]], label [[BB_2:%.*]], label [[BB_3]]
279 ; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <2 x i32> [[VEC1_BC]] to <8 x i8>
280 ; GFX906-NEXT: store <8 x i8> [[VEC1_BC_BC]], ptr addrspace(1) [[DST0]], align 4
281 ; GFX906-NEXT: br label [[BB_3]]
283 ; GFX906-NEXT: [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
284 ; GFX906-NEXT: [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
285 ; GFX906-NEXT: store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST1]], align 4
286 ; GFX906-NEXT: ret void
289 %idx = call i32 @llvm.amdgcn.workitem.id.x()
290 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
291 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
292 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
293 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
294 %cmp = icmp ult i32 %idx, 15
295 br i1 %cmp, label %bb.1, label %bb.3
297 %cmp2 = icmp ult i32 %idx, 7
298 br i1 %cmp2, label %bb.2, label %bb.3
301 store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
305 %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
306 store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
; Self-referential phi: %temp receives %vec2 from %bb.1, and %vec2 is itself
; computed from %temp by a shufflevector. The check lines expect an i32 phi
; whose %bb.1 incoming value is the bitcast of the shuffle result, with
; bitcasts back to <4 x i8> feeding the shufflevector and the final store.
; NOTE(review): despite the function name and the <32 x i8> element type on
; the getelementptr, every loaded and carried value here is <4 x i8>.
310 define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
311 ; GFX906-LABEL: define amdgpu_kernel void @v32i8_loop_carried(
312 ; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
313 ; GFX906-NEXT: entry:
314 ; GFX906-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
315 ; GFX906-NEXT: [[GEP1:%.*]] = getelementptr <32 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
316 ; GFX906-NEXT: [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
317 ; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
318 ; GFX906-NEXT: br label [[BB_1:%.*]]
320 ; GFX906-NEXT: [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ]
321 ; GFX906-NEXT: [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8>
322 ; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8>
323 ; GFX906-NEXT: [[VEC2:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
324 ; GFX906-NEXT: [[VEC2_BC]] = bitcast <4 x i8> [[VEC2]] to i32
325 ; GFX906-NEXT: [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
326 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_1]], label [[BB_2:%.*]]
328 ; GFX906-NEXT: br label [[BB_2]]
330 ; GFX906-NEXT: [[VEC2_BC_BC:%.*]] = bitcast i32 [[VEC2_BC]] to <4 x i8>
331 ; GFX906-NEXT: store <4 x i8> [[VEC2_BC_BC]], ptr addrspace(1) [[DST]], align 4
332 ; GFX906-NEXT: ret void
335 %idx = call i32 @llvm.amdgcn.workitem.id.x()
336 %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
337 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
341 %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
342 %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
343 %cmp = icmp ult i32 %idx, 15
344 br i1 %cmp, label %bb.1, label %bb.2
348 store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
352 ; Should not produce a broken phi
; Non-kernel function built from a cycle of <4 x i8> phis fed by constants
; and by each other, plus one llvm.smax call. The check lines expect every
; phi to stay <4 x i8>: no bitcasts and no i32 phis appear in the output.
354 define void @broken_phi() {
355 ; GFX906-LABEL: define void @broken_phi(
356 ; GFX906-SAME: ) #[[ATTR0]] {
358 ; GFX906-NEXT: br label [[BB1:%.*]]
360 ; GFX906-NEXT: [[I:%.*]] = phi <4 x i8> [ splat (i8 1), [[BB:%.*]] ], [ [[I8:%.*]], [[BB7:%.*]] ]
361 ; GFX906-NEXT: br i1 false, label [[BB3:%.*]], label [[BB2:%.*]]
363 ; GFX906-NEXT: br label [[BB3]]
365 ; GFX906-NEXT: [[I4:%.*]] = phi <4 x i8> [ zeroinitializer, [[BB2]] ], [ [[I]], [[BB1]] ]
366 ; GFX906-NEXT: br i1 false, label [[BB7]], label [[BB5:%.*]]
368 ; GFX906-NEXT: [[I6:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[I4]], <4 x i8> zeroinitializer)
369 ; GFX906-NEXT: br label [[BB7]]
371 ; GFX906-NEXT: [[I8]] = phi <4 x i8> [ zeroinitializer, [[BB5]] ], [ zeroinitializer, [[BB3]] ]
372 ; GFX906-NEXT: br label [[BB1]]
377 %i = phi <4 x i8> [ <i8 1, i8 1, i8 1, i8 1>, %bb ], [ %i8, %bb7 ]
378 br i1 false, label %bb3, label %bb2
382 %i4 = phi <4 x i8> [ zeroinitializer, %bb2 ], [ %i, %bb1 ]
383 br i1 false, label %bb7, label %bb5
385 %i6 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %i4, <4 x i8> zeroinitializer)
388 %i8 = phi <4 x i8> [ zeroinitializer, %bb5 ], [ zeroinitializer, %bb3 ]
392 ; %sel1 should just use %sel0 instead of trying to convert back the
393 ; converted version of %sel0
; Expected shape per the check lines: %vec1 and %sel0 each get a <4 x i32>
; bitcast; %sel1 consumes the bitcast-back copy of %vec1 but the original
; <16 x i8> %sel0; and the extractelement of %sel0 in the later block reads a
; bitcast back from the <4 x i32> form.
395 define amdgpu_kernel void @reuseOp() {
396 ; GFX906-LABEL: define amdgpu_kernel void @reuseOp(
397 ; GFX906-SAME: ) #[[ATTR0]] {
398 ; GFX906-NEXT: entry:
399 ; GFX906-NEXT: [[VEC1:%.*]] = insertelement <16 x i8> zeroinitializer, i8 0, i64 0
400 ; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <16 x i8> [[VEC1]] to <4 x i32>
401 ; GFX906-NEXT: br label [[BB_1:%.*]]
403 ; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <4 x i32> [[VEC1_BC]] to <16 x i8>
404 ; GFX906-NEXT: [[SEL0:%.*]] = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
405 ; GFX906-NEXT: [[SEL0_BC:%.*]] = bitcast <16 x i8> [[SEL0]] to <4 x i32>
406 ; GFX906-NEXT: [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1_BC_BC]], <16 x i8> [[SEL0]]
407 ; GFX906-NEXT: br label [[BB_2:%.*]]
409 ; GFX906-NEXT: [[SEL0_BC_BC:%.*]] = bitcast <4 x i32> [[SEL0_BC]] to <16 x i8>
410 ; GFX906-NEXT: [[VAL:%.*]] = extractelement <16 x i8> [[SEL0_BC_BC]], i64 0
411 ; GFX906-NEXT: ret void
414 %vec1 = insertelement <16 x i8> zeroinitializer, i8 0, i64 0
418 %sel0 = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
419 %sel1 = select i1 false, <16 x i8> %vec1, <16 x i8> %sel0
423 %val = extractelement <16 x i8> %sel0, i64 0
; Chain of <10 x i8> phis (%phi1 through %phi6) in which each phi feeds the
; next, and %vec1, a shufflevector of %phi6, feeds %phi1 again from %bb.11.
; The check lines expect every phi, the insertelement and the shufflevector
; to stay <10 x i8>; no bitcasts are introduced.
; NOTE(review): the name suggests a regression test for a phi being deleted
; while the pass rewrites the chain - confirm against the originating change.
428 define amdgpu_kernel void @deletedPHI(i32 %in0, i1 %cmp, <10 x i8> %invec0) {
429 ; GFX906-LABEL: define amdgpu_kernel void @deletedPHI(
430 ; GFX906-SAME: i32 [[IN0:%.*]], i1 [[CMP:%.*]], <10 x i8> [[INVEC0:%.*]]) #[[ATTR0]] {
431 ; GFX906-NEXT: entry:
432 ; GFX906-NEXT: br label [[BB_1:%.*]]
434 ; GFX906-NEXT: [[PHI0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB_11:%.*]] ]
435 ; GFX906-NEXT: [[PHI1:%.*]] = phi <10 x i8> [ splat (i8 1), [[ENTRY]] ], [ [[VEC1:%.*]], [[BB_11]] ]
436 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]]
438 ; GFX906-NEXT: br label [[BB_3]]
440 ; GFX906-NEXT: [[PHI2:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI1]], [[BB_1]] ]
441 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]]
443 ; GFX906-NEXT: [[VEC0:%.*]] = insertelement <10 x i8> [[PHI2]], i8 0, i64 0
444 ; GFX906-NEXT: br label [[BB_5]]
446 ; GFX906-NEXT: [[PHI3:%.*]] = phi <10 x i8> [ [[VEC0]], [[BB_4]] ], [ [[PHI2]], [[BB_3]] ]
447 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]]
449 ; GFX906-NEXT: br label [[BB_7]]
451 ; GFX906-NEXT: [[PHI4:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_6]] ], [ [[PHI3]], [[BB_5]] ]
452 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_9:%.*]], label [[BB_8:%.*]]
454 ; GFX906-NEXT: br label [[BB_9]]
456 ; GFX906-NEXT: [[PHI5:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_8]] ], [ [[PHI4]], [[BB_7]] ]
457 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_11]], label [[BB_10:%.*]]
459 ; GFX906-NEXT: br label [[BB_11]]
461 ; GFX906-NEXT: [[PHI6:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_10]] ], [ [[PHI5]], [[BB_9]] ]
462 ; GFX906-NEXT: [[VEC1]] = shufflevector <10 x i8> [[PHI6]], <10 x i8> zeroinitializer, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 15, i32 16, i32 17, i32 18, i32 19>
463 ; GFX906-NEXT: br label [[BB_1]]
469 %phi0 = phi i32 [ 0, %entry ], [ 1, %bb.11 ]
470 %phi1 = phi <10 x i8> [ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %entry ], [ %vec1, %bb.11 ]
471 br i1 %cmp, label %bb.3, label %bb.2
477 %phi2 = phi <10 x i8> [ zeroinitializer, %bb.2 ], [ %phi1, %bb.1 ]
478 br i1 %cmp, label %bb.5, label %bb.4
481 %vec0 = insertelement <10 x i8> %phi2, i8 0, i64 0
484 bb.5: ; preds = %bb.4, %bb.3
485 %phi3 = phi <10 x i8> [ %vec0, %bb.4 ], [ %phi2, %bb.3 ]
486 br i1 %cmp, label %bb.7, label %bb.6
491 bb.7: ; preds = %bb.6, %bb.5
492 %phi4 = phi <10 x i8> [ %invec0, %bb.6 ], [ %phi3, %bb.5 ]
493 br i1 %cmp, label %bb.9, label %bb.8
499 %phi5 = phi <10 x i8> [ %invec0, %bb.8 ], [ %phi4, %bb.7 ]
500 br i1 %cmp, label %bb.11, label %bb.10
506 %phi6 = phi <10 x i8> [ zeroinitializer, %bb.10 ], [ %phi5, %bb.9 ]
507 %vec1 = shufflevector <10 x i8> %phi6, <10 x i8> zeroinitializer, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 15, i32 16, i32 17, i32 18, i32 19>
; Cycle of <10 x i8> phis: %phi0 feeds both %phi1 and %phi2, %phi1 feeds
; %phi2, %phi2 feeds %phi3, and %phi3 feeds %phi0 again from %bb.8. The check
; lines expect all four phis to stay <10 x i8> with no bitcasts introduced.
; NOTE(review): the name suggests this covers undoing a partially attempted
; conversion across several phis - confirm against the originating change.
511 define amdgpu_kernel void @multiple_unwind(i1 %cmp, <10 x i8> %invec) {
512 ; GFX906-LABEL: define amdgpu_kernel void @multiple_unwind(
513 ; GFX906-SAME: i1 [[CMP:%.*]], <10 x i8> [[INVEC:%.*]]) #[[ATTR0]] {
514 ; GFX906-NEXT: entry:
515 ; GFX906-NEXT: br label [[BB_1:%.*]]
517 ; GFX906-NEXT: [[PHI0:%.*]] = phi <10 x i8> [ splat (i8 1), [[ENTRY:%.*]] ], [ [[PHI3:%.*]], [[BB_8:%.*]] ]
518 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]]
520 ; GFX906-NEXT: br label [[BB_3]]
522 ; GFX906-NEXT: [[PHI1:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI0]], [[BB_1]] ]
523 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]]
525 ; GFX906-NEXT: br label [[BB_5]]
527 ; GFX906-NEXT: [[PHI2:%.*]] = phi <10 x i8> [ [[PHI0]], [[BB_4]] ], [ [[PHI1]], [[BB_3]] ]
528 ; GFX906-NEXT: br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]]
530 ; GFX906-NEXT: br label [[BB_7]]
532 ; GFX906-NEXT: [[PHI3]] = phi <10 x i8> [ [[INVEC]], [[BB_6]] ], [ [[PHI2]], [[BB_5]] ]
533 ; GFX906-NEXT: br label [[BB_8]]
535 ; GFX906-NEXT: br label [[BB_1]]
541 %phi0 = phi <10 x i8> [ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %entry ], [ %phi3, %bb.8 ]
542 br i1 %cmp, label %bb.3, label %bb.2
548 %phi1 = phi <10 x i8> [ zeroinitializer, %bb.2 ], [ %phi0, %bb.1 ]
549 br i1 %cmp, label %bb.5, label %bb.4
555 %phi2 = phi <10 x i8> [ %phi0, %bb.4 ], [ %phi1, %bb.3 ]
556 br i1 %cmp, label %bb.7, label %bb.6
558 bb.6: ; preds = %bb.5
562 %phi3 = phi <10 x i8> [ %invec, %bb.6 ], [ %phi2, %bb.5 ]
; Declaration of the intrinsic that the kernels in this file call to obtain
; %idx, the i32 used to index the source pointers and in the icmp conditions.
571 declare i32 @llvm.amdgcn.workitem.id.x()