1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1010 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1100 %s
5 ; Test that unused lanes in the s_xor result are masked out with v_cndmask.
; The add of zext(xor i1, true) below is expected to appear in the checks as s_xor_b32 ..., -1, then v_cndmask_b32 0, 1, then v_add_nc_u32.
7 define i32 @combine_add_zext_xor() {
8 ; GFX1010-LABEL: combine_add_zext_xor:
9 ; GFX1010: ; %bb.0: ; %.entry
10 ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0
12 ; GFX1010-NEXT: s_branch .LBB0_2
13 ; GFX1010-NEXT: .LBB0_1: ; %bb9
14 ; GFX1010-NEXT: ; in Loop: Header=BB0_2 Depth=1
15 ; GFX1010-NEXT: s_xor_b32 s4, s4, -1
16 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
17 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
18 ; GFX1010-NEXT: v_add_nc_u32_e32 v2, v1, v0
19 ; GFX1010-NEXT: v_mov_b32_e32 v1, v2
20 ; GFX1010-NEXT: s_cbranch_vccz .LBB0_4
21 ; GFX1010-NEXT: .LBB0_2: ; %.a
22 ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1
23 ; GFX1010-NEXT: ; implicit-def: $sgpr4
24 ; GFX1010-NEXT: s_cbranch_scc1 .LBB0_1
25 ; GFX1010-NEXT: ; %bb.3: ; %bb
26 ; GFX1010-NEXT: ; in Loop: Header=BB0_2 Depth=1
27 ; GFX1010-NEXT: buffer_load_dword v0, v1, s[4:7], 64 offen glc
28 ; GFX1010-NEXT: s_waitcnt vmcnt(0)
29 ; GFX1010-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
30 ; GFX1010-NEXT: s_branch .LBB0_1
31 ; GFX1010-NEXT: .LBB0_4: ; %.exit
32 ; GFX1010-NEXT: s_setpc_b64 s[30:31]
34 ; GFX1100-LABEL: combine_add_zext_xor:
35 ; GFX1100: ; %bb.0: ; %.entry
36 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0
38 ; GFX1100-NEXT: s_branch .LBB0_2
39 ; GFX1100-NEXT: .LBB0_1: ; %bb9
40 ; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1
41 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
42 ; GFX1100-NEXT: s_xor_b32 s0, s0, -1
43 ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
44 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
45 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
46 ; GFX1100-NEXT: v_add_nc_u32_e32 v2, v1, v0
47 ; GFX1100-NEXT: v_mov_b32_e32 v1, v2
48 ; GFX1100-NEXT: s_cbranch_vccz .LBB0_4
49 ; GFX1100-NEXT: .LBB0_2: ; %.a
50 ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
51 ; GFX1100-NEXT: ; implicit-def: $sgpr0
52 ; GFX1100-NEXT: s_cbranch_scc1 .LBB0_1
53 ; GFX1100-NEXT: ; %bb.3: ; %bb
54 ; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1
55 ; GFX1100-NEXT: buffer_load_b32 v0, v1, s[0:3], 64 offen glc
56 ; GFX1100-NEXT: s_waitcnt vmcnt(0)
57 ; GFX1100-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
58 ; GFX1100-NEXT: s_branch .LBB0_1
59 ; GFX1100-NEXT: .LBB0_4: ; %.exit
60 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
64 .a: ; preds = %bb9, %.entry
65 %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ] ; loop-carried accumulator, 0 on entry
66 br i1 undef, label %bb9, label %bb ; undef condition: %bb (the load) may or may not run
69 %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) ; shows up in the checks as a buffer load with %.2 as the offen VGPR, 64 as scalar offset, and glc
70 %i5 = icmp eq i32 %.i3, 0 ; i1: loaded value == 0
73 bb9: ; preds = %bb, %.a
74 %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ] ; undef when %bb was bypassed
75 %.2.0.in = xor i1 %.2.0.in.in, true ; logical NOT of the i1 (the s_xor_b32 with -1 in the checks)
76 %.2.0 = zext i1 %.2.0.in to i32 ; i1 -> 0/1 (the v_cndmask_b32 0, 1 in the checks)
77 %i11 = add i32 %.2, %.2.0 ; accumulate the 0/1 value (v_add_nc_u32 in the checks)
78 %i12 = icmp sgt i32 %.2, -1050 ; compares the pre-add value %.2; -1050 is the 0xfffffbe6 literal in the checks
79 br i1 %i12, label %.a, label %.exit ; back-edge while %i12 holds, otherwise leave the loop
85 ; Test that unused lanes in the s_xor result are masked out with v_cndmask.
; The sub of zext(xor i1, true) below is expected to appear in the checks as s_xor_b32 ..., -1, then v_cndmask_b32 0, 1, then v_sub_nc_u32.
87 define i32 @combine_sub_zext_xor() {
88 ; GFX1010-LABEL: combine_sub_zext_xor:
89 ; GFX1010: ; %bb.0: ; %.entry
90 ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
91 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0
92 ; GFX1010-NEXT: s_branch .LBB1_2
93 ; GFX1010-NEXT: .LBB1_1: ; %bb9
94 ; GFX1010-NEXT: ; in Loop: Header=BB1_2 Depth=1
95 ; GFX1010-NEXT: s_xor_b32 s4, s4, -1
96 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
97 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
98 ; GFX1010-NEXT: v_sub_nc_u32_e32 v2, v1, v0
99 ; GFX1010-NEXT: v_mov_b32_e32 v1, v2
100 ; GFX1010-NEXT: s_cbranch_vccz .LBB1_4
101 ; GFX1010-NEXT: .LBB1_2: ; %.a
102 ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1
103 ; GFX1010-NEXT: ; implicit-def: $sgpr4
104 ; GFX1010-NEXT: s_cbranch_scc1 .LBB1_1
105 ; GFX1010-NEXT: ; %bb.3: ; %bb
106 ; GFX1010-NEXT: ; in Loop: Header=BB1_2 Depth=1
107 ; GFX1010-NEXT: buffer_load_dword v0, v1, s[4:7], 64 offen glc
108 ; GFX1010-NEXT: s_waitcnt vmcnt(0)
109 ; GFX1010-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
110 ; GFX1010-NEXT: s_branch .LBB1_1
111 ; GFX1010-NEXT: .LBB1_4: ; %.exit
112 ; GFX1010-NEXT: s_setpc_b64 s[30:31]
114 ; GFX1100-LABEL: combine_sub_zext_xor:
115 ; GFX1100: ; %bb.0: ; %.entry
116 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0
118 ; GFX1100-NEXT: s_branch .LBB1_2
119 ; GFX1100-NEXT: .LBB1_1: ; %bb9
120 ; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1
121 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
122 ; GFX1100-NEXT: s_xor_b32 s0, s0, -1
123 ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
124 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
125 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
126 ; GFX1100-NEXT: v_sub_nc_u32_e32 v2, v1, v0
127 ; GFX1100-NEXT: v_mov_b32_e32 v1, v2
128 ; GFX1100-NEXT: s_cbranch_vccz .LBB1_4
129 ; GFX1100-NEXT: .LBB1_2: ; %.a
130 ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
131 ; GFX1100-NEXT: ; implicit-def: $sgpr0
132 ; GFX1100-NEXT: s_cbranch_scc1 .LBB1_1
133 ; GFX1100-NEXT: ; %bb.3: ; %bb
134 ; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1
135 ; GFX1100-NEXT: buffer_load_b32 v0, v1, s[0:3], 64 offen glc
136 ; GFX1100-NEXT: s_waitcnt vmcnt(0)
137 ; GFX1100-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
138 ; GFX1100-NEXT: s_branch .LBB1_1
139 ; GFX1100-NEXT: .LBB1_4: ; %.exit
140 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
144 .a: ; preds = %bb9, %.entry
145 %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ] ; loop-carried accumulator, 0 on entry
146 br i1 undef, label %bb9, label %bb ; undef condition: %bb (the load) may or may not run
149 %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) ; shows up in the checks as a buffer load with %.2 as the offen VGPR, 64 as scalar offset, and glc
150 %i5 = icmp eq i32 %.i3, 0 ; i1: loaded value == 0
153 bb9: ; preds = %bb, %.a
154 %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ] ; undef when %bb was bypassed
155 %.2.0.in = xor i1 %.2.0.in.in, true ; logical NOT of the i1 (the s_xor_b32 with -1 in the checks)
156 %.2.0 = zext i1 %.2.0.in to i32 ; i1 -> 0/1 (the v_cndmask_b32 0, 1 in the checks)
157 %i11 = sub i32 %.2, %.2.0 ; subtract the 0/1 value (v_sub_nc_u32 in the checks)
158 %i12 = icmp sgt i32 %.2, -1050 ; compares the pre-sub value %.2; -1050 is the 0xfffffbe6 literal in the checks
159 br i1 %i12, label %.a, label %.exit ; back-edge while %i12 holds, otherwise leave the loop
161 .exit: ; preds = %bb9
165 ; Test that unused lanes in the s_or result are masked out with v_cndmask.
; In the checks the s_or_b32 of the two masks and the v_cndmask_b32 0, 1 appear in %.exit, while the loop counter is advanced with s_add_i32 ..., 1.
167 define i32 @combine_add_zext_or() {
168 ; GFX1010-LABEL: combine_add_zext_or:
169 ; GFX1010: ; %bb.0: ; %.entry
170 ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171 ; GFX1010-NEXT: s_mov_b32 s4, 0
172 ; GFX1010-NEXT: s_branch .LBB2_2
173 ; GFX1010-NEXT: .LBB2_1: ; %bb9
174 ; GFX1010-NEXT: ; in Loop: Header=BB2_2 Depth=1
175 ; GFX1010-NEXT: s_cmpk_gt_i32 s4, 0xfbe6
176 ; GFX1010-NEXT: s_cselect_b32 s6, -1, 0
177 ; GFX1010-NEXT: s_add_i32 s4, s4, 1
178 ; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s6
179 ; GFX1010-NEXT: s_cbranch_vccz .LBB2_4
180 ; GFX1010-NEXT: .LBB2_2: ; %.a
181 ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1
182 ; GFX1010-NEXT: ; implicit-def: $sgpr5
183 ; GFX1010-NEXT: s_cbranch_scc1 .LBB2_1
184 ; GFX1010-NEXT: ; %bb.3: ; %bb
185 ; GFX1010-NEXT: ; in Loop: Header=BB2_2 Depth=1
186 ; GFX1010-NEXT: v_mov_b32_e32 v0, s4
187 ; GFX1010-NEXT: buffer_load_dword v0, v0, s[4:7], 64 offen glc
188 ; GFX1010-NEXT: s_waitcnt vmcnt(0)
189 ; GFX1010-NEXT: v_cmp_eq_u32_e64 s5, 0, v0
190 ; GFX1010-NEXT: s_branch .LBB2_1
191 ; GFX1010-NEXT: .LBB2_4: ; %.exit
192 ; GFX1010-NEXT: s_or_b32 s4, s5, s6
193 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
194 ; GFX1010-NEXT: s_setpc_b64 s[30:31]
196 ; GFX1100-LABEL: combine_add_zext_or:
197 ; GFX1100: ; %bb.0: ; %.entry
198 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
199 ; GFX1100-NEXT: s_mov_b32 s0, 0
200 ; GFX1100-NEXT: s_branch .LBB2_2
201 ; GFX1100-NEXT: .LBB2_1: ; %bb9
202 ; GFX1100-NEXT: ; in Loop: Header=BB2_2 Depth=1
203 ; GFX1100-NEXT: s_cmpk_gt_i32 s0, 0xfbe6
204 ; GFX1100-NEXT: s_cselect_b32 s2, -1, 0
205 ; GFX1100-NEXT: s_add_i32 s0, s0, 1
206 ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s2
207 ; GFX1100-NEXT: s_cbranch_vccz .LBB2_4
208 ; GFX1100-NEXT: .LBB2_2: ; %.a
209 ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
210 ; GFX1100-NEXT: ; implicit-def: $sgpr1
211 ; GFX1100-NEXT: s_cbranch_scc1 .LBB2_1
212 ; GFX1100-NEXT: ; %bb.3: ; %bb
213 ; GFX1100-NEXT: ; in Loop: Header=BB2_2 Depth=1
214 ; GFX1100-NEXT: v_mov_b32_e32 v0, s0
215 ; GFX1100-NEXT: buffer_load_b32 v0, v0, s[0:3], 64 offen glc
216 ; GFX1100-NEXT: s_waitcnt vmcnt(0)
217 ; GFX1100-NEXT: v_cmp_eq_u32_e64 s1, 0, v0
218 ; GFX1100-NEXT: s_branch .LBB2_1
219 ; GFX1100-NEXT: .LBB2_4: ; %.exit
220 ; GFX1100-NEXT: s_or_b32 s0, s1, s2
221 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
222 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
223 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
227 .a: ; preds = %bb9, %.entry
228 %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ] ; loop-carried accumulator, 0 on entry
229 br i1 undef, label %bb9, label %bb ; undef condition: %bb (the load) may or may not run
232 %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) ; shows up in the checks as a buffer load with %.2 copied into the offen VGPR, 64 as scalar offset, and glc
233 %i5 = icmp eq i32 %.i3, 0 ; i1: loaded value == 0
236 bb9: ; preds = %bb, %.a
237 %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ] ; undef when %bb was bypassed
238 %t = icmp sgt i32 %.2, -1050 ; same predicate and operands as %i12 below
239 %.2.0.in = or i1 %.2.0.in.in, %t ; i1 OR of the two conditions (the s_or_b32 in the checks)
240 %.2.0 = zext i1 %.2.0.in to i32 ; i1 -> 0/1 (the v_cndmask_b32 0, 1 in the checks)
241 %i11 = add i32 %.2, %.2.0 ; accumulate the 0/1 value
242 %i12 = icmp sgt i32 %.2, -1050 ; loop condition on the pre-add value %.2 (s_cmpk_gt_i32 ..., 0xfbe6 in the checks)
243 br i1 %i12, label %.a, label %.exit ; back-edge while %i12 holds, otherwise leave the loop
245 .exit: ; preds = %bb9
249 ; Test that unused lanes in the s_or result are masked out with v_cndmask.
; In the checks the s_or_b32 of the two masks and the v_cndmask_b32 0, 1 appear in %.exit, while the loop counter is advanced with s_add_i32 ..., -1.
251 define i32 @combine_sub_zext_or() {
252 ; GFX1010-LABEL: combine_sub_zext_or:
253 ; GFX1010: ; %bb.0: ; %.entry
254 ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
255 ; GFX1010-NEXT: s_mov_b32 s4, 0
256 ; GFX1010-NEXT: s_branch .LBB3_2
257 ; GFX1010-NEXT: .LBB3_1: ; %bb9
258 ; GFX1010-NEXT: ; in Loop: Header=BB3_2 Depth=1
259 ; GFX1010-NEXT: s_cmpk_gt_i32 s4, 0xfbe6
260 ; GFX1010-NEXT: s_cselect_b32 s6, -1, 0
261 ; GFX1010-NEXT: s_add_i32 s4, s4, -1
262 ; GFX1010-NEXT: s_and_b32 vcc_lo, exec_lo, s6
263 ; GFX1010-NEXT: s_cbranch_vccz .LBB3_4
264 ; GFX1010-NEXT: .LBB3_2: ; %.a
265 ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1
266 ; GFX1010-NEXT: ; implicit-def: $sgpr5
267 ; GFX1010-NEXT: s_cbranch_scc1 .LBB3_1
268 ; GFX1010-NEXT: ; %bb.3: ; %bb
269 ; GFX1010-NEXT: ; in Loop: Header=BB3_2 Depth=1
270 ; GFX1010-NEXT: v_mov_b32_e32 v0, s4
271 ; GFX1010-NEXT: buffer_load_dword v0, v0, s[4:7], 64 offen glc
272 ; GFX1010-NEXT: s_waitcnt vmcnt(0)
273 ; GFX1010-NEXT: v_cmp_eq_u32_e64 s5, 0, v0
274 ; GFX1010-NEXT: s_branch .LBB3_1
275 ; GFX1010-NEXT: .LBB3_4: ; %.exit
276 ; GFX1010-NEXT: s_or_b32 s4, s5, s6
277 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
278 ; GFX1010-NEXT: s_setpc_b64 s[30:31]
280 ; GFX1100-LABEL: combine_sub_zext_or:
281 ; GFX1100: ; %bb.0: ; %.entry
282 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283 ; GFX1100-NEXT: s_mov_b32 s0, 0
284 ; GFX1100-NEXT: s_branch .LBB3_2
285 ; GFX1100-NEXT: .LBB3_1: ; %bb9
286 ; GFX1100-NEXT: ; in Loop: Header=BB3_2 Depth=1
287 ; GFX1100-NEXT: s_cmpk_gt_i32 s0, 0xfbe6
288 ; GFX1100-NEXT: s_cselect_b32 s2, -1, 0
289 ; GFX1100-NEXT: s_add_i32 s0, s0, -1
290 ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s2
291 ; GFX1100-NEXT: s_cbranch_vccz .LBB3_4
292 ; GFX1100-NEXT: .LBB3_2: ; %.a
293 ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
294 ; GFX1100-NEXT: ; implicit-def: $sgpr1
295 ; GFX1100-NEXT: s_cbranch_scc1 .LBB3_1
296 ; GFX1100-NEXT: ; %bb.3: ; %bb
297 ; GFX1100-NEXT: ; in Loop: Header=BB3_2 Depth=1
298 ; GFX1100-NEXT: v_mov_b32_e32 v0, s0
299 ; GFX1100-NEXT: buffer_load_b32 v0, v0, s[0:3], 64 offen glc
300 ; GFX1100-NEXT: s_waitcnt vmcnt(0)
301 ; GFX1100-NEXT: v_cmp_eq_u32_e64 s1, 0, v0
302 ; GFX1100-NEXT: s_branch .LBB3_1
303 ; GFX1100-NEXT: .LBB3_4: ; %.exit
304 ; GFX1100-NEXT: s_or_b32 s0, s1, s2
305 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
306 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
307 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
311 .a: ; preds = %bb9, %.entry
312 %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ] ; loop-carried accumulator, 0 on entry
313 br i1 undef, label %bb9, label %bb ; undef condition: %bb (the load) may or may not run
316 %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) ; shows up in the checks as a buffer load with %.2 copied into the offen VGPR, 64 as scalar offset, and glc
317 %i5 = icmp eq i32 %.i3, 0 ; i1: loaded value == 0
320 bb9: ; preds = %bb, %.a
321 %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ] ; undef when %bb was bypassed
322 %t = icmp sgt i32 %.2, -1050 ; same predicate and operands as %i12 below
323 %.2.0.in = or i1 %.2.0.in.in, %t ; i1 OR of the two conditions (the s_or_b32 in the checks)
324 %.2.0 = zext i1 %.2.0.in to i32 ; i1 -> 0/1 (the v_cndmask_b32 0, 1 in the checks)
325 %i11 = sub i32 %.2, %.2.0 ; subtract the 0/1 value
326 %i12 = icmp sgt i32 %.2, -1050 ; loop condition on the pre-sub value %.2 (s_cmpk_gt_i32 ..., 0xfbe6 in the checks)
327 br i1 %i12, label %.a, label %.exit ; back-edge while %i12 holds, otherwise leave the loop
329 .exit: ; preds = %bb9
333 ; Test that unused lanes in the s_and result are masked out with v_cndmask.
; The add of zext(and i1, i1) below is expected to appear in the checks as s_and_b32 with vcc_lo, then v_cndmask_b32 0, 1, then v_add_nc_u32.
335 define i32 @combine_add_zext_and() {
336 ; GFX1010-LABEL: combine_add_zext_and:
337 ; GFX1010: ; %bb.0: ; %.entry
338 ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
339 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0
340 ; GFX1010-NEXT: s_branch .LBB4_2
341 ; GFX1010-NEXT: .LBB4_1: ; %bb9
342 ; GFX1010-NEXT: ; in Loop: Header=BB4_2 Depth=1
343 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
344 ; GFX1010-NEXT: s_and_b32 s4, s4, vcc_lo
345 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
346 ; GFX1010-NEXT: v_add_nc_u32_e32 v1, v1, v0
347 ; GFX1010-NEXT: s_cbranch_vccz .LBB4_4
348 ; GFX1010-NEXT: .LBB4_2: ; %.a
349 ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1
350 ; GFX1010-NEXT: ; implicit-def: $sgpr4
351 ; GFX1010-NEXT: s_cbranch_scc1 .LBB4_1
352 ; GFX1010-NEXT: ; %bb.3: ; %bb
353 ; GFX1010-NEXT: ; in Loop: Header=BB4_2 Depth=1
354 ; GFX1010-NEXT: buffer_load_dword v0, v1, s[4:7], 64 offen glc
355 ; GFX1010-NEXT: s_waitcnt vmcnt(0)
356 ; GFX1010-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
357 ; GFX1010-NEXT: s_branch .LBB4_1
358 ; GFX1010-NEXT: .LBB4_4: ; %.exit
359 ; GFX1010-NEXT: s_setpc_b64 s[30:31]
361 ; GFX1100-LABEL: combine_add_zext_and:
362 ; GFX1100: ; %bb.0: ; %.entry
363 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0
365 ; GFX1100-NEXT: s_branch .LBB4_2
366 ; GFX1100-NEXT: .LBB4_1: ; %bb9
367 ; GFX1100-NEXT: ; in Loop: Header=BB4_2 Depth=1
368 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
369 ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
370 ; GFX1100-NEXT: s_and_b32 s0, s0, vcc_lo
371 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
372 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
373 ; GFX1100-NEXT: v_add_nc_u32_e32 v1, v1, v0
374 ; GFX1100-NEXT: s_cbranch_vccz .LBB4_4
375 ; GFX1100-NEXT: .LBB4_2: ; %.a
376 ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
377 ; GFX1100-NEXT: ; implicit-def: $sgpr0
378 ; GFX1100-NEXT: s_cbranch_scc1 .LBB4_1
379 ; GFX1100-NEXT: ; %bb.3: ; %bb
380 ; GFX1100-NEXT: ; in Loop: Header=BB4_2 Depth=1
381 ; GFX1100-NEXT: buffer_load_b32 v0, v1, s[0:3], 64 offen glc
382 ; GFX1100-NEXT: s_waitcnt vmcnt(0)
383 ; GFX1100-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
384 ; GFX1100-NEXT: s_branch .LBB4_1
385 ; GFX1100-NEXT: .LBB4_4: ; %.exit
386 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
390 .a: ; preds = %bb9, %.entry
391 %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ] ; loop-carried accumulator, 0 on entry
392 br i1 undef, label %bb9, label %bb ; undef condition: %bb (the load) may or may not run
395 %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) ; shows up in the checks as a buffer load with %.2 as the offen VGPR, 64 as scalar offset, and glc
396 %i5 = icmp eq i32 %.i3, 0 ; i1: loaded value == 0
399 bb9: ; preds = %bb, %.a
400 %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ] ; undef when %bb was bypassed
401 %t = icmp sgt i32 %.2, -1050 ; same predicate and operands as %i12 below; -1050 is the 0xfffffbe6 literal in the checks
402 %.2.0.in = and i1 %.2.0.in.in, %t ; i1 AND of the two conditions (the s_and_b32 with vcc_lo in the checks)
403 %.2.0 = zext i1 %.2.0.in to i32 ; i1 -> 0/1 (the v_cndmask_b32 0, 1 in the checks)
404 %i11 = add i32 %.2, %.2.0 ; accumulate the 0/1 value (v_add_nc_u32 in the checks)
405 %i12 = icmp sgt i32 %.2, -1050 ; loop condition on the pre-add value %.2
406 br i1 %i12, label %.a, label %.exit ; back-edge while %i12 holds, otherwise leave the loop
408 .exit: ; preds = %bb9
412 ; Test that unused lanes in the s_and result are masked out with v_cndmask.
; The sub of zext(and i1, i1) below is expected to appear in the checks as s_and_b32 with vcc_lo, then v_cndmask_b32 0, 1, then v_sub_nc_u32.
414 define i32 @combine_sub_zext_and() {
415 ; GFX1010-LABEL: combine_sub_zext_and:
416 ; GFX1010: ; %bb.0: ; %.entry
417 ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
418 ; GFX1010-NEXT: v_mov_b32_e32 v1, 0
419 ; GFX1010-NEXT: s_branch .LBB5_2
420 ; GFX1010-NEXT: .LBB5_1: ; %bb9
421 ; GFX1010-NEXT: ; in Loop: Header=BB5_2 Depth=1
422 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
423 ; GFX1010-NEXT: s_and_b32 s4, s4, vcc_lo
424 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
425 ; GFX1010-NEXT: v_sub_nc_u32_e32 v1, v1, v0
426 ; GFX1010-NEXT: s_cbranch_vccz .LBB5_4
427 ; GFX1010-NEXT: .LBB5_2: ; %.a
428 ; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1
429 ; GFX1010-NEXT: ; implicit-def: $sgpr4
430 ; GFX1010-NEXT: s_cbranch_scc1 .LBB5_1
431 ; GFX1010-NEXT: ; %bb.3: ; %bb
432 ; GFX1010-NEXT: ; in Loop: Header=BB5_2 Depth=1
433 ; GFX1010-NEXT: buffer_load_dword v0, v1, s[4:7], 64 offen glc
434 ; GFX1010-NEXT: s_waitcnt vmcnt(0)
435 ; GFX1010-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
436 ; GFX1010-NEXT: s_branch .LBB5_1
437 ; GFX1010-NEXT: .LBB5_4: ; %.exit
438 ; GFX1010-NEXT: s_setpc_b64 s[30:31]
440 ; GFX1100-LABEL: combine_sub_zext_and:
441 ; GFX1100: ; %bb.0: ; %.entry
442 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
443 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0
444 ; GFX1100-NEXT: s_branch .LBB5_2
445 ; GFX1100-NEXT: .LBB5_1: ; %bb9
446 ; GFX1100-NEXT: ; in Loop: Header=BB5_2 Depth=1
447 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
448 ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
449 ; GFX1100-NEXT: s_and_b32 s0, s0, vcc_lo
450 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
451 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
452 ; GFX1100-NEXT: v_sub_nc_u32_e32 v1, v1, v0
453 ; GFX1100-NEXT: s_cbranch_vccz .LBB5_4
454 ; GFX1100-NEXT: .LBB5_2: ; %.a
455 ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
456 ; GFX1100-NEXT: ; implicit-def: $sgpr0
457 ; GFX1100-NEXT: s_cbranch_scc1 .LBB5_1
458 ; GFX1100-NEXT: ; %bb.3: ; %bb
459 ; GFX1100-NEXT: ; in Loop: Header=BB5_2 Depth=1
460 ; GFX1100-NEXT: buffer_load_b32 v0, v1, s[0:3], 64 offen glc
461 ; GFX1100-NEXT: s_waitcnt vmcnt(0)
462 ; GFX1100-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
463 ; GFX1100-NEXT: s_branch .LBB5_1
464 ; GFX1100-NEXT: .LBB5_4: ; %.exit
465 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
469 .a: ; preds = %bb9, %.entry
470 %.2 = phi i32 [ 0, %.entry ], [ %i11, %bb9 ] ; loop-carried accumulator, 0 on entry
471 br i1 undef, label %bb9, label %bb ; undef condition: %bb (the load) may or may not run
474 %.i3 = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) undef, i32 %.2, i32 64, i32 1) ; shows up in the checks as a buffer load with %.2 as the offen VGPR, 64 as scalar offset, and glc
475 %i5 = icmp eq i32 %.i3, 0 ; i1: loaded value == 0
478 bb9: ; preds = %bb, %.a
479 %.2.0.in.in = phi i1 [ %i5, %bb ], [ undef, %.a ] ; undef when %bb was bypassed
480 %t = icmp sgt i32 %.2, -1050 ; same predicate and operands as %i12 below; -1050 is the 0xfffffbe6 literal in the checks
481 %.2.0.in = and i1 %.2.0.in.in, %t ; i1 AND of the two conditions (the s_and_b32 with vcc_lo in the checks)
482 %.2.0 = zext i1 %.2.0.in to i32 ; i1 -> 0/1 (the v_cndmask_b32 0, 1 in the checks)
483 %i11 = sub i32 %.2, %.2.0 ; subtract the 0/1 value (v_sub_nc_u32 in the checks)
484 %i12 = icmp sgt i32 %.2, -1050 ; loop condition on the pre-sub value %.2
485 br i1 %i12, label %.a, label %.exit ; back-edge while %i12 holds, otherwise leave the loop
487 .exit: ; preds = %bb9
492 ; Function Attrs: nounwind willreturn memory(argmem: read)
493 declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) nocapture, i32, i32, i32 immarg) #0
495 attributes #0 = { nounwind willreturn memory(argmem: read) }