1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
5 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
; Declarations of the intrinsics exercised by the tests in this file: the
; llvm.ctlz family at several scalar and vector widths, plus the AMDGPU
; workitem-id query used to form per-lane addresses. Per the LLVM LangRef,
; the trailing i1 operand of llvm.ctlz selects whether a zero input yields
; poison; the tests visible here always pass true.
7 declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
9 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
10 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
11 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
13 declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
14 declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
15 declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
17 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; i32 operand passed directly as a kernel argument: llvm.ctlz.i32 is called
; with the i1 flag set to true and the result is stored to %out.
; The checks expect a single scalar s_flbit_i32_b32 on the amdgcn runs and a
; single FFBH_UINT on the r600 run, with no extra handling around it.
19 define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
20 ; SI-LABEL: s_ctlz_zero_undef_i32:
22 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
23 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
24 ; SI-NEXT: s_mov_b32 s3, 0xf000
25 ; SI-NEXT: s_waitcnt lgkmcnt(0)
26 ; SI-NEXT: s_flbit_i32_b32 s4, s2
27 ; SI-NEXT: s_mov_b32 s2, -1
28 ; SI-NEXT: v_mov_b32_e32 v0, s4
29 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
32 ; VI-LABEL: s_ctlz_zero_undef_i32:
34 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
35 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
36 ; VI-NEXT: s_waitcnt lgkmcnt(0)
37 ; VI-NEXT: s_flbit_i32_b32 s2, s2
38 ; VI-NEXT: v_mov_b32_e32 v0, s0
39 ; VI-NEXT: v_mov_b32_e32 v1, s1
40 ; VI-NEXT: v_mov_b32_e32 v2, s2
41 ; VI-NEXT: flat_store_dword v[0:1], v2
44 ; EG-LABEL: s_ctlz_zero_undef_i32:
46 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
47 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
50 ; EG-NEXT: ALU clause starting at 4:
51 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
52 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
53 ; EG-NEXT: FFBH_UINT * T1.X, KC0[2].Z,
55 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32:
56 ; GFX9-GISEL: ; %bb.0:
57 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
58 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
59 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
60 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
61 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4
62 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
63 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
64 ; GFX9-GISEL-NEXT: s_endpgm
65 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
66 store i32 %ctlz, ptr addrspace(1) %out, align 4
; i32 operand loaded from %valptr at an index given by the workitem id, so the
; value differs per lane. llvm.ctlz.i32 is called with the i1 flag set to true
; and the result is stored to %out (not indexed by the workitem id).
; The checks expect one v_ffbh_u32 on the amdgcn runs and one FFBH_UINT on the
; r600 run.
70 define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
71 ; SI-LABEL: v_ctlz_zero_undef_i32:
73 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
74 ; SI-NEXT: s_mov_b32 s7, 0xf000
75 ; SI-NEXT: s_mov_b32 s10, 0
76 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
77 ; SI-NEXT: v_mov_b32_e32 v1, 0
78 ; SI-NEXT: s_mov_b32 s11, s7
79 ; SI-NEXT: s_waitcnt lgkmcnt(0)
80 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
81 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
82 ; SI-NEXT: s_mov_b32 s6, -1
83 ; SI-NEXT: s_mov_b32 s4, s0
84 ; SI-NEXT: s_mov_b32 s5, s1
85 ; SI-NEXT: s_waitcnt vmcnt(0)
86 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
87 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
90 ; VI-LABEL: v_ctlz_zero_undef_i32:
92 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
93 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
94 ; VI-NEXT: s_waitcnt lgkmcnt(0)
95 ; VI-NEXT: v_mov_b32_e32 v1, s3
96 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
97 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
98 ; VI-NEXT: flat_load_dword v0, v[0:1]
99 ; VI-NEXT: s_waitcnt vmcnt(0)
100 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
101 ; VI-NEXT: v_mov_b32_e32 v0, s0
102 ; VI-NEXT: v_mov_b32_e32 v1, s1
103 ; VI-NEXT: flat_store_dword v[0:1], v2
106 ; EG-LABEL: v_ctlz_zero_undef_i32:
108 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
110 ; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
111 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
114 ; EG-NEXT: Fetch clause starting at 6:
115 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
116 ; EG-NEXT: ALU clause starting at 8:
117 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
118 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
119 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
120 ; EG-NEXT: ALU clause starting at 11:
121 ; EG-NEXT: FFBH_UINT T0.X, T0.X,
122 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
123 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
125 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
126 ; GFX9-GISEL: ; %bb.0:
127 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
128 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
129 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
130 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
131 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
132 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
133 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
134 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
135 ; GFX9-GISEL-NEXT: s_endpgm
136 %tid = call i32 @llvm.amdgcn.workitem.id.x()
137 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
138 %val = load i32, ptr addrspace(1) %in.gep, align 4
139 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
140 store i32 %ctlz, ptr addrspace(1) %out, align 4
; <2 x i32> operand loaded from %valptr at an index given by the workitem id.
; llvm.ctlz.v2i32 is called with the i1 flag set to true and the vector result
; is stored to %out.
; The checks expect the vector to be handled element by element: two
; v_ffbh_u32 on the amdgcn runs and two FFBH_UINT on the r600 run, framed by
; one 64-bit load and one 64-bit store.
144 define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
145 ; SI-LABEL: v_ctlz_zero_undef_v2i32:
147 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
148 ; SI-NEXT: s_mov_b32 s7, 0xf000
149 ; SI-NEXT: s_mov_b32 s10, 0
150 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
151 ; SI-NEXT: v_mov_b32_e32 v1, 0
152 ; SI-NEXT: s_mov_b32 s11, s7
153 ; SI-NEXT: s_waitcnt lgkmcnt(0)
154 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
155 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
156 ; SI-NEXT: s_mov_b32 s6, -1
157 ; SI-NEXT: s_mov_b32 s4, s0
158 ; SI-NEXT: s_mov_b32 s5, s1
159 ; SI-NEXT: s_waitcnt vmcnt(0)
160 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
161 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
162 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
165 ; VI-LABEL: v_ctlz_zero_undef_v2i32:
167 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
168 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
169 ; VI-NEXT: s_waitcnt lgkmcnt(0)
170 ; VI-NEXT: v_mov_b32_e32 v1, s3
171 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
172 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
173 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
174 ; VI-NEXT: v_mov_b32_e32 v3, s1
175 ; VI-NEXT: v_mov_b32_e32 v2, s0
176 ; VI-NEXT: s_waitcnt vmcnt(0)
177 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
178 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
179 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
182 ; EG-LABEL: v_ctlz_zero_undef_v2i32:
184 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
186 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
187 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
190 ; EG-NEXT: Fetch clause starting at 6:
191 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
192 ; EG-NEXT: ALU clause starting at 8:
193 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
194 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
195 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
196 ; EG-NEXT: ALU clause starting at 11:
197 ; EG-NEXT: FFBH_UINT * T0.Y, T0.Y,
198 ; EG-NEXT: FFBH_UINT T0.X, T0.X,
199 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
200 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
202 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
203 ; GFX9-GISEL: ; %bb.0:
204 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
205 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
206 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
207 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
209 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
210 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
211 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
212 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
213 ; GFX9-GISEL-NEXT: s_endpgm
214 %tid = call i32 @llvm.amdgcn.workitem.id.x()
215 %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
216 %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
217 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
218 store <2 x i32> %ctlz, ptr addrspace(1) %out, align 8
; <4 x i32> operand loaded from %valptr at an index given by the workitem id.
; llvm.ctlz.v4i32 is called with the i1 flag set to true and the vector result
; is stored to %out.
; The checks expect four per-element operations (v_ffbh_u32 on the amdgcn
; runs, FFBH_UINT on the r600 run) framed by one 128-bit load and one 128-bit
; store.
222 define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
223 ; SI-LABEL: v_ctlz_zero_undef_v4i32:
225 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
226 ; SI-NEXT: s_mov_b32 s7, 0xf000
227 ; SI-NEXT: s_mov_b32 s10, 0
228 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
229 ; SI-NEXT: v_mov_b32_e32 v1, 0
230 ; SI-NEXT: s_mov_b32 s11, s7
231 ; SI-NEXT: s_waitcnt lgkmcnt(0)
232 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
233 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
234 ; SI-NEXT: s_mov_b32 s6, -1
235 ; SI-NEXT: s_mov_b32 s4, s0
236 ; SI-NEXT: s_mov_b32 s5, s1
237 ; SI-NEXT: s_waitcnt vmcnt(0)
238 ; SI-NEXT: v_ffbh_u32_e32 v3, v3
239 ; SI-NEXT: v_ffbh_u32_e32 v2, v2
240 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
241 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
242 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
245 ; VI-LABEL: v_ctlz_zero_undef_v4i32:
247 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
248 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
249 ; VI-NEXT: s_waitcnt lgkmcnt(0)
250 ; VI-NEXT: v_mov_b32_e32 v1, s3
251 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
252 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
253 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
254 ; VI-NEXT: v_mov_b32_e32 v5, s1
255 ; VI-NEXT: v_mov_b32_e32 v4, s0
256 ; VI-NEXT: s_waitcnt vmcnt(0)
257 ; VI-NEXT: v_ffbh_u32_e32 v3, v3
258 ; VI-NEXT: v_ffbh_u32_e32 v2, v2
259 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
260 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
261 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
264 ; EG-LABEL: v_ctlz_zero_undef_v4i32:
266 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
268 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
269 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
272 ; EG-NEXT: Fetch clause starting at 6:
273 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
274 ; EG-NEXT: ALU clause starting at 8:
275 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
276 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
277 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
278 ; EG-NEXT: ALU clause starting at 11:
279 ; EG-NEXT: FFBH_UINT * T0.W, T0.W,
280 ; EG-NEXT: FFBH_UINT * T0.Z, T0.Z,
281 ; EG-NEXT: FFBH_UINT * T0.Y, T0.Y,
282 ; EG-NEXT: FFBH_UINT T0.X, T0.X,
283 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
284 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
286 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
287 ; GFX9-GISEL: ; %bb.0:
288 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
289 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
290 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0
291 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
292 ; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
293 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
294 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
295 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
296 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
297 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3
298 ; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
299 ; GFX9-GISEL-NEXT: s_endpgm
300 %tid = call i32 @llvm.amdgcn.workitem.id.x()
301 %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
302 %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
303 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
304 store <4 x i32> %ctlz, ptr addrspace(1) %out, align 16
; i8 operand loaded from %valptr at a byte index given by the workitem id.
; llvm.ctlz.i8 is called with the i1 flag set to true and the i8 result is
; stored to %out.
; The checks expect an unsigned byte load, a 32-bit leading-bit search, and a
; correction of 24 applied to the count: one subtract of 24 on the SI and
; GFX9 runs, an add of -16 followed by a 16-bit add of -8 on the VI run, and
; an ADD_INT of -24 on the r600 run. The r600 run then performs a masked
; byte store (MEM_RAT MSKOR).
308 define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
309 ; SI-LABEL: v_ctlz_zero_undef_i8:
311 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
312 ; SI-NEXT: s_mov_b32 s7, 0xf000
313 ; SI-NEXT: v_mov_b32_e32 v1, 0
314 ; SI-NEXT: s_mov_b32 s10, 0
315 ; SI-NEXT: s_mov_b32 s11, s7
316 ; SI-NEXT: s_waitcnt lgkmcnt(0)
317 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
318 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
319 ; SI-NEXT: s_mov_b32 s6, -1
320 ; SI-NEXT: s_mov_b32 s4, s0
321 ; SI-NEXT: s_mov_b32 s5, s1
322 ; SI-NEXT: s_waitcnt vmcnt(0)
323 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
324 ; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
325 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
328 ; VI-LABEL: v_ctlz_zero_undef_i8:
330 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
331 ; VI-NEXT: s_waitcnt lgkmcnt(0)
332 ; VI-NEXT: v_mov_b32_e32 v1, s3
333 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
334 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
335 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
336 ; VI-NEXT: s_waitcnt vmcnt(0)
337 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
338 ; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
339 ; VI-NEXT: v_add_u16_e32 v2, -8, v0
340 ; VI-NEXT: v_mov_b32_e32 v0, s0
341 ; VI-NEXT: v_mov_b32_e32 v1, s1
342 ; VI-NEXT: flat_store_byte v[0:1], v2
345 ; EG-LABEL: v_ctlz_zero_undef_i8:
347 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
349 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
350 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
353 ; EG-NEXT: Fetch clause starting at 6:
354 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
355 ; EG-NEXT: ALU clause starting at 8:
356 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
357 ; EG-NEXT: ALU clause starting at 9:
358 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
359 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
360 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
361 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
362 ; EG-NEXT: -24(nan), 0(0.000000e+00)
363 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
364 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
365 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
366 ; EG-NEXT: LSHL T0.X, PV.W, PS,
367 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
368 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
369 ; EG-NEXT: MOV T0.Y, 0.0,
370 ; EG-NEXT: MOV * T0.Z, 0.0,
371 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
372 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
374 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8:
375 ; GFX9-GISEL: ; %bb.0:
376 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
377 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
378 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
379 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
380 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3
381 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
382 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
383 ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
384 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
385 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
386 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
387 ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0
388 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
389 ; GFX9-GISEL-NEXT: s_endpgm
390 %tid = call i32 @llvm.amdgcn.workitem.id.x()
391 %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
392 %val = load i8, ptr addrspace(1) %in.gep
393 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
394 store i8 %ctlz, ptr addrspace(1) %out
; i64 operand passed as a kernel argument, placed after an unnamed [8 x i32]
; argument. llvm.ctlz.i64 is called with the i1 flag set to true and the i64
; result is stored to %out.
; The SI and VI checks expect the count to be assembled from two 32-bit
; s_flbit_i32_b32 results: 32 is added to one of them and s_min_u32 picks the
; final value, with a zero high dword in the store. The r600 checks expect two
; FFBH_UINT plus a CNDE_INT selection. The GFX9 global-isel checks expect a
; single s_flbit_i32_b64.
398 define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
399 ; SI-LABEL: s_ctlz_zero_undef_i64:
401 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
402 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
403 ; SI-NEXT: s_mov_b32 s3, 0xf000
404 ; SI-NEXT: s_mov_b32 s2, -1
405 ; SI-NEXT: s_waitcnt lgkmcnt(0)
406 ; SI-NEXT: s_flbit_i32_b32 s4, s4
407 ; SI-NEXT: s_flbit_i32_b32 s5, s5
408 ; SI-NEXT: s_add_i32 s4, s4, 32
409 ; SI-NEXT: s_min_u32 s4, s4, s5
410 ; SI-NEXT: v_mov_b32_e32 v1, 0
411 ; SI-NEXT: v_mov_b32_e32 v0, s4
412 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
415 ; VI-LABEL: s_ctlz_zero_undef_i64:
417 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
418 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
419 ; VI-NEXT: v_mov_b32_e32 v1, 0
420 ; VI-NEXT: s_waitcnt lgkmcnt(0)
421 ; VI-NEXT: s_flbit_i32_b32 s2, s2
422 ; VI-NEXT: s_flbit_i32_b32 s3, s3
423 ; VI-NEXT: s_add_i32 s2, s2, 32
424 ; VI-NEXT: s_min_u32 s2, s2, s3
425 ; VI-NEXT: v_mov_b32_e32 v3, s1
426 ; VI-NEXT: v_mov_b32_e32 v0, s2
427 ; VI-NEXT: v_mov_b32_e32 v2, s0
428 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
431 ; EG-LABEL: s_ctlz_zero_undef_i64:
433 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
434 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
437 ; EG-NEXT: ALU clause starting at 4:
438 ; EG-NEXT: FFBH_UINT * T0.W, KC0[4].W,
439 ; EG-NEXT: FFBH_UINT T1.W, KC0[5].X,
440 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
441 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
442 ; EG-NEXT: CNDE_INT T0.X, KC0[5].X, PS, PV.W,
443 ; EG-NEXT: MOV T0.Y, 0.0,
444 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
445 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
447 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64:
448 ; GFX9-GISEL: ; %bb.0:
449 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
450 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
451 ; GFX9-GISEL-NEXT: s_mov_b32 s1, 0
452 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
453 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
454 ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3]
455 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
456 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
457 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
458 ; GFX9-GISEL-NEXT: s_endpgm
459 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
460 store i64 %ctlz, ptr addrspace(1) %out
; i64 operand passed as a kernel argument. llvm.ctlz.i64 is called with the
; i1 flag set to true, the i64 result is truncated to i32, and only that i32
; is stored to %out.
; The checks expect the same count sequence as the untruncated i64 scalar
; case (two 32-bit searches combined with add 32 and min on the SI and VI
; runs, CNDE_INT on the r600 run, s_flbit_i32_b64 on the GFX9 global-isel
; run) but with a single dword store.
464 define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind {
465 ; SI-LABEL: s_ctlz_zero_undef_i64_trunc:
467 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
468 ; SI-NEXT: s_mov_b32 s7, 0xf000
469 ; SI-NEXT: s_waitcnt lgkmcnt(0)
470 ; SI-NEXT: s_flbit_i32_b32 s2, s2
471 ; SI-NEXT: s_flbit_i32_b32 s3, s3
472 ; SI-NEXT: s_add_i32 s2, s2, 32
473 ; SI-NEXT: s_min_u32 s2, s2, s3
474 ; SI-NEXT: s_mov_b32 s6, -1
475 ; SI-NEXT: s_mov_b32 s4, s0
476 ; SI-NEXT: s_mov_b32 s5, s1
477 ; SI-NEXT: v_mov_b32_e32 v0, s2
478 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
481 ; VI-LABEL: s_ctlz_zero_undef_i64_trunc:
483 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
484 ; VI-NEXT: s_waitcnt lgkmcnt(0)
485 ; VI-NEXT: s_flbit_i32_b32 s2, s2
486 ; VI-NEXT: s_flbit_i32_b32 s3, s3
487 ; VI-NEXT: s_add_i32 s2, s2, 32
488 ; VI-NEXT: s_min_u32 s2, s2, s3
489 ; VI-NEXT: v_mov_b32_e32 v0, s0
490 ; VI-NEXT: v_mov_b32_e32 v1, s1
491 ; VI-NEXT: v_mov_b32_e32 v2, s2
492 ; VI-NEXT: flat_store_dword v[0:1], v2
495 ; EG-LABEL: s_ctlz_zero_undef_i64_trunc:
497 ; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
498 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
501 ; EG-NEXT: ALU clause starting at 4:
502 ; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W,
503 ; EG-NEXT: FFBH_UINT T1.W, KC0[3].X,
504 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
505 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
506 ; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W,
507 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
508 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
510 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
511 ; GFX9-GISEL: ; %bb.0:
512 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
513 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
514 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
515 ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3]
516 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
517 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
518 ; GFX9-GISEL-NEXT: s_endpgm
519 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
520 %trunc = trunc i64 %ctlz to i32
521 store i32 %trunc, ptr addrspace(1) %out
; i64 operand loaded from %in and result stored to %out, with both pointers
; indexed by the workitem id. llvm.ctlz.i64 is called with the i1 flag set to
; true.
; The amdgcn checks expect two v_ffbh_u32 (one per 32-bit half), an add of 32
; to one count, and v_min_u32 to pick the result, stored together with a zero
; high dword. The r600 checks expect two FFBH_UINT, an add of 32, and a
; CNDE_INT selection.
525 define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
526 ; SI-LABEL: v_ctlz_zero_undef_i64:
528 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
529 ; SI-NEXT: s_mov_b32 s7, 0xf000
530 ; SI-NEXT: s_mov_b32 s6, 0
531 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
532 ; SI-NEXT: v_mov_b32_e32 v1, 0
533 ; SI-NEXT: s_waitcnt lgkmcnt(0)
534 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
535 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
536 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
537 ; SI-NEXT: s_waitcnt vmcnt(0)
538 ; SI-NEXT: v_ffbh_u32_e32 v2, v2
539 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
540 ; SI-NEXT: v_ffbh_u32_e32 v3, v3
541 ; SI-NEXT: v_min_u32_e32 v2, v2, v3
542 ; SI-NEXT: v_mov_b32_e32 v3, v1
543 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
546 ; VI-LABEL: v_ctlz_zero_undef_i64:
548 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
549 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
550 ; VI-NEXT: v_mov_b32_e32 v2, 0
551 ; VI-NEXT: s_waitcnt lgkmcnt(0)
552 ; VI-NEXT: v_mov_b32_e32 v1, s3
553 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3
554 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
555 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
556 ; VI-NEXT: v_mov_b32_e32 v4, s1
557 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3
558 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
559 ; VI-NEXT: s_waitcnt vmcnt(0)
560 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
561 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
562 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
563 ; VI-NEXT: v_min_u32_e32 v1, v0, v1
564 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
567 ; EG-LABEL: v_ctlz_zero_undef_i64:
569 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
571 ; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
572 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
575 ; EG-NEXT: Fetch clause starting at 6:
576 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
577 ; EG-NEXT: ALU clause starting at 8:
578 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
579 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
580 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
581 ; EG-NEXT: ALU clause starting at 11:
582 ; EG-NEXT: FFBH_UINT * T1.W, T0.X,
583 ; EG-NEXT: FFBH_UINT T2.W, T0.Y,
584 ; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
585 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
586 ; EG-NEXT: CNDE_INT T0.X, T0.Y, PS, PV.W,
587 ; EG-NEXT: MOV T0.Y, 0.0,
588 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
589 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
590 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
592 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
593 ; GFX9-GISEL: ; %bb.0:
594 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
595 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
596 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
597 ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
598 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
599 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
600 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
601 ; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0
602 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0
603 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
604 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
605 ; GFX9-GISEL-NEXT: s_endpgm
606 %tid = call i32 @llvm.amdgcn.workitem.id.x()
607 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
608 %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
609 %val = load i64, ptr addrspace(1) %in.gep
610 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
611 store i64 %ctlz, ptr addrspace(1) %out.gep
; i64 operand loaded from %in indexed by the workitem id. llvm.ctlz.i64 is
; called with the i1 flag set to true, the result is truncated to i32, and the
; i32 is stored to %out indexed by the same workitem id (as an i32 element, so
; the load and store offsets are scaled differently: shifts by 3 and by 2 in
; the checks).
; The count sequence in the checks matches the untruncated per-lane i64 case;
; only a single dword is stored.
615 define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
616 ; SI-LABEL: v_ctlz_zero_undef_i64_trunc:
618 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
619 ; SI-NEXT: s_mov_b32 s7, 0xf000
620 ; SI-NEXT: s_mov_b32 s6, 0
621 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
622 ; SI-NEXT: v_mov_b32_e32 v2, 0
623 ; SI-NEXT: s_waitcnt lgkmcnt(0)
624 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
625 ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
626 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
627 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
628 ; SI-NEXT: s_waitcnt vmcnt(0)
629 ; SI-NEXT: v_ffbh_u32_e32 v0, v3
630 ; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
631 ; SI-NEXT: v_ffbh_u32_e32 v3, v4
632 ; SI-NEXT: v_min_u32_e32 v0, v0, v3
633 ; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
636 ; VI-LABEL: v_ctlz_zero_undef_i64_trunc:
638 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
639 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
640 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
641 ; VI-NEXT: s_waitcnt lgkmcnt(0)
642 ; VI-NEXT: v_mov_b32_e32 v2, s3
643 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
644 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
645 ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
646 ; VI-NEXT: v_mov_b32_e32 v4, s1
647 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
648 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
649 ; VI-NEXT: s_waitcnt vmcnt(0)
650 ; VI-NEXT: v_ffbh_u32_e32 v0, v1
651 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
652 ; VI-NEXT: v_ffbh_u32_e32 v1, v2
653 ; VI-NEXT: v_min_u32_e32 v0, v0, v1
654 ; VI-NEXT: flat_store_dword v[3:4], v0
657 ; EG-LABEL: v_ctlz_zero_undef_i64_trunc:
659 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
661 ; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
662 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
665 ; EG-NEXT: Fetch clause starting at 6:
666 ; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
667 ; EG-NEXT: ALU clause starting at 8:
668 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
669 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
670 ; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W,
671 ; EG-NEXT: ALU clause starting at 11:
672 ; EG-NEXT: FFBH_UINT * T0.W, T1.X,
673 ; EG-NEXT: LSHL T0.Z, T0.X, literal.x,
674 ; EG-NEXT: FFBH_UINT T1.W, T1.Y,
675 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.y,
676 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
677 ; EG-NEXT: CNDE_INT T0.X, T1.Y, PS, PV.W,
678 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z,
679 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
680 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
682 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
683 ; GFX9-GISEL: ; %bb.0:
684 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
685 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
686 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
687 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
688 ; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3]
689 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
690 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
691 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
692 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 32, v1
693 ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v2, v1
694 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
695 ; GFX9-GISEL-NEXT: s_endpgm
696 %tid = call i32 @llvm.amdgcn.workitem.id.x()
697 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
698 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
699 %val = load i64, ptr addrspace(1) %in.gep
700 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
701 %trunc = trunc i64 %ctlz to i32
702 store i32 %trunc, ptr addrspace(1) %out.gep
; i32 operand loaded per workitem; the stored value is
;   select (val == 0), -1, ctlz(val)
; where ctlz is called with the i1 flag set to true.
; The SI and VI checks expect the compare and select to disappear entirely,
; leaving only v_ffbh_u32. The r600 checks keep a CNDE_INT with a -1 literal
; after FFBH_UINT, and the GFX9 global-isel checks keep an explicit
; v_cmp_eq_u32 plus v_cndmask_b32.
706 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
707 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
709 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
710 ; SI-NEXT: s_mov_b32 s7, 0xf000
711 ; SI-NEXT: s_mov_b32 s10, 0
712 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
713 ; SI-NEXT: v_mov_b32_e32 v1, 0
714 ; SI-NEXT: s_mov_b32 s11, s7
715 ; SI-NEXT: s_waitcnt lgkmcnt(0)
716 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
717 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
718 ; SI-NEXT: s_mov_b32 s6, -1
719 ; SI-NEXT: s_mov_b32 s4, s0
720 ; SI-NEXT: s_mov_b32 s5, s1
721 ; SI-NEXT: s_waitcnt vmcnt(0)
722 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
723 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
726 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
728 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
729 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
730 ; VI-NEXT: s_waitcnt lgkmcnt(0)
731 ; VI-NEXT: v_mov_b32_e32 v1, s3
732 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
733 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
734 ; VI-NEXT: flat_load_dword v0, v[0:1]
735 ; VI-NEXT: s_waitcnt vmcnt(0)
736 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
737 ; VI-NEXT: v_mov_b32_e32 v0, s0
738 ; VI-NEXT: v_mov_b32_e32 v1, s1
739 ; VI-NEXT: flat_store_dword v[0:1], v2
742 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
744 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
746 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
747 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
750 ; EG-NEXT: Fetch clause starting at 6:
751 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
752 ; EG-NEXT: ALU clause starting at 8:
753 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
754 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
755 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
756 ; EG-NEXT: ALU clause starting at 11:
757 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
758 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
759 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
760 ; EG-NEXT: -1(nan), 2(2.802597e-45)
762 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
763 ; GFX9-GISEL: ; %bb.0:
764 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
765 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
766 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
767 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
768 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
769 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
770 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
771 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
772 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
773 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
774 ; GFX9-GISEL-NEXT: s_endpgm
775 %tid = call i32 @llvm.amdgcn.workitem.id.x()
776 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
777 %val = load i32, ptr addrspace(1) %in.gep
778 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
779 %cmp = icmp eq i32 %val, 0
780 %sel = select i1 %cmp, i32 -1, i32 %ctlz
781 store i32 %sel, ptr addrspace(1) %out
; Same value as the sel_eq_neg1 variant, written with the inverted predicate:
;   select (val != 0), ctlz(val), -1
; where ctlz is called with the i1 flag set to true.
; The SI and VI checks again expect only v_ffbh_u32 with no compare or select.
; The r600 checks keep a CNDE_INT with a -1 literal, and the GFX9 global-isel
; checks keep v_cmp_ne_u32 plus v_cndmask_b32.
785 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
786 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
788 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
789 ; SI-NEXT: s_mov_b32 s7, 0xf000
790 ; SI-NEXT: s_mov_b32 s10, 0
791 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
792 ; SI-NEXT: v_mov_b32_e32 v1, 0
793 ; SI-NEXT: s_mov_b32 s11, s7
794 ; SI-NEXT: s_waitcnt lgkmcnt(0)
795 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
796 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
797 ; SI-NEXT: s_mov_b32 s6, -1
798 ; SI-NEXT: s_mov_b32 s4, s0
799 ; SI-NEXT: s_mov_b32 s5, s1
800 ; SI-NEXT: s_waitcnt vmcnt(0)
801 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
802 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
805 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
807 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
808 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
809 ; VI-NEXT: s_waitcnt lgkmcnt(0)
810 ; VI-NEXT: v_mov_b32_e32 v1, s3
811 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
812 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
813 ; VI-NEXT: flat_load_dword v0, v[0:1]
814 ; VI-NEXT: s_waitcnt vmcnt(0)
815 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
816 ; VI-NEXT: v_mov_b32_e32 v0, s0
817 ; VI-NEXT: v_mov_b32_e32 v1, s1
818 ; VI-NEXT: flat_store_dword v[0:1], v2
821 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
823 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
825 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
826 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
829 ; EG-NEXT: Fetch clause starting at 6:
830 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
831 ; EG-NEXT: ALU clause starting at 8:
832 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
833 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
834 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
835 ; EG-NEXT: ALU clause starting at 11:
836 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
837 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
838 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
839 ; EG-NEXT: -1(nan), 2(2.802597e-45)
841 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
842 ; GFX9-GISEL: ; %bb.0:
843 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
844 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
845 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
846 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
847 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
848 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
849 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
850 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
851 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
852 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
853 ; GFX9-GISEL-NEXT: s_endpgm
854 %tid = call i32 @llvm.amdgcn.workitem.id.x()
855 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
856 %val = load i32, ptr addrspace(1) %in.gep
857 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
858 %cmp = icmp ne i32 %val, 0
859 %sel = select i1 %cmp, i32 %ctlz, i32 -1
860 store i32 %sel, ptr addrspace(1) %out
; Loads one i8 per work-item from %valptr, takes ctlz with the
; zero-is-undefined flag set (i1 true), and stores -1 when the input is zero,
; otherwise the count.
; The SI and VI expectations contain only v_ffbh_u32 between the load and the
; store (no compare or select), while the GFX9-GISEL expectations keep
; v_cmp_eq_u32 + v_cndmask_b32 and subtract 24 from the v_ffbh_u32 result.
; NOTE(review): the SI and VI expectations show no equivalent 24-bit adjustment
; after v_ffbh_u32 on the byte loaded with *_load_ubyte - confirm this is the
; intended output before regenerating.
864 define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
865 ; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
867 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
868 ; SI-NEXT: s_mov_b32 s7, 0xf000
869 ; SI-NEXT: v_mov_b32_e32 v1, 0
870 ; SI-NEXT: s_mov_b32 s10, 0
871 ; SI-NEXT: s_mov_b32 s11, s7
872 ; SI-NEXT: s_waitcnt lgkmcnt(0)
873 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
874 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
875 ; SI-NEXT: s_mov_b32 s6, -1
876 ; SI-NEXT: s_mov_b32 s4, s0
877 ; SI-NEXT: s_mov_b32 s5, s1
878 ; SI-NEXT: s_waitcnt vmcnt(0)
879 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
880 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
883 ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
885 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
886 ; VI-NEXT: s_waitcnt lgkmcnt(0)
887 ; VI-NEXT: v_mov_b32_e32 v1, s3
888 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
889 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
890 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
891 ; VI-NEXT: s_waitcnt vmcnt(0)
892 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
893 ; VI-NEXT: v_mov_b32_e32 v0, s0
894 ; VI-NEXT: v_mov_b32_e32 v1, s1
895 ; VI-NEXT: flat_store_byte v[0:1], v2
898 ; EG-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
900 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
902 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
903 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
906 ; EG-NEXT: Fetch clause starting at 6:
907 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
908 ; EG-NEXT: ALU clause starting at 8:
909 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
910 ; EG-NEXT: ALU clause starting at 9:
911 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
912 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
913 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
914 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
915 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
916 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
917 ; EG-NEXT: LSHL T0.X, PV.W, PS,
918 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
919 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
920 ; EG-NEXT: MOV T0.Y, 0.0,
921 ; EG-NEXT: MOV * T0.Z, 0.0,
922 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
923 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
925 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
926 ; GFX9-GISEL: ; %bb.0:
927 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
928 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
929 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
930 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
931 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3
932 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
933 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
934 ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
935 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
936 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
937 ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1
938 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
939 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
940 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
941 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
942 ; GFX9-GISEL-NEXT: s_endpgm
; Per-work-item byte address: %valptr + workitem id.
943 %tid = call i32 @llvm.amdgcn.workitem.id.x()
944 %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
945 %val = load i8, ptr addrspace(1) %valptr.gep
946 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
; Zero input selects -1; any other input selects the ctlz result.
947 %cmp = icmp eq i8 %val, 0
948 %sel = select i1 %cmp, i8 -1, i8 %ctlz
949 store i8 %sel, ptr addrspace(1) %out
; i32 ctlz (zero-is-undefined flag set) followed by a select of -1 on a zero
; input, where the compare result %cmp has a second use: it is also stored.
; Both stores are volatile. The SI and VI expectations keep v_cmp_eq_u32 and
; materialize the i1 with v_cndmask_b32 (0 / 1) for the byte store, yet store
; the v_ffbh_u32 result directly as the selected dword. The GFX9-GISEL
; expectations additionally keep a v_cndmask_b32 that selects -1.
953 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
954 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
956 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
957 ; SI-NEXT: s_mov_b32 s7, 0xf000
958 ; SI-NEXT: s_mov_b32 s10, 0
959 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
960 ; SI-NEXT: v_mov_b32_e32 v1, 0
961 ; SI-NEXT: s_mov_b32 s11, s7
962 ; SI-NEXT: s_waitcnt lgkmcnt(0)
963 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
964 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
965 ; SI-NEXT: s_mov_b32 s6, -1
966 ; SI-NEXT: s_mov_b32 s4, s0
967 ; SI-NEXT: s_mov_b32 s5, s1
968 ; SI-NEXT: s_waitcnt vmcnt(0)
969 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
970 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
971 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
972 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
973 ; SI-NEXT: s_waitcnt vmcnt(0)
974 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
975 ; SI-NEXT: s_waitcnt vmcnt(0)
978 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
980 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
981 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
982 ; VI-NEXT: s_waitcnt lgkmcnt(0)
983 ; VI-NEXT: v_mov_b32_e32 v1, s3
984 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
985 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
986 ; VI-NEXT: flat_load_dword v2, v[0:1]
987 ; VI-NEXT: v_mov_b32_e32 v0, s0
988 ; VI-NEXT: v_mov_b32_e32 v1, s1
989 ; VI-NEXT: s_waitcnt vmcnt(0)
990 ; VI-NEXT: v_ffbh_u32_e32 v3, v2
991 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
992 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
993 ; VI-NEXT: flat_store_dword v[0:1], v3
994 ; VI-NEXT: s_waitcnt vmcnt(0)
995 ; VI-NEXT: flat_store_byte v[0:1], v2
996 ; VI-NEXT: s_waitcnt vmcnt(0)
999 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1001 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1003 ; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[]
1004 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
1005 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T2.X
1007 ; EG-NEXT: Fetch clause starting at 6:
1008 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1009 ; EG-NEXT: ALU clause starting at 8:
1010 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1011 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1012 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1013 ; EG-NEXT: ALU clause starting at 11:
1014 ; EG-NEXT: SETE_INT * T0.W, T0.X, 0.0,
1015 ; EG-NEXT: AND_INT T1.X, PV.W, 1,
1016 ; EG-NEXT: MOV * T1.W, literal.x,
1017 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1018 ; EG-NEXT: MOV T1.Y, 0.0,
1019 ; EG-NEXT: MOV * T1.Z, 0.0,
1020 ; EG-NEXT: MOV T2.X, literal.x,
1021 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1022 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
1023 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
1024 ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.y,
1025 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1027 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1028 ; GFX9-GISEL: ; %bb.0:
1029 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1030 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1031 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1032 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1033 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1034 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1035 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0
1036 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1037 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
1038 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1039 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1040 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1041 ; GFX9-GISEL-NEXT: global_store_byte v[0:1], v2, off
1042 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1043 ; GFX9-GISEL-NEXT: s_endpgm
; Per-work-item i32 address: %valptr + workitem id (element-indexed).
1044 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1045 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1046 %val = load i32, ptr addrspace(1) %in.gep
1047 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
1048 %cmp = icmp eq i32 %val, 0
1049 %sel = select i1 %cmp, i32 -1, i32 %ctlz
1050 store volatile i32 %sel, ptr addrspace(1) %out
; Second use of %cmp: a volatile store to an undef address keeps the compare
; result live independently of the select.
1051 store volatile i1 %cmp, ptr addrspace(1) undef
1055 ; Selected on wrong constant
; i32 ctlz (zero-is-undefined flag set) where a zero input selects 0 instead
; of -1. Every set of expectations below keeps a conditional select after the
; find-first-bit instruction (v_cmp + v_cndmask_b32 on the amdgcn targets,
; CNDE_INT on EG).
1056 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1057 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1059 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1060 ; SI-NEXT: s_mov_b32 s7, 0xf000
1061 ; SI-NEXT: s_mov_b32 s10, 0
1062 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1063 ; SI-NEXT: v_mov_b32_e32 v1, 0
1064 ; SI-NEXT: s_mov_b32 s11, s7
1065 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1066 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1067 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1068 ; SI-NEXT: s_mov_b32 s6, -1
1069 ; SI-NEXT: s_mov_b32 s4, s0
1070 ; SI-NEXT: s_mov_b32 s5, s1
1071 ; SI-NEXT: s_waitcnt vmcnt(0)
1072 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
1073 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1074 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1075 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1078 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1080 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1081 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1082 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1083 ; VI-NEXT: v_mov_b32_e32 v1, s3
1084 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1085 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1086 ; VI-NEXT: flat_load_dword v0, v[0:1]
1087 ; VI-NEXT: s_waitcnt vmcnt(0)
1088 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
1089 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1090 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
1091 ; VI-NEXT: v_mov_b32_e32 v0, s0
1092 ; VI-NEXT: v_mov_b32_e32 v1, s1
1093 ; VI-NEXT: flat_store_dword v[0:1], v2
1096 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1098 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1100 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
1101 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1104 ; EG-NEXT: Fetch clause starting at 6:
1105 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1106 ; EG-NEXT: ALU clause starting at 8:
1107 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1108 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1109 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1110 ; EG-NEXT: ALU clause starting at 11:
1111 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1112 ; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W,
1113 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1114 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1116 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1117 ; GFX9-GISEL: ; %bb.0:
1118 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1119 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1120 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1121 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1122 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1123 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1124 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1125 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
1126 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1127 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1128 ; GFX9-GISEL-NEXT: s_endpgm
; Per-work-item i32 address: %valptr + workitem id (element-indexed).
1129 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1130 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1131 %val = load i32, ptr addrspace(1) %in.gep
1132 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; Zero input selects the constant 0; any other input selects the ctlz result.
1133 %cmp = icmp eq i32 %val, 0
1134 %sel = select i1 %cmp, i32 0, i32 %ctlz
1135 store i32 %sel, ptr addrspace(1) %out
1139 ; Selected on wrong constant
; i32 ctlz (zero-is-undefined flag set) with the compare written as icmp ne
; and the select operands in the matching order: a non-zero input selects the
; ctlz result, a zero input selects 0. Every set of expectations below keeps a
; conditional select after the find-first-bit instruction.
1140 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1141 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1143 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1144 ; SI-NEXT: s_mov_b32 s7, 0xf000
1145 ; SI-NEXT: s_mov_b32 s10, 0
1146 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1147 ; SI-NEXT: v_mov_b32_e32 v1, 0
1148 ; SI-NEXT: s_mov_b32 s11, s7
1149 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1150 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1151 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1152 ; SI-NEXT: s_mov_b32 s6, -1
1153 ; SI-NEXT: s_mov_b32 s4, s0
1154 ; SI-NEXT: s_mov_b32 s5, s1
1155 ; SI-NEXT: s_waitcnt vmcnt(0)
1156 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
1157 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1158 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1159 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1162 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1164 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1165 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1166 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1167 ; VI-NEXT: v_mov_b32_e32 v1, s3
1168 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1169 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1170 ; VI-NEXT: flat_load_dword v0, v[0:1]
1171 ; VI-NEXT: s_waitcnt vmcnt(0)
1172 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
1173 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1174 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
1175 ; VI-NEXT: v_mov_b32_e32 v0, s0
1176 ; VI-NEXT: v_mov_b32_e32 v1, s1
1177 ; VI-NEXT: flat_store_dword v[0:1], v2
1180 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1182 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1184 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
1185 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1188 ; EG-NEXT: Fetch clause starting at 6:
1189 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1190 ; EG-NEXT: ALU clause starting at 8:
1191 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1192 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1193 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1194 ; EG-NEXT: ALU clause starting at 11:
1195 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1196 ; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W,
1197 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1198 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1200 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1201 ; GFX9-GISEL: ; %bb.0:
1202 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1203 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1204 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1205 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1206 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1207 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1208 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1209 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1210 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1211 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1212 ; GFX9-GISEL-NEXT: s_endpgm
; Per-work-item i32 address: %valptr + workitem id (element-indexed).
1213 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1214 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1215 %val = load i32, ptr addrspace(1) %in.gep
1216 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; Non-zero input selects the ctlz result; a zero input selects the constant 0.
1217 %cmp = icmp ne i32 %val, 0
1218 %sel = select i1 %cmp, i32 %ctlz, i32 0
1219 store i32 %sel, ptr addrspace(1) %out
1223 ; Compare on wrong constant
; i32 ctlz (zero-is-undefined flag set) where the select condition compares
; the input against 1 rather than 0, and the constant arm of the select is 0.
; Every set of expectations below keeps a compare against 1 and a conditional
; select after the find-first-bit instruction.
1224 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1225 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
1227 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1228 ; SI-NEXT: s_mov_b32 s7, 0xf000
1229 ; SI-NEXT: s_mov_b32 s10, 0
1230 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1231 ; SI-NEXT: v_mov_b32_e32 v1, 0
1232 ; SI-NEXT: s_mov_b32 s11, s7
1233 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1234 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1235 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1236 ; SI-NEXT: s_mov_b32 s6, -1
1237 ; SI-NEXT: s_mov_b32 s4, s0
1238 ; SI-NEXT: s_mov_b32 s5, s1
1239 ; SI-NEXT: s_waitcnt vmcnt(0)
1240 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
1241 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
1242 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1243 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1246 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
1248 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1249 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1250 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1251 ; VI-NEXT: v_mov_b32_e32 v1, s3
1252 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1253 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1254 ; VI-NEXT: flat_load_dword v0, v[0:1]
1255 ; VI-NEXT: s_waitcnt vmcnt(0)
1256 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
1257 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
1258 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
1259 ; VI-NEXT: v_mov_b32_e32 v0, s0
1260 ; VI-NEXT: v_mov_b32_e32 v1, s1
1261 ; VI-NEXT: flat_store_dword v[0:1], v2
1264 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
1266 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1268 ; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
1269 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1272 ; EG-NEXT: Fetch clause starting at 6:
1273 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1274 ; EG-NEXT: ALU clause starting at 8:
1275 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1276 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1277 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1278 ; EG-NEXT: ALU clause starting at 11:
1279 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
1280 ; EG-NEXT: SETE_INT * T1.W, T0.X, 1,
1281 ; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
1282 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1283 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1285 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
1286 ; GFX9-GISEL: ; %bb.0:
1287 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1288 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1289 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1290 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1291 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1292 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1293 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
1294 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
1295 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1296 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1297 ; GFX9-GISEL-NEXT: s_endpgm
; Per-work-item i32 address: %valptr + workitem id (element-indexed).
1298 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1299 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1300 %val = load i32, ptr addrspace(1) %in.gep
1301 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; Input equal to 1 selects the constant 0; any other input selects ctlz.
1302 %cmp = icmp eq i32 %val, 1
1303 %sel = select i1 %cmp, i32 0, i32 %ctlz
1304 store i32 %sel, ptr addrspace(1) %out
1308 ; Compare on wrong constant (1 instead of 0); the select constant is also 0 rather than -1
; i32 ctlz (zero-is-undefined flag set) where the select condition is
; icmp ne against 1 rather than 0: an input other than 1 selects the ctlz
; result, an input of 1 selects the constant 0. Every set of expectations
; below keeps a compare against 1 and a conditional select after the
; find-first-bit instruction.
1309 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1310 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
1312 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1313 ; SI-NEXT: s_mov_b32 s7, 0xf000
1314 ; SI-NEXT: s_mov_b32 s10, 0
1315 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1316 ; SI-NEXT: v_mov_b32_e32 v1, 0
1317 ; SI-NEXT: s_mov_b32 s11, s7
1318 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1319 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1320 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1321 ; SI-NEXT: s_mov_b32 s6, -1
1322 ; SI-NEXT: s_mov_b32 s4, s0
1323 ; SI-NEXT: s_mov_b32 s5, s1
1324 ; SI-NEXT: s_waitcnt vmcnt(0)
1325 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
1326 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
1327 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1328 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1331 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
1333 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1334 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1335 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1336 ; VI-NEXT: v_mov_b32_e32 v1, s3
1337 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1338 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1339 ; VI-NEXT: flat_load_dword v0, v[0:1]
1340 ; VI-NEXT: s_waitcnt vmcnt(0)
1341 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
1342 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
1343 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
1344 ; VI-NEXT: v_mov_b32_e32 v0, s0
1345 ; VI-NEXT: v_mov_b32_e32 v1, s1
1346 ; VI-NEXT: flat_store_dword v[0:1], v2
1349 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
1351 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1353 ; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
1354 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1357 ; EG-NEXT: Fetch clause starting at 6:
1358 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1359 ; EG-NEXT: ALU clause starting at 8:
1360 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1361 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1362 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1363 ; EG-NEXT: ALU clause starting at 11:
1364 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
1365 ; EG-NEXT: SETNE_INT * T1.W, T0.X, 1,
1366 ; EG-NEXT: CNDE_INT T0.X, PS, 0.0, PV.W,
1367 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1368 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1370 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
1371 ; GFX9-GISEL: ; %bb.0:
1372 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1373 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1374 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1375 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1376 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1377 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1378 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
1379 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1380 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1381 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1382 ; GFX9-GISEL-NEXT: s_endpgm
; Per-work-item i32 address: %valptr + workitem id (element-indexed).
1383 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1384 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1385 %val = load i32, ptr addrspace(1) %in.gep
1386 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; Input other than 1 selects ctlz; an input of 1 selects the constant 0.
1387 %cmp = icmp ne i32 %val, 1
1388 %sel = select i1 %cmp, i32 %ctlz, i32 0
1389 store i32 %sel, ptr addrspace(1) %out