1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
5 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
; Declarations of the llvm.ctlz intrinsic overloads (scalar and vector forms
; from i7 up to i64) plus the AMDGPU workitem-id intrinsic. Per the LLVM
; LangRef the trailing i1 operand of llvm.ctlz selects whether a zero input
; produces poison. Not every overload declared here is referenced in the
; portion of the file that follows these declarations.
7 declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
8 declare <2 x i7> @llvm.ctlz.v2i7(<2 x i7>, i1) nounwind readnone
9 declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
10 declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone
12 declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
13 declare i18 @llvm.ctlz.i18(i18, i1) nounwind readnone
15 declare <2 x i16> @llvm.ctlz.v2i16(<2 x i16>, i1) nounwind readnone
16 declare <3 x i16> @llvm.ctlz.v3i16(<3 x i16>, i1) nounwind readnone
17 declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
19 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
20 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
21 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
23 declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
24 declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
25 declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
; Returns the x component of the workitem id; the v_* kernels use it to index
; their input pointer.
27 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Kernel: calls llvm.ctlz.i32 (with the i1 operand set to true) directly on
; the i32 kernel argument %val and stores the i32 result to %out.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
29 define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
30 ; SI-LABEL: s_ctlz_zero_undef_i32:
32 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
33 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
34 ; SI-NEXT: s_mov_b32 s3, 0xf000
35 ; SI-NEXT: s_waitcnt lgkmcnt(0)
36 ; SI-NEXT: s_flbit_i32_b32 s4, s2
37 ; SI-NEXT: s_mov_b32 s2, -1
38 ; SI-NEXT: v_mov_b32_e32 v0, s4
39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
42 ; VI-LABEL: s_ctlz_zero_undef_i32:
44 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
45 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
46 ; VI-NEXT: s_waitcnt lgkmcnt(0)
47 ; VI-NEXT: s_flbit_i32_b32 s2, s2
48 ; VI-NEXT: v_mov_b32_e32 v0, s0
49 ; VI-NEXT: v_mov_b32_e32 v1, s1
50 ; VI-NEXT: v_mov_b32_e32 v2, s2
51 ; VI-NEXT: flat_store_dword v[0:1], v2
54 ; EG-LABEL: s_ctlz_zero_undef_i32:
56 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
57 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
60 ; EG-NEXT: ALU clause starting at 4:
61 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
62 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
63 ; EG-NEXT: FFBH_UINT * T1.X, KC0[2].Z,
65 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32:
66 ; GFX9-GISEL: ; %bb.0:
67 ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
68 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
69 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
70 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
71 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2
72 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
73 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
74 ; GFX9-GISEL-NEXT: s_endpgm
75 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
76 store i32 %ctlz, ptr addrspace(1) %out, align 4
; Kernel: loads an i32 from %valptr at the element selected by the workitem
; id, calls llvm.ctlz.i32 on it (i1 operand true) and stores the result to
; %out. Unlike the s_ variant, the operand comes from memory rather than from
; a kernel argument.
80 define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
81 ; SI-LABEL: v_ctlz_zero_undef_i32:
83 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
84 ; SI-NEXT: s_mov_b32 s7, 0xf000
85 ; SI-NEXT: s_mov_b32 s10, 0
86 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
87 ; SI-NEXT: v_mov_b32_e32 v1, 0
88 ; SI-NEXT: s_mov_b32 s11, s7
89 ; SI-NEXT: s_waitcnt lgkmcnt(0)
90 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
91 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
92 ; SI-NEXT: s_mov_b32 s6, -1
93 ; SI-NEXT: s_mov_b32 s4, s0
94 ; SI-NEXT: s_mov_b32 s5, s1
95 ; SI-NEXT: s_waitcnt vmcnt(0)
96 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
97 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
100 ; VI-LABEL: v_ctlz_zero_undef_i32:
102 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
103 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
104 ; VI-NEXT: s_waitcnt lgkmcnt(0)
105 ; VI-NEXT: v_mov_b32_e32 v1, s3
106 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
107 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
108 ; VI-NEXT: flat_load_dword v0, v[0:1]
109 ; VI-NEXT: s_waitcnt vmcnt(0)
110 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
111 ; VI-NEXT: v_mov_b32_e32 v0, s0
112 ; VI-NEXT: v_mov_b32_e32 v1, s1
113 ; VI-NEXT: flat_store_dword v[0:1], v2
116 ; EG-LABEL: v_ctlz_zero_undef_i32:
118 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
120 ; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
121 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
124 ; EG-NEXT: Fetch clause starting at 6:
125 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
126 ; EG-NEXT: ALU clause starting at 8:
127 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
128 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
129 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
130 ; EG-NEXT: ALU clause starting at 11:
131 ; EG-NEXT: FFBH_UINT T0.X, T0.X,
132 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
133 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
135 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
136 ; GFX9-GISEL: ; %bb.0:
137 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
138 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
139 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
140 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
141 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
142 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
143 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
144 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
145 ; GFX9-GISEL-NEXT: s_endpgm
146 %tid = call i32 @llvm.amdgcn.workitem.id.x()
147 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
148 %val = load i32, ptr addrspace(1) %in.gep, align 4
149 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
150 store i32 %ctlz, ptr addrspace(1) %out, align 4
; Kernel: two-element vector form. Loads a <2 x i32> from %valptr at the
; element selected by the workitem id, calls llvm.ctlz.v2i32 on it (i1 operand
; true) and stores the <2 x i32> result to %out.
154 define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
155 ; SI-LABEL: v_ctlz_zero_undef_v2i32:
157 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
158 ; SI-NEXT: s_mov_b32 s7, 0xf000
159 ; SI-NEXT: s_mov_b32 s10, 0
160 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
161 ; SI-NEXT: v_mov_b32_e32 v1, 0
162 ; SI-NEXT: s_mov_b32 s11, s7
163 ; SI-NEXT: s_waitcnt lgkmcnt(0)
164 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
165 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
166 ; SI-NEXT: s_mov_b32 s6, -1
167 ; SI-NEXT: s_mov_b32 s4, s0
168 ; SI-NEXT: s_mov_b32 s5, s1
169 ; SI-NEXT: s_waitcnt vmcnt(0)
170 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
171 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
172 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
175 ; VI-LABEL: v_ctlz_zero_undef_v2i32:
177 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
178 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
179 ; VI-NEXT: s_waitcnt lgkmcnt(0)
180 ; VI-NEXT: v_mov_b32_e32 v1, s3
181 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
182 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
183 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
184 ; VI-NEXT: v_mov_b32_e32 v3, s1
185 ; VI-NEXT: v_mov_b32_e32 v2, s0
186 ; VI-NEXT: s_waitcnt vmcnt(0)
187 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
188 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
189 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
192 ; EG-LABEL: v_ctlz_zero_undef_v2i32:
194 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
196 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
197 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
200 ; EG-NEXT: Fetch clause starting at 6:
201 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
202 ; EG-NEXT: ALU clause starting at 8:
203 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
204 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
205 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
206 ; EG-NEXT: ALU clause starting at 11:
207 ; EG-NEXT: FFBH_UINT * T0.Y, T0.Y,
208 ; EG-NEXT: FFBH_UINT T0.X, T0.X,
209 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
210 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
212 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
213 ; GFX9-GISEL: ; %bb.0:
214 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
215 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
216 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
217 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
218 ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
219 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
220 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
221 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
222 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
223 ; GFX9-GISEL-NEXT: s_endpgm
224 %tid = call i32 @llvm.amdgcn.workitem.id.x()
225 %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
226 %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
227 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
228 store <2 x i32> %ctlz, ptr addrspace(1) %out, align 8
; Kernel: four-element vector form. Loads a <4 x i32> from %valptr at the
; element selected by the workitem id, calls llvm.ctlz.v4i32 on it (i1 operand
; true) and stores the <4 x i32> result to %out.
232 define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
233 ; SI-LABEL: v_ctlz_zero_undef_v4i32:
235 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
236 ; SI-NEXT: s_mov_b32 s7, 0xf000
237 ; SI-NEXT: s_mov_b32 s10, 0
238 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
239 ; SI-NEXT: v_mov_b32_e32 v1, 0
240 ; SI-NEXT: s_mov_b32 s11, s7
241 ; SI-NEXT: s_waitcnt lgkmcnt(0)
242 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
243 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
244 ; SI-NEXT: s_mov_b32 s6, -1
245 ; SI-NEXT: s_mov_b32 s4, s0
246 ; SI-NEXT: s_mov_b32 s5, s1
247 ; SI-NEXT: s_waitcnt vmcnt(0)
248 ; SI-NEXT: v_ffbh_u32_e32 v3, v3
249 ; SI-NEXT: v_ffbh_u32_e32 v2, v2
250 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
251 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
252 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
255 ; VI-LABEL: v_ctlz_zero_undef_v4i32:
257 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
258 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
259 ; VI-NEXT: s_waitcnt lgkmcnt(0)
260 ; VI-NEXT: v_mov_b32_e32 v1, s3
261 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
262 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
263 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
264 ; VI-NEXT: v_mov_b32_e32 v5, s1
265 ; VI-NEXT: v_mov_b32_e32 v4, s0
266 ; VI-NEXT: s_waitcnt vmcnt(0)
267 ; VI-NEXT: v_ffbh_u32_e32 v3, v3
268 ; VI-NEXT: v_ffbh_u32_e32 v2, v2
269 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
270 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
271 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
274 ; EG-LABEL: v_ctlz_zero_undef_v4i32:
276 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
278 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
279 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
282 ; EG-NEXT: Fetch clause starting at 6:
283 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
284 ; EG-NEXT: ALU clause starting at 8:
285 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
286 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
287 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
288 ; EG-NEXT: ALU clause starting at 11:
289 ; EG-NEXT: FFBH_UINT * T0.W, T0.W,
290 ; EG-NEXT: FFBH_UINT * T0.Z, T0.Z,
291 ; EG-NEXT: FFBH_UINT * T0.Y, T0.Y,
292 ; EG-NEXT: FFBH_UINT T0.X, T0.X,
293 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
294 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
296 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
297 ; GFX9-GISEL: ; %bb.0:
298 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
299 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
300 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0
301 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
302 ; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
303 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
304 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
305 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
306 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
307 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3
308 ; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
309 ; GFX9-GISEL-NEXT: s_endpgm
310 %tid = call i32 @llvm.amdgcn.workitem.id.x()
311 %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
312 %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
313 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
314 store <4 x i32> %ctlz, ptr addrspace(1) %out, align 16
; Kernel: calls llvm.ctlz.i8 (i1 operand true) on the i8 kernel argument %val,
; then builds a select that picks the ctlz result when %val is non-zero and
; the constant 32 otherwise. The value actually stored to %out is the raw
; ctlz result, not the select (see the note above the store).
318 define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind {
319 ; SI-LABEL: s_ctlz_zero_undef_i8_with_select:
321 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
322 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
323 ; SI-NEXT: s_mov_b32 s3, 0xf000
324 ; SI-NEXT: s_waitcnt lgkmcnt(0)
325 ; SI-NEXT: s_lshl_b32 s2, s2, 24
326 ; SI-NEXT: s_flbit_i32_b32 s4, s2
327 ; SI-NEXT: s_mov_b32 s2, -1
328 ; SI-NEXT: v_mov_b32_e32 v0, s4
329 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
332 ; VI-LABEL: s_ctlz_zero_undef_i8_with_select:
334 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
335 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
336 ; VI-NEXT: s_waitcnt lgkmcnt(0)
337 ; VI-NEXT: s_lshl_b32 s2, s2, 24
338 ; VI-NEXT: s_flbit_i32_b32 s2, s2
339 ; VI-NEXT: v_mov_b32_e32 v0, s0
340 ; VI-NEXT: v_mov_b32_e32 v1, s1
341 ; VI-NEXT: v_mov_b32_e32 v2, s2
342 ; VI-NEXT: flat_store_byte v[0:1], v2
345 ; EG-LABEL: s_ctlz_zero_undef_i8_with_select:
347 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
349 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
350 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
353 ; EG-NEXT: Fetch clause starting at 6:
354 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
355 ; EG-NEXT: ALU clause starting at 8:
356 ; EG-NEXT: MOV * T0.X, 0.0,
357 ; EG-NEXT: ALU clause starting at 9:
358 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
359 ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
360 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
361 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
362 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
363 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
364 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
365 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
366 ; EG-NEXT: LSHL T0.X, PV.W, PS,
367 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
368 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
369 ; EG-NEXT: MOV T0.Y, 0.0,
370 ; EG-NEXT: MOV * T0.Z, 0.0,
371 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
372 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
374 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select:
375 ; GFX9-GISEL: ; %bb.0:
376 ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
377 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
378 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
379 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
380 ; GFX9-GISEL-NEXT: s_lshl_b32 s2, s2, 24
381 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2
382 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
383 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
384 ; GFX9-GISEL-NEXT: s_endpgm
385 %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
386 %ctlz_ret = icmp ne i8 %val, 0
387 %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32
; NOTE(review): the store below writes %ctlz, so %ctlz_ret and %ret have no
; users in the visible IR and the visible assertions contain no compare or
; select. Confirm whether %ret was meant to be stored; changing it requires
; regenerating the assertions.
388 store i8 %ctlz, ptr addrspace(1) %out, align 4
; Kernel: calls llvm.ctlz.i16 (i1 operand true) on the i16 kernel argument
; %val, then builds a select that picks the ctlz result when %val is non-zero
; and the constant 32 otherwise. The value actually stored to %out is the raw
; ctlz result, not the select (see the note above the store).
392 define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind {
393 ; SI-LABEL: s_ctlz_zero_undef_i16_with_select:
395 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
396 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
397 ; SI-NEXT: s_mov_b32 s3, 0xf000
398 ; SI-NEXT: s_waitcnt lgkmcnt(0)
399 ; SI-NEXT: s_lshl_b32 s2, s2, 16
400 ; SI-NEXT: s_flbit_i32_b32 s4, s2
401 ; SI-NEXT: s_mov_b32 s2, -1
402 ; SI-NEXT: v_mov_b32_e32 v0, s4
403 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
406 ; VI-LABEL: s_ctlz_zero_undef_i16_with_select:
408 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
409 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
410 ; VI-NEXT: s_waitcnt lgkmcnt(0)
411 ; VI-NEXT: s_lshl_b32 s2, s2, 16
412 ; VI-NEXT: s_flbit_i32_b32 s2, s2
413 ; VI-NEXT: v_mov_b32_e32 v0, s0
414 ; VI-NEXT: v_mov_b32_e32 v1, s1
415 ; VI-NEXT: v_mov_b32_e32 v2, s2
416 ; VI-NEXT: flat_store_short v[0:1], v2
419 ; EG-LABEL: s_ctlz_zero_undef_i16_with_select:
421 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
423 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
424 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
427 ; EG-NEXT: Fetch clause starting at 6:
428 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
429 ; EG-NEXT: ALU clause starting at 8:
430 ; EG-NEXT: MOV * T0.X, 0.0,
431 ; EG-NEXT: ALU clause starting at 9:
432 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
433 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
434 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
435 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
436 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
437 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
438 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
439 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
440 ; EG-NEXT: LSHL T0.X, PV.W, PS,
441 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
442 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
443 ; EG-NEXT: MOV T0.Y, 0.0,
444 ; EG-NEXT: MOV * T0.Z, 0.0,
445 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
446 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
448 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select:
449 ; GFX9-GISEL: ; %bb.0:
450 ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
451 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
452 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
453 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
454 ; GFX9-GISEL-NEXT: s_lshl_b32 s2, s2, 16
455 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2
456 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
457 ; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1]
458 ; GFX9-GISEL-NEXT: s_endpgm
459 %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
460 %ctlz_ret = icmp ne i16 %val, 0
461 %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32
; NOTE(review): the store below writes %ctlz, so %ctlz_ret and %ret have no
; users in the visible IR and the visible assertions contain no compare or
; select. Confirm whether %ret was meant to be stored; changing it requires
; regenerating the assertions.
462 store i16 %ctlz, ptr addrspace(1) %out, align 4
; Kernel: calls llvm.ctlz.i32 (i1 operand true) on the i32 kernel argument
; %val, then builds a select that picks the ctlz result when %val is non-zero
; and the constant 32 otherwise. The value actually stored to %out is the raw
; ctlz result, not the select (see the note above the store).
466 define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind {
467 ; SI-LABEL: s_ctlz_zero_undef_i32_with_select:
469 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
470 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
471 ; SI-NEXT: s_mov_b32 s3, 0xf000
472 ; SI-NEXT: s_waitcnt lgkmcnt(0)
473 ; SI-NEXT: s_flbit_i32_b32 s4, s2
474 ; SI-NEXT: s_mov_b32 s2, -1
475 ; SI-NEXT: v_mov_b32_e32 v0, s4
476 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
479 ; VI-LABEL: s_ctlz_zero_undef_i32_with_select:
481 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
482 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
483 ; VI-NEXT: s_waitcnt lgkmcnt(0)
484 ; VI-NEXT: s_flbit_i32_b32 s2, s2
485 ; VI-NEXT: v_mov_b32_e32 v0, s0
486 ; VI-NEXT: v_mov_b32_e32 v1, s1
487 ; VI-NEXT: v_mov_b32_e32 v2, s2
488 ; VI-NEXT: flat_store_dword v[0:1], v2
491 ; EG-LABEL: s_ctlz_zero_undef_i32_with_select:
493 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
494 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
497 ; EG-NEXT: ALU clause starting at 4:
498 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
499 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
500 ; EG-NEXT: FFBH_UINT * T1.X, KC0[2].Z,
502 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select:
503 ; GFX9-GISEL: ; %bb.0:
504 ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
505 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
506 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
507 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
508 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2
509 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
510 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
511 ; GFX9-GISEL-NEXT: s_endpgm
512 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
513 %ctlz_ret = icmp ne i32 %val, 0
514 %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32
; NOTE(review): the store below writes %ctlz, so %ctlz_ret and %ret have no
; users in the visible IR and the visible assertions contain no compare or
; select. Confirm whether %ret was meant to be stored; changing it requires
; regenerating the assertions.
515 store i32 %ctlz, ptr addrspace(1) %out, align 4
; Kernel: calls llvm.ctlz.i64 (i1 operand true) on the i64 kernel argument
; %val, then builds a select that picks the ctlz result when %val is non-zero
; and the constant 32 otherwise. The value actually stored to %out is the raw
; ctlz result, not the select (see the note above the store).
519 define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind {
520 ; SI-LABEL: s_ctlz_zero_undef_i64_with_select:
522 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
523 ; SI-NEXT: s_mov_b32 s7, 0xf000
524 ; SI-NEXT: s_mov_b32 s6, -1
525 ; SI-NEXT: s_waitcnt lgkmcnt(0)
526 ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3]
527 ; SI-NEXT: v_mov_b32_e32 v1, 0
528 ; SI-NEXT: s_mov_b32 s4, s0
529 ; SI-NEXT: s_mov_b32 s5, s1
530 ; SI-NEXT: v_mov_b32_e32 v0, s2
531 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
534 ; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
536 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
537 ; VI-NEXT: v_mov_b32_e32 v1, 0
538 ; VI-NEXT: s_waitcnt lgkmcnt(0)
539 ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
540 ; VI-NEXT: v_mov_b32_e32 v3, s1
541 ; VI-NEXT: v_mov_b32_e32 v0, s2
542 ; VI-NEXT: v_mov_b32_e32 v2, s0
543 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
546 ; EG-LABEL: s_ctlz_zero_undef_i64_with_select:
548 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
549 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
552 ; EG-NEXT: ALU clause starting at 4:
553 ; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W,
554 ; EG-NEXT: FFBH_UINT T1.W, KC0[3].X,
555 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
556 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
557 ; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W,
558 ; EG-NEXT: MOV T0.Y, 0.0,
559 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
560 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
562 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select:
563 ; GFX9-GISEL: ; %bb.0:
564 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
565 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0
566 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
567 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
568 ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[2:3]
569 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
570 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
571 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
572 ; GFX9-GISEL-NEXT: s_endpgm
573 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
574 %ctlz_ret = icmp ne i64 %val, 0
; NOTE(review): the fallback constant is 32 although the operand is 64 bits
; wide - TODO confirm this is intentional.
575 %ret = select i1 %ctlz_ret, i64 %ctlz, i64 32
; NOTE(review): the store below writes %ctlz, so %ctlz_ret and %ret have no
; users in the visible IR. Confirm whether %ret was meant to be stored;
; changing it requires regenerating the assertions.
576 store i64 %ctlz, ptr addrspace(1) %out, align 4
; Kernel: loads an i8 from %arrayidx, calls llvm.ctlz.i8 on it (i1 operand
; true), and stores to %out a select that yields the ctlz result when the
; loaded value is non-zero and the constant 32 otherwise. Here the select
; result (%ret) is the stored value, and the assertions below include the
; corresponding compare/conditional-move sequences.
580 define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
581 ; SI-LABEL: v_ctlz_zero_undef_i8_with_select:
583 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
584 ; SI-NEXT: s_mov_b32 s7, 0xf000
585 ; SI-NEXT: s_mov_b32 s6, -1
586 ; SI-NEXT: s_mov_b32 s10, s6
587 ; SI-NEXT: s_mov_b32 s11, s7
588 ; SI-NEXT: s_waitcnt lgkmcnt(0)
589 ; SI-NEXT: s_mov_b32 s8, s2
590 ; SI-NEXT: s_mov_b32 s9, s3
591 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
592 ; SI-NEXT: s_mov_b32 s4, s0
593 ; SI-NEXT: s_mov_b32 s5, s1
594 ; SI-NEXT: s_waitcnt vmcnt(0)
595 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
596 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
597 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
598 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
599 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
602 ; VI-LABEL: v_ctlz_zero_undef_i8_with_select:
604 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
605 ; VI-NEXT: s_waitcnt lgkmcnt(0)
606 ; VI-NEXT: v_mov_b32_e32 v0, s2
607 ; VI-NEXT: v_mov_b32_e32 v1, s3
608 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
609 ; VI-NEXT: s_waitcnt vmcnt(0)
610 ; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
611 ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
612 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
613 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
614 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
615 ; VI-NEXT: v_mov_b32_e32 v0, s0
616 ; VI-NEXT: v_mov_b32_e32 v1, s1
617 ; VI-NEXT: flat_store_byte v[0:1], v2
620 ; EG-LABEL: v_ctlz_zero_undef_i8_with_select:
622 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
624 ; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
625 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
628 ; EG-NEXT: Fetch clause starting at 6:
629 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
630 ; EG-NEXT: ALU clause starting at 8:
631 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
632 ; EG-NEXT: ALU clause starting at 9:
633 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
634 ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
635 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
636 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
637 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
638 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
639 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
640 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
641 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
642 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
643 ; EG-NEXT: LSHL T0.X, PV.W, PS,
644 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
645 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
646 ; EG-NEXT: MOV T0.Y, 0.0,
647 ; EG-NEXT: MOV * T0.Z, 0.0,
648 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
649 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
651 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select:
652 ; GFX9-GISEL: ; %bb.0:
653 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
654 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
655 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
656 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
657 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
658 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1
659 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
660 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
661 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
662 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
663 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
664 ; GFX9-GISEL-NEXT: s_endpgm
665 %val = load i8, ptr addrspace(1) %arrayidx, align 1
666 %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
667 %ctlz_ret = icmp ne i8 %val, 0
668 %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32
669 store i8 %ret, ptr addrspace(1) %out, align 4
; Kernel: loads an i16 from %arrayidx with only 1-byte alignment, calls
; llvm.ctlz.i16 on it (i1 operand true), and stores to %out a select that
; yields the ctlz result when the loaded value is non-zero and the constant 32
; otherwise. Here the select result (%ret) is the stored value.
673 define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
674 ; SI-LABEL: v_ctlz_zero_undef_i16_with_select:
676 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
677 ; SI-NEXT: s_mov_b32 s7, 0xf000
678 ; SI-NEXT: s_mov_b32 s6, -1
679 ; SI-NEXT: s_mov_b32 s10, s6
680 ; SI-NEXT: s_mov_b32 s11, s7
681 ; SI-NEXT: s_waitcnt lgkmcnt(0)
682 ; SI-NEXT: s_mov_b32 s8, s2
683 ; SI-NEXT: s_mov_b32 s9, s3
684 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
685 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
686 ; SI-NEXT: s_mov_b32 s4, s0
687 ; SI-NEXT: s_mov_b32 s5, s1
688 ; SI-NEXT: s_waitcnt vmcnt(1)
689 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
690 ; SI-NEXT: s_waitcnt vmcnt(0)
691 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
692 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
693 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
694 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
695 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
696 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
699 ; VI-LABEL: v_ctlz_zero_undef_i16_with_select:
701 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
702 ; VI-NEXT: s_waitcnt lgkmcnt(0)
703 ; VI-NEXT: s_add_u32 s4, s2, 1
704 ; VI-NEXT: s_addc_u32 s5, s3, 0
705 ; VI-NEXT: v_mov_b32_e32 v2, s4
706 ; VI-NEXT: v_mov_b32_e32 v0, s2
707 ; VI-NEXT: v_mov_b32_e32 v3, s5
708 ; VI-NEXT: v_mov_b32_e32 v1, s3
709 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
710 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
711 ; VI-NEXT: s_waitcnt vmcnt(1)
712 ; VI-NEXT: v_readfirstlane_b32 s2, v2
713 ; VI-NEXT: s_waitcnt vmcnt(0)
714 ; VI-NEXT: v_readfirstlane_b32 s3, v0
715 ; VI-NEXT: s_lshl_b32 s2, s2, 8
716 ; VI-NEXT: s_or_b32 s2, s2, s3
717 ; VI-NEXT: s_lshl_b32 s3, s2, 16
718 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
719 ; VI-NEXT: s_flbit_i32_b32 s3, s3
720 ; VI-NEXT: s_cmp_lg_u32 s2, 0
721 ; VI-NEXT: s_cselect_b32 s2, s3, 32
722 ; VI-NEXT: v_mov_b32_e32 v0, s0
723 ; VI-NEXT: v_mov_b32_e32 v1, s1
724 ; VI-NEXT: v_mov_b32_e32 v2, s2
725 ; VI-NEXT: flat_store_short v[0:1], v2
728 ; EG-LABEL: v_ctlz_zero_undef_i16_with_select:
730 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
732 ; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
733 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
736 ; EG-NEXT: Fetch clause starting at 6:
737 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
738 ; EG-NEXT: ALU clause starting at 8:
739 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
740 ; EG-NEXT: ALU clause starting at 9:
741 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
742 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
743 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
744 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
745 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
746 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
747 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
748 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
749 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
750 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
751 ; EG-NEXT: LSHL T0.X, PV.W, PS,
752 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
753 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
754 ; EG-NEXT: MOV T0.Y, 0.0,
755 ; EG-NEXT: MOV * T0.Z, 0.0,
756 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
757 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
759 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select:
760 ; GFX9-GISEL: ; %bb.0:
761 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
762 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
763 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
764 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
765 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
766 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
767 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
768 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1
769 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
770 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
771 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
772 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
773 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
774 ; GFX9-GISEL-NEXT: s_endpgm
775 %val = load i16, ptr addrspace(1) %arrayidx, align 1
776 %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
777 %ctlz_ret = icmp ne i16 %val, 0
778 %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32
779 store i16 %ret, ptr addrspace(1) %out, align 4
783 define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
784 ; SI-LABEL: v_ctlz_zero_undef_i32_with_select:
786 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
787 ; SI-NEXT: s_mov_b32 s7, 0xf000
788 ; SI-NEXT: s_mov_b32 s6, -1
789 ; SI-NEXT: s_mov_b32 s10, s6
790 ; SI-NEXT: s_mov_b32 s11, s7
791 ; SI-NEXT: s_waitcnt lgkmcnt(0)
792 ; SI-NEXT: s_mov_b32 s8, s2
793 ; SI-NEXT: s_mov_b32 s9, s3
794 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
795 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3
796 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
797 ; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2
798 ; SI-NEXT: s_mov_b32 s4, s0
799 ; SI-NEXT: s_mov_b32 s5, s1
800 ; SI-NEXT: s_waitcnt vmcnt(3)
801 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
802 ; SI-NEXT: s_waitcnt vmcnt(2)
803 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
804 ; SI-NEXT: s_waitcnt vmcnt(1)
805 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
806 ; SI-NEXT: s_waitcnt vmcnt(0)
807 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
808 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
809 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
810 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
811 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
812 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
815 ; VI-LABEL: v_ctlz_zero_undef_i32_with_select:
817 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
818 ; VI-NEXT: s_waitcnt lgkmcnt(0)
819 ; VI-NEXT: s_add_u32 s4, s2, 3
820 ; VI-NEXT: s_addc_u32 s5, s3, 0
821 ; VI-NEXT: v_mov_b32_e32 v2, s4
822 ; VI-NEXT: v_mov_b32_e32 v3, s5
823 ; VI-NEXT: s_add_u32 s4, s2, 2
824 ; VI-NEXT: v_mov_b32_e32 v0, s2
825 ; VI-NEXT: s_addc_u32 s5, s3, 0
826 ; VI-NEXT: v_mov_b32_e32 v1, s3
827 ; VI-NEXT: s_add_u32 s2, s2, 1
828 ; VI-NEXT: s_addc_u32 s3, s3, 0
829 ; VI-NEXT: v_mov_b32_e32 v4, s4
830 ; VI-NEXT: v_mov_b32_e32 v7, s3
831 ; VI-NEXT: v_mov_b32_e32 v5, s5
832 ; VI-NEXT: v_mov_b32_e32 v6, s2
833 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
834 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
835 ; VI-NEXT: flat_load_ubyte v4, v[6:7]
836 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
837 ; VI-NEXT: s_waitcnt vmcnt(3)
838 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
839 ; VI-NEXT: s_waitcnt vmcnt(2)
840 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
841 ; VI-NEXT: s_waitcnt vmcnt(1)
842 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
843 ; VI-NEXT: s_waitcnt vmcnt(0)
844 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
845 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
846 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
847 ; VI-NEXT: v_min_u32_e32 v2, 32, v0
848 ; VI-NEXT: v_mov_b32_e32 v0, s0
849 ; VI-NEXT: v_mov_b32_e32 v1, s1
850 ; VI-NEXT: flat_store_dword v[0:1], v2
853 ; EG-LABEL: v_ctlz_zero_undef_i32_with_select:
855 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
857 ; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
858 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
861 ; EG-NEXT: Fetch clause starting at 6:
862 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
863 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
864 ; EG-NEXT: ALU clause starting at 10:
865 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
866 ; EG-NEXT: ALU clause starting at 11:
867 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
868 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
869 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
870 ; EG-NEXT: FFBH_UINT * T1.W, PV.W,
871 ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
872 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
873 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
875 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select:
876 ; GFX9-GISEL: ; %bb.0:
877 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
878 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
879 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
880 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
881 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
882 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
883 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
884 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
885 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
886 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
887 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
888 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
889 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
890 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
891 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
892 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
893 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
894 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
895 ; GFX9-GISEL-NEXT: s_endpgm
896 %val = load i32, ptr addrspace(1) %arrayidx, align 1
897 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
898 %ctlz_ret = icmp ne i32 %val, 0
899 %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32
900 store i32 %ret, ptr addrspace(1) %out, align 4
; Kernel under test. Loads an i64 from %arrayidx with only byte alignment
; (align 1), takes llvm.ctlz.i64 of it with the second operand set to i1 true,
; and uses an icmp/select pair to substitute 64 when the loaded value is zero.
; The selected value is stored to %out.
904 define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
905 ; SI-LABEL: v_ctlz_zero_undef_i64_with_select:
907 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
908 ; SI-NEXT: s_mov_b32 s3, 0xf000
909 ; SI-NEXT: s_mov_b32 s2, -1
910 ; SI-NEXT: s_mov_b32 s10, s2
911 ; SI-NEXT: s_mov_b32 s11, s3
912 ; SI-NEXT: s_waitcnt lgkmcnt(0)
913 ; SI-NEXT: s_mov_b32 s8, s6
914 ; SI-NEXT: s_mov_b32 s9, s7
915 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:5
916 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:7
917 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
918 ; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:1
919 ; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2
920 ; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:3
921 ; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4
922 ; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6
923 ; SI-NEXT: s_mov_b32 s0, s4
924 ; SI-NEXT: s_mov_b32 s1, s5
925 ; SI-NEXT: s_waitcnt vmcnt(7)
926 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
927 ; SI-NEXT: s_waitcnt vmcnt(6)
928 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
929 ; SI-NEXT: s_waitcnt vmcnt(4)
930 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
931 ; SI-NEXT: s_waitcnt vmcnt(2)
932 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
933 ; SI-NEXT: s_waitcnt vmcnt(1)
934 ; SI-NEXT: v_or_b32_e32 v0, v0, v6
935 ; SI-NEXT: s_waitcnt vmcnt(0)
936 ; SI-NEXT: v_or_b32_e32 v1, v1, v7
937 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
938 ; SI-NEXT: v_or_b32_e32 v3, v5, v4
939 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
940 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
941 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
942 ; SI-NEXT: v_or_b32_e32 v1, v3, v2
943 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
944 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
945 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1
946 ; SI-NEXT: v_min_u32_e32 v0, v1, v0
947 ; SI-NEXT: v_min_u32_e32 v0, 64, v0
948 ; SI-NEXT: v_mov_b32_e32 v1, 0
949 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
952 ; VI-LABEL: v_ctlz_zero_undef_i64_with_select:
954 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
955 ; VI-NEXT: s_waitcnt lgkmcnt(0)
956 ; VI-NEXT: s_add_u32 s4, s2, 5
957 ; VI-NEXT: s_addc_u32 s5, s3, 0
958 ; VI-NEXT: v_mov_b32_e32 v0, s4
959 ; VI-NEXT: v_mov_b32_e32 v1, s5
960 ; VI-NEXT: s_add_u32 s4, s2, 4
961 ; VI-NEXT: s_addc_u32 s5, s3, 0
962 ; VI-NEXT: v_mov_b32_e32 v2, s4
963 ; VI-NEXT: v_mov_b32_e32 v3, s5
964 ; VI-NEXT: s_add_u32 s4, s2, 7
965 ; VI-NEXT: s_addc_u32 s5, s3, 0
966 ; VI-NEXT: v_mov_b32_e32 v4, s4
967 ; VI-NEXT: v_mov_b32_e32 v5, s5
968 ; VI-NEXT: s_add_u32 s4, s2, 6
969 ; VI-NEXT: s_addc_u32 s5, s3, 0
970 ; VI-NEXT: v_mov_b32_e32 v7, s5
971 ; VI-NEXT: v_mov_b32_e32 v6, s4
972 ; VI-NEXT: s_add_u32 s4, s2, 3
973 ; VI-NEXT: s_addc_u32 s5, s3, 0
974 ; VI-NEXT: v_mov_b32_e32 v9, s5
975 ; VI-NEXT: v_mov_b32_e32 v8, s4
976 ; VI-NEXT: s_add_u32 s4, s2, 2
977 ; VI-NEXT: s_addc_u32 s5, s3, 0
978 ; VI-NEXT: v_mov_b32_e32 v11, s5
979 ; VI-NEXT: v_mov_b32_e32 v10, s4
980 ; VI-NEXT: s_add_u32 s4, s2, 1
981 ; VI-NEXT: flat_load_ubyte v12, v[0:1]
982 ; VI-NEXT: flat_load_ubyte v13, v[2:3]
983 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
984 ; VI-NEXT: flat_load_ubyte v5, v[6:7]
985 ; VI-NEXT: s_addc_u32 s5, s3, 0
986 ; VI-NEXT: v_mov_b32_e32 v0, s4
987 ; VI-NEXT: flat_load_ubyte v6, v[8:9]
988 ; VI-NEXT: v_mov_b32_e32 v2, s2
989 ; VI-NEXT: v_mov_b32_e32 v1, s5
990 ; VI-NEXT: v_mov_b32_e32 v3, s3
991 ; VI-NEXT: flat_load_ubyte v7, v[10:11]
992 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
993 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
994 ; VI-NEXT: v_mov_b32_e32 v1, 0
995 ; VI-NEXT: s_waitcnt vmcnt(7)
996 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12
997 ; VI-NEXT: s_waitcnt vmcnt(6)
998 ; VI-NEXT: v_or_b32_e32 v3, v3, v13
999 ; VI-NEXT: s_waitcnt vmcnt(5)
1000 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
1001 ; VI-NEXT: s_waitcnt vmcnt(4)
1002 ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1003 ; VI-NEXT: v_or_b32_e32 v3, v4, v3
1004 ; VI-NEXT: v_ffbh_u32_e32 v3, v3
1005 ; VI-NEXT: s_waitcnt vmcnt(3)
1006 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
1007 ; VI-NEXT: s_waitcnt vmcnt(2)
1008 ; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1009 ; VI-NEXT: s_waitcnt vmcnt(1)
1010 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1011 ; VI-NEXT: s_waitcnt vmcnt(0)
1012 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
1013 ; VI-NEXT: v_or_b32_e32 v0, v4, v0
1014 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1015 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1016 ; VI-NEXT: v_min_u32_e32 v0, v0, v3
1017 ; VI-NEXT: v_mov_b32_e32 v3, s1
1018 ; VI-NEXT: v_min_u32_e32 v0, 64, v0
1019 ; VI-NEXT: v_mov_b32_e32 v2, s0
1020 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1023 ; EG-LABEL: v_ctlz_zero_undef_i64_with_select:
1025 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
1027 ; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
1028 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1031 ; EG-NEXT: Fetch clause starting at 6:
1032 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
1033 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 4, #1
1034 ; EG-NEXT: VTX_READ_16 T3.X, T0.X, 6, #1
1035 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1036 ; EG-NEXT: ALU clause starting at 14:
1037 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1038 ; EG-NEXT: ALU clause starting at 15:
1039 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1040 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1041 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
1042 ; EG-NEXT: FFBH_UINT T1.W, PV.W,
1043 ; EG-NEXT: LSHL * T2.W, T3.X, literal.x,
1044 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1045 ; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W,
1046 ; EG-NEXT: OR_INT * T1.W, PS, T2.X,
1047 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1048 ; EG-NEXT: FFBH_UINT T2.W, PS,
1049 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
1050 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1051 ; EG-NEXT: CNDE_INT T0.X, T1.W, PS, PV.W,
1052 ; EG-NEXT: MOV T0.Y, 0.0,
1053 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1054 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1056 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select:
1057 ; GFX9-GISEL: ; %bb.0:
1058 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1059 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1060 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1061 ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3]
1062 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1
1063 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2
1064 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
1065 ; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
1066 ; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5
1067 ; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6
1068 ; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7
1069 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6)
1070 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0
1071 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5)
1072 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1073 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4)
1074 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
1075 ; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0
1076 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
1077 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5
1078 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
1079 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v7
1080 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1081 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v8, 24, v5
1082 ; GFX9-GISEL-NEXT: v_or3_b32 v3, v0, v4, 0
1083 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v2
1084 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v4, v3
1085 ; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0
1086 ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
1087 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v4, v0
1088 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
1089 ; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
1090 ; GFX9-GISEL-NEXT: s_endpgm
; Input IR. The select yields %ctlz when %val is nonzero and the constant 64
; otherwise, so the stored value never depends on the ctlz result for a zero
; input. Note that the i64 store below is declared with align 4.
1091 %val = load i64, ptr addrspace(1) %arrayidx, align 1
1092 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
1093 %ctlz_ret = icmp ne i64 %val, 0
1094 %ret = select i1 %ctlz_ret, i64 %ctlz, i64 64
1095 store i64 %ret, ptr addrspace(1) %out, align 4
; Kernel under test. Loads an i8 from %valptr indexed by the workitem id,
; takes llvm.ctlz.i8 of it with the second operand set to i1 true, and stores
; the i8 result to %out.
1099 define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1100 ; SI-LABEL: v_ctlz_zero_undef_i8:
1102 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1103 ; SI-NEXT: s_mov_b32 s7, 0xf000
1104 ; SI-NEXT: v_mov_b32_e32 v1, 0
1105 ; SI-NEXT: s_mov_b32 s10, 0
1106 ; SI-NEXT: s_mov_b32 s11, s7
1107 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1108 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1109 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1110 ; SI-NEXT: s_mov_b32 s6, -1
1111 ; SI-NEXT: s_mov_b32 s4, s0
1112 ; SI-NEXT: s_mov_b32 s5, s1
1113 ; SI-NEXT: s_waitcnt vmcnt(0)
1114 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
1115 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1116 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1119 ; VI-LABEL: v_ctlz_zero_undef_i8:
1121 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1122 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1123 ; VI-NEXT: v_mov_b32_e32 v1, s3
1124 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1125 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1126 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1127 ; VI-NEXT: s_waitcnt vmcnt(0)
1128 ; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
1129 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1130 ; VI-NEXT: v_mov_b32_e32 v0, s0
1131 ; VI-NEXT: v_mov_b32_e32 v1, s1
1132 ; VI-NEXT: flat_store_byte v[0:1], v2
1135 ; EG-LABEL: v_ctlz_zero_undef_i8:
1137 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1139 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
1140 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1143 ; EG-NEXT: Fetch clause starting at 6:
1144 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1145 ; EG-NEXT: ALU clause starting at 8:
1146 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1147 ; EG-NEXT: ALU clause starting at 9:
1148 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1149 ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
1150 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
1151 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1152 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1153 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1154 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1155 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1156 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1157 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1158 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1159 ; EG-NEXT: MOV T0.Y, 0.0,
1160 ; EG-NEXT: MOV * T0.Z, 0.0,
1161 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1162 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1164 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8:
1165 ; GFX9-GISEL: ; %bb.0:
1166 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1167 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
1168 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1169 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
1170 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3
1171 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
1172 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
1173 ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
1174 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1175 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1176 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
1177 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
1178 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
1179 ; GFX9-GISEL-NEXT: s_endpgm
; Input IR. Only the load address is offset by %tid; the store goes to the
; base %out pointer.
1180 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1181 %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
1182 %val = load i8, ptr addrspace(1) %in.gep
1183 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
1184 store i8 %ctlz, ptr addrspace(1) %out
; Kernel under test. Takes llvm.ctlz.i64 (second operand i1 true) of the i64
; kernel argument %val and stores the i64 result to %out. An unnamed
; [8 x i32] argument sits between %out and %val.
; NOTE(review): the unnamed argument presumably exists to move %val to a later
; kernel-argument offset - confirm against the rest of the file.
1188 define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
1189 ; SI-LABEL: s_ctlz_zero_undef_i64:
1191 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
1192 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
1193 ; SI-NEXT: s_mov_b32 s3, 0xf000
1194 ; SI-NEXT: s_mov_b32 s2, -1
1195 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1196 ; SI-NEXT: s_flbit_i32_b64 s4, s[6:7]
1197 ; SI-NEXT: v_mov_b32_e32 v1, 0
1198 ; SI-NEXT: v_mov_b32_e32 v0, s4
1199 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1202 ; VI-LABEL: s_ctlz_zero_undef_i64:
1204 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
1205 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
1206 ; VI-NEXT: v_mov_b32_e32 v1, 0
1207 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1208 ; VI-NEXT: s_flbit_i32_b64 s0, s[0:1]
1209 ; VI-NEXT: v_mov_b32_e32 v2, s2
1210 ; VI-NEXT: v_mov_b32_e32 v0, s0
1211 ; VI-NEXT: v_mov_b32_e32 v3, s3
1212 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1215 ; EG-LABEL: s_ctlz_zero_undef_i64:
1217 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1218 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1221 ; EG-NEXT: ALU clause starting at 4:
1222 ; EG-NEXT: FFBH_UINT * T0.W, KC0[4].W,
1223 ; EG-NEXT: FFBH_UINT T1.W, KC0[5].X,
1224 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
1225 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1226 ; EG-NEXT: CNDE_INT T0.X, KC0[5].X, PS, PV.W,
1227 ; EG-NEXT: MOV T0.Y, 0.0,
1228 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1229 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1231 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64:
1232 ; GFX9-GISEL: ; %bb.0:
1233 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
1234 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
1235 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0
1236 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
1237 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1238 ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[0:1]
1239 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
1240 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
1241 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
1242 ; GFX9-GISEL-NEXT: s_endpgm
; Input IR. The ctlz operand is the kernel argument itself; there is no load.
1243 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1244 store i64 %ctlz, ptr addrspace(1) %out
; Kernel under test. Takes llvm.ctlz.i64 (second operand i1 true) of the i64
; kernel argument %val, truncates the result to i32, and stores that i32 to
; %out.
1248 define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind {
1249 ; SI-LABEL: s_ctlz_zero_undef_i64_trunc:
1251 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1252 ; SI-NEXT: s_mov_b32 s7, 0xf000
1253 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1254 ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3]
1255 ; SI-NEXT: s_mov_b32 s6, -1
1256 ; SI-NEXT: s_mov_b32 s4, s0
1257 ; SI-NEXT: s_mov_b32 s5, s1
1258 ; SI-NEXT: v_mov_b32_e32 v0, s2
1259 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1262 ; VI-LABEL: s_ctlz_zero_undef_i64_trunc:
1264 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1265 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1266 ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
1267 ; VI-NEXT: v_mov_b32_e32 v0, s0
1268 ; VI-NEXT: v_mov_b32_e32 v1, s1
1269 ; VI-NEXT: v_mov_b32_e32 v2, s2
1270 ; VI-NEXT: flat_store_dword v[0:1], v2
1273 ; EG-LABEL: s_ctlz_zero_undef_i64_trunc:
1275 ; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
1276 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1279 ; EG-NEXT: ALU clause starting at 4:
1280 ; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W,
1281 ; EG-NEXT: FFBH_UINT T1.W, KC0[3].X,
1282 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
1283 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1284 ; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W,
1285 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1286 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1288 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
1289 ; GFX9-GISEL: ; %bb.0:
1290 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1291 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1292 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1293 ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3]
1294 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
1295 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1296 ; GFX9-GISEL-NEXT: s_endpgm
; Input IR. Only the low 32 bits of the i64 ctlz result are kept and stored.
1297 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1298 %trunc = trunc i64 %ctlz to i32
1299 store i32 %trunc, ptr addrspace(1) %out
; Kernel under test. Loads an i64 from %in indexed by the workitem id, takes
; llvm.ctlz.i64 of it with the second operand set to i1 true, and stores the
; i64 result to %out at the same index.
1303 define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1304 ; SI-LABEL: v_ctlz_zero_undef_i64:
1306 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1307 ; SI-NEXT: s_mov_b32 s7, 0xf000
1308 ; SI-NEXT: s_mov_b32 s6, 0
1309 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1310 ; SI-NEXT: v_mov_b32_e32 v1, 0
1311 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1312 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1313 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
1314 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1315 ; SI-NEXT: s_waitcnt vmcnt(0)
1316 ; SI-NEXT: v_ffbh_u32_e32 v2, v2
1317 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
1318 ; SI-NEXT: v_ffbh_u32_e32 v3, v3
1319 ; SI-NEXT: v_min_u32_e32 v2, v2, v3
1320 ; SI-NEXT: v_mov_b32_e32 v3, v1
1321 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
1324 ; VI-LABEL: v_ctlz_zero_undef_i64:
1326 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1327 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
1328 ; VI-NEXT: v_mov_b32_e32 v2, 0
1329 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1330 ; VI-NEXT: v_mov_b32_e32 v1, s3
1331 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3
1332 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1333 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1334 ; VI-NEXT: v_mov_b32_e32 v4, s1
1335 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3
1336 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
1337 ; VI-NEXT: s_waitcnt vmcnt(0)
1338 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1339 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1340 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
1341 ; VI-NEXT: v_min_u32_e32 v1, v0, v1
1342 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
1345 ; EG-LABEL: v_ctlz_zero_undef_i64:
1347 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1349 ; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
1350 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1353 ; EG-NEXT: Fetch clause starting at 6:
1354 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
1355 ; EG-NEXT: ALU clause starting at 8:
1356 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1357 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1358 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1359 ; EG-NEXT: ALU clause starting at 11:
1360 ; EG-NEXT: FFBH_UINT * T1.W, T0.X,
1361 ; EG-NEXT: FFBH_UINT T2.W, T0.Y,
1362 ; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
1363 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1364 ; EG-NEXT: CNDE_INT T0.X, T0.Y, PS, PV.W,
1365 ; EG-NEXT: MOV T0.Y, 0.0,
1366 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1367 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
1368 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1370 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
1371 ; GFX9-GISEL: ; %bb.0:
1372 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1373 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1374 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1375 ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1376 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1377 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
1378 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
1379 ; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0
1380 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0
1381 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1382 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1383 ; GFX9-GISEL-NEXT: s_endpgm
; Input IR. Both the input and output pointers are indexed as i64 arrays by
; %tid.
1384 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1385 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
1386 %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
1387 %val = load i64, ptr addrspace(1) %in.gep
1388 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1389 store i64 %ctlz, ptr addrspace(1) %out.gep
; Kernel under test. Loads an i64 from %in indexed by the workitem id, takes
; llvm.ctlz.i64 of it with the second operand set to i1 true, truncates the
; result to i32, and stores it to %out indexed as an i32 array by the same id.
1393 define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1394 ; SI-LABEL: v_ctlz_zero_undef_i64_trunc:
1396 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1397 ; SI-NEXT: s_mov_b32 s7, 0xf000
1398 ; SI-NEXT: s_mov_b32 s6, 0
1399 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
1400 ; SI-NEXT: v_mov_b32_e32 v2, 0
1401 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1402 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1403 ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
1404 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1405 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1406 ; SI-NEXT: s_waitcnt vmcnt(0)
1407 ; SI-NEXT: v_ffbh_u32_e32 v0, v3
1408 ; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
1409 ; SI-NEXT: v_ffbh_u32_e32 v3, v4
1410 ; SI-NEXT: v_min_u32_e32 v0, v0, v3
1411 ; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
1414 ; VI-LABEL: v_ctlz_zero_undef_i64_trunc:
1416 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1417 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
1418 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1419 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1420 ; VI-NEXT: v_mov_b32_e32 v2, s3
1421 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
1422 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1423 ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
1424 ; VI-NEXT: v_mov_b32_e32 v4, s1
1425 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
1426 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
1427 ; VI-NEXT: s_waitcnt vmcnt(0)
1428 ; VI-NEXT: v_ffbh_u32_e32 v0, v1
1429 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1430 ; VI-NEXT: v_ffbh_u32_e32 v1, v2
1431 ; VI-NEXT: v_min_u32_e32 v0, v0, v1
1432 ; VI-NEXT: flat_store_dword v[3:4], v0
1435 ; EG-LABEL: v_ctlz_zero_undef_i64_trunc:
1437 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1439 ; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
1440 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1443 ; EG-NEXT: Fetch clause starting at 6:
1444 ; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
1445 ; EG-NEXT: ALU clause starting at 8:
1446 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1447 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1448 ; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W,
1449 ; EG-NEXT: ALU clause starting at 11:
1450 ; EG-NEXT: FFBH_UINT * T0.W, T1.X,
1451 ; EG-NEXT: LSHL T0.Z, T0.X, literal.x,
1452 ; EG-NEXT: FFBH_UINT T1.W, T1.Y,
1453 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.y,
1454 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
1455 ; EG-NEXT: CNDE_INT T0.X, T1.Y, PS, PV.W,
1456 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z,
1457 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
1458 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1460 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
1461 ; GFX9-GISEL: ; %bb.0:
1462 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1463 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
1464 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1465 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1466 ; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3]
1467 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1468 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
1469 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
1470 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 32, v1
1471 ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v2, v1
1472 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
1473 ; GFX9-GISEL-NEXT: s_endpgm
; Input IR. The input is indexed with an i64 element type and the output with
; an i32 element type, so the two addresses use different strides for the same
; %tid.
1474 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1475 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
1476 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
1477 %val = load i64, ptr addrspace(1) %in.gep
1478 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1479 %trunc = trunc i64 %ctlz to i32
1480 store i32 %trunc, ptr addrspace(1) %out.gep
; Kernel under test. Loads an i32 from %valptr indexed by the workitem id and
; takes llvm.ctlz.i32 of it with the second operand set to i1 true. A select
; keyed on (val == 0) yields -1 for a zero input and the ctlz result
; otherwise; the selected value is stored to %out.
1484 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1485 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1487 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1488 ; SI-NEXT: s_mov_b32 s7, 0xf000
1489 ; SI-NEXT: s_mov_b32 s10, 0
1490 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1491 ; SI-NEXT: v_mov_b32_e32 v1, 0
1492 ; SI-NEXT: s_mov_b32 s11, s7
1493 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1494 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1495 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1496 ; SI-NEXT: s_mov_b32 s6, -1
1497 ; SI-NEXT: s_mov_b32 s4, s0
1498 ; SI-NEXT: s_mov_b32 s5, s1
1499 ; SI-NEXT: s_waitcnt vmcnt(0)
1500 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1501 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1504 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1506 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1507 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1508 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1509 ; VI-NEXT: v_mov_b32_e32 v1, s3
1510 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1511 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1512 ; VI-NEXT: flat_load_dword v0, v[0:1]
1513 ; VI-NEXT: s_waitcnt vmcnt(0)
1514 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1515 ; VI-NEXT: v_mov_b32_e32 v0, s0
1516 ; VI-NEXT: v_mov_b32_e32 v1, s1
1517 ; VI-NEXT: flat_store_dword v[0:1], v2
1520 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1522 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1524 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
1525 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1528 ; EG-NEXT: Fetch clause starting at 6:
1529 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1530 ; EG-NEXT: ALU clause starting at 8:
1531 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1532 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1533 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1534 ; EG-NEXT: ALU clause starting at 11:
1535 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1536 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
1537 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1538 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1540 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1541 ; GFX9-GISEL: ; %bb.0:
1542 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1543 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1544 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1545 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1546 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1547 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1548 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1549 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
1550 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1551 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1552 ; GFX9-GISEL-NEXT: s_endpgm
; Input IR. The compare is "eq 0" with -1 as the true operand of the select.
; Only the load address is offset by %tid; the store goes to the base %out
; pointer.
1553 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1554 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1555 %val = load i32, ptr addrspace(1) %in.gep
1556 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
1557 %cmp = icmp eq i32 %val, 0
1558 %sel = select i1 %cmp, i32 -1, i32 %ctlz
1559 store i32 %sel, ptr addrspace(1) %out
; Kernel under test. Loads an i32 from %valptr indexed by the workitem id and
; takes llvm.ctlz.i32 of it with the second operand set to i1 true. A select
; keyed on (val != 0) yields the ctlz result for a nonzero input and -1
; otherwise; the selected value is stored to %out.
1563 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1564 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1566 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1567 ; SI-NEXT: s_mov_b32 s7, 0xf000
1568 ; SI-NEXT: s_mov_b32 s10, 0
1569 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1570 ; SI-NEXT: v_mov_b32_e32 v1, 0
1571 ; SI-NEXT: s_mov_b32 s11, s7
1572 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1573 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1574 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1575 ; SI-NEXT: s_mov_b32 s6, -1
1576 ; SI-NEXT: s_mov_b32 s4, s0
1577 ; SI-NEXT: s_mov_b32 s5, s1
1578 ; SI-NEXT: s_waitcnt vmcnt(0)
1579 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1580 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1583 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1585 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1586 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1587 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1588 ; VI-NEXT: v_mov_b32_e32 v1, s3
1589 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1590 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1591 ; VI-NEXT: flat_load_dword v0, v[0:1]
1592 ; VI-NEXT: s_waitcnt vmcnt(0)
1593 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1594 ; VI-NEXT: v_mov_b32_e32 v0, s0
1595 ; VI-NEXT: v_mov_b32_e32 v1, s1
1596 ; VI-NEXT: flat_store_dword v[0:1], v2
1599 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1601 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1603 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
1604 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1607 ; EG-NEXT: Fetch clause starting at 6:
1608 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1609 ; EG-NEXT: ALU clause starting at 8:
1610 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1611 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1612 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1613 ; EG-NEXT: ALU clause starting at 11:
1614 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1615 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
1616 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1617 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1619 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1620 ; GFX9-GISEL: ; %bb.0:
1621 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1622 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1623 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1624 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1625 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1626 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1627 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1628 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
1629 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1630 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1631 ; GFX9-GISEL-NEXT: s_endpgm
; Input IR. The compare is "ne 0" with -1 as the false operand of the select.
; Only the load address is offset by %tid; the store goes to the base %out
; pointer.
1632 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1633 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1634 %val = load i32, ptr addrspace(1) %in.gep
1635 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
1636 %cmp = icmp ne i32 %val, 0
1637 %sel = select i1 %cmp, i32 %ctlz, i32 -1
1638 store i32 %sel, ptr addrspace(1) %out
; i8 case: loads one byte from %valptr indexed by the workitem id, calls
; llvm.ctlz.i8 with its i1 flag set to true, and selects -1 when the loaded
; value equals 0. The SI, VI and EG checks below contain the ffbh
; instruction with no compare/select; the GFX9-GISEL checks keep a
; v_cmp_eq_u32_sdwa + v_cndmask_b32 pair.
1642 define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1643 ; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1645 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1646 ; SI-NEXT: s_mov_b32 s7, 0xf000
1647 ; SI-NEXT: v_mov_b32_e32 v1, 0
1648 ; SI-NEXT: s_mov_b32 s10, 0
1649 ; SI-NEXT: s_mov_b32 s11, s7
1650 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1651 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1652 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1653 ; SI-NEXT: s_mov_b32 s6, -1
1654 ; SI-NEXT: s_mov_b32 s4, s0
1655 ; SI-NEXT: s_mov_b32 s5, s1
1656 ; SI-NEXT: s_waitcnt vmcnt(0)
1657 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1658 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1661 ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1663 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1664 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1665 ; VI-NEXT: v_mov_b32_e32 v1, s3
1666 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1667 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1668 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1669 ; VI-NEXT: s_waitcnt vmcnt(0)
1670 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1671 ; VI-NEXT: v_mov_b32_e32 v0, s0
1672 ; VI-NEXT: v_mov_b32_e32 v1, s1
1673 ; VI-NEXT: flat_store_byte v[0:1], v2
1676 ; EG-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1678 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1680 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1681 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1684 ; EG-NEXT: Fetch clause starting at 6:
1685 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1686 ; EG-NEXT: ALU clause starting at 8:
1687 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1688 ; EG-NEXT: ALU clause starting at 9:
1689 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
1690 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1691 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1692 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1693 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1694 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1695 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1696 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1697 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1698 ; EG-NEXT: MOV T0.Y, 0.0,
1699 ; EG-NEXT: MOV * T0.Z, 0.0,
1700 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1701 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1703 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1704 ; GFX9-GISEL: ; %bb.0:
1705 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1706 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
1707 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1708 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
1709 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3
1710 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
1711 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
1712 ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
1713 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1714 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff
1715 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1716 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v0
1717 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3
1718 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa vcc, v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
1719 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
1720 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
1721 ; GFX9-GISEL-NEXT: s_endpgm
1722 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1723 %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
1724 %val = load i8, ptr addrspace(1) %valptr.gep
1725 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
1726 %cmp = icmp eq i8 %val, 0
1727 %sel = select i1 %cmp, i8 -1, i8 %ctlz
1728 store i8 %sel, ptr addrspace(1) %out
; Same (%val == 0) ? -1 : ctlz(%val) select as the sel_eq_neg1 pattern, but
; %cmp has a second use: it is also stored (volatile i1) to an undef address.
; In the SI and VI checks the ffbh result is stored directly and the compare
; only feeds the 0/1 v_cndmask for the i1 store; the EG and GFX9-GISEL checks
; still select against -1.
1732 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1733 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1735 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1736 ; SI-NEXT: s_mov_b32 s7, 0xf000
1737 ; SI-NEXT: s_mov_b32 s10, 0
1738 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1739 ; SI-NEXT: v_mov_b32_e32 v1, 0
1740 ; SI-NEXT: s_mov_b32 s11, s7
1741 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1742 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1743 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1744 ; SI-NEXT: s_mov_b32 s6, -1
1745 ; SI-NEXT: s_mov_b32 s4, s0
1746 ; SI-NEXT: s_mov_b32 s5, s1
1747 ; SI-NEXT: s_waitcnt vmcnt(0)
1748 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
1749 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1750 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1751 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
1752 ; SI-NEXT: s_waitcnt vmcnt(0)
1753 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1754 ; SI-NEXT: s_waitcnt vmcnt(0)
1757 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1759 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1760 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1761 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1762 ; VI-NEXT: v_mov_b32_e32 v1, s3
1763 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1764 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1765 ; VI-NEXT: flat_load_dword v2, v[0:1]
1766 ; VI-NEXT: v_mov_b32_e32 v0, s0
1767 ; VI-NEXT: v_mov_b32_e32 v1, s1
1768 ; VI-NEXT: s_waitcnt vmcnt(0)
1769 ; VI-NEXT: v_ffbh_u32_e32 v3, v2
1770 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
1771 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1772 ; VI-NEXT: flat_store_dword v[0:1], v3
1773 ; VI-NEXT: s_waitcnt vmcnt(0)
1774 ; VI-NEXT: flat_store_byte v[0:1], v2
1775 ; VI-NEXT: s_waitcnt vmcnt(0)
1778 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1780 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1782 ; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[]
1783 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
1784 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T2.X
1786 ; EG-NEXT: Fetch clause starting at 6:
1787 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1788 ; EG-NEXT: ALU clause starting at 8:
1789 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1790 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1791 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1792 ; EG-NEXT: ALU clause starting at 11:
1793 ; EG-NEXT: SETE_INT * T0.W, T0.X, 0.0,
1794 ; EG-NEXT: AND_INT T1.X, PV.W, 1,
1795 ; EG-NEXT: MOV * T1.W, literal.x,
1796 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1797 ; EG-NEXT: MOV T1.Y, 0.0,
1798 ; EG-NEXT: MOV * T1.Z, 0.0,
1799 ; EG-NEXT: MOV T2.X, literal.x,
1800 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1801 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
1802 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
1803 ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.y,
1804 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1806 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1807 ; GFX9-GISEL: ; %bb.0:
1808 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1809 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1810 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1811 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1812 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1813 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1814 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0
1815 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1816 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
1817 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1818 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1819 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1820 ; GFX9-GISEL-NEXT: global_store_byte v[0:1], v2, off
1821 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1822 ; GFX9-GISEL-NEXT: s_endpgm
1823 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1824 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1825 %val = load i32, ptr addrspace(1) %in.gep
1826 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
1827 %cmp = icmp eq i32 %val, 0
1828 %sel = select i1 %cmp, i32 -1, i32 %ctlz
1829 store volatile i32 %sel, ptr addrspace(1) %out
1830 store volatile i1 %cmp, ptr addrspace(1) undef
1834 ; Selected on wrong constant
; The select yields 0 (not -1) when %val == 0. All four check prefixes below
; keep an explicit compare/select (v_cmp + v_cndmask, or CNDE_INT on EG)
; after the ffbh instruction.
1835 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1836 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1838 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1839 ; SI-NEXT: s_mov_b32 s7, 0xf000
1840 ; SI-NEXT: s_mov_b32 s10, 0
1841 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1842 ; SI-NEXT: v_mov_b32_e32 v1, 0
1843 ; SI-NEXT: s_mov_b32 s11, s7
1844 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1845 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1846 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1847 ; SI-NEXT: s_mov_b32 s6, -1
1848 ; SI-NEXT: s_mov_b32 s4, s0
1849 ; SI-NEXT: s_mov_b32 s5, s1
1850 ; SI-NEXT: s_waitcnt vmcnt(0)
1851 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
1852 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1853 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1854 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1857 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1859 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1860 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1861 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1862 ; VI-NEXT: v_mov_b32_e32 v1, s3
1863 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1864 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1865 ; VI-NEXT: flat_load_dword v0, v[0:1]
1866 ; VI-NEXT: s_waitcnt vmcnt(0)
1867 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
1868 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1869 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
1870 ; VI-NEXT: v_mov_b32_e32 v0, s0
1871 ; VI-NEXT: v_mov_b32_e32 v1, s1
1872 ; VI-NEXT: flat_store_dword v[0:1], v2
1875 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1877 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1879 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
1880 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1883 ; EG-NEXT: Fetch clause starting at 6:
1884 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1885 ; EG-NEXT: ALU clause starting at 8:
1886 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1887 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1888 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1889 ; EG-NEXT: ALU clause starting at 11:
1890 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1891 ; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W,
1892 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1893 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1895 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1896 ; GFX9-GISEL: ; %bb.0:
1897 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1898 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1899 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1900 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1901 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1902 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1903 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1904 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
1905 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1906 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1907 ; GFX9-GISEL-NEXT: s_endpgm
1908 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1909 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1910 %val = load i32, ptr addrspace(1) %in.gep
1911 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
1912 %cmp = icmp eq i32 %val, 0
1913 %sel = select i1 %cmp, i32 0, i32 %ctlz
1914 store i32 %sel, ptr addrspace(1) %out
1918 ; Selected on wrong constant
; Inverted-predicate form: icmp ne against 0, selecting %ctlz when true and
; 0 (not -1) otherwise. All four check prefixes below keep an explicit
; compare/select after the ffbh instruction.
1919 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1920 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1922 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1923 ; SI-NEXT: s_mov_b32 s7, 0xf000
1924 ; SI-NEXT: s_mov_b32 s10, 0
1925 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1926 ; SI-NEXT: v_mov_b32_e32 v1, 0
1927 ; SI-NEXT: s_mov_b32 s11, s7
1928 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1929 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1930 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1931 ; SI-NEXT: s_mov_b32 s6, -1
1932 ; SI-NEXT: s_mov_b32 s4, s0
1933 ; SI-NEXT: s_mov_b32 s5, s1
1934 ; SI-NEXT: s_waitcnt vmcnt(0)
1935 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
1936 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1937 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1938 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1941 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1943 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1944 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1945 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1946 ; VI-NEXT: v_mov_b32_e32 v1, s3
1947 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1948 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1949 ; VI-NEXT: flat_load_dword v0, v[0:1]
1950 ; VI-NEXT: s_waitcnt vmcnt(0)
1951 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
1952 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1953 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
1954 ; VI-NEXT: v_mov_b32_e32 v0, s0
1955 ; VI-NEXT: v_mov_b32_e32 v1, s1
1956 ; VI-NEXT: flat_store_dword v[0:1], v2
1959 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1961 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1963 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
1964 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1967 ; EG-NEXT: Fetch clause starting at 6:
1968 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1969 ; EG-NEXT: ALU clause starting at 8:
1970 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1971 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1972 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1973 ; EG-NEXT: ALU clause starting at 11:
1974 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1975 ; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W,
1976 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1977 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1979 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1980 ; GFX9-GISEL: ; %bb.0:
1981 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1982 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1983 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1984 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1985 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1986 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1987 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1988 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1989 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1990 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1991 ; GFX9-GISEL-NEXT: s_endpgm
1992 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1993 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1994 %val = load i32, ptr addrspace(1) %in.gep
1995 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
1996 %cmp = icmp ne i32 %val, 0
1997 %sel = select i1 %cmp, i32 %ctlz, i32 0
1998 store i32 %sel, ptr addrspace(1) %out
2002 ; Compare on wrong constant
; %val is compared for equality against 1 rather than 0, and the select
; yields 0 on a match. All four check prefixes below keep a compare against
; 1 followed by a select.
2003 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
2004 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2006 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2007 ; SI-NEXT: s_mov_b32 s7, 0xf000
2008 ; SI-NEXT: s_mov_b32 s10, 0
2009 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2010 ; SI-NEXT: v_mov_b32_e32 v1, 0
2011 ; SI-NEXT: s_mov_b32 s11, s7
2012 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2013 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2014 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2015 ; SI-NEXT: s_mov_b32 s6, -1
2016 ; SI-NEXT: s_mov_b32 s4, s0
2017 ; SI-NEXT: s_mov_b32 s5, s1
2018 ; SI-NEXT: s_waitcnt vmcnt(0)
2019 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
2020 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
2021 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
2022 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2025 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2027 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2028 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2029 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2030 ; VI-NEXT: v_mov_b32_e32 v1, s3
2031 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2032 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2033 ; VI-NEXT: flat_load_dword v0, v[0:1]
2034 ; VI-NEXT: s_waitcnt vmcnt(0)
2035 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
2036 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
2037 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
2038 ; VI-NEXT: v_mov_b32_e32 v0, s0
2039 ; VI-NEXT: v_mov_b32_e32 v1, s1
2040 ; VI-NEXT: flat_store_dword v[0:1], v2
2043 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2045 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
2047 ; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
2048 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2051 ; EG-NEXT: Fetch clause starting at 6:
2052 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
2053 ; EG-NEXT: ALU clause starting at 8:
2054 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
2055 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2056 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
2057 ; EG-NEXT: ALU clause starting at 11:
2058 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
2059 ; EG-NEXT: SETE_INT * T1.W, T0.X, 1,
2060 ; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
2061 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2062 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2064 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2065 ; GFX9-GISEL: ; %bb.0:
2066 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2067 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2068 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
2069 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
2070 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
2071 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
2072 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
2073 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
2074 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
2075 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
2076 ; GFX9-GISEL-NEXT: s_endpgm
2077 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2078 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
2079 %val = load i32, ptr addrspace(1) %in.gep
2080 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
2081 %cmp = icmp eq i32 %val, 1
2082 %sel = select i1 %cmp, i32 0, i32 %ctlz
2083 store i32 %sel, ptr addrspace(1) %out
2087 ; Compare on wrong constant
; %val is compared (icmp ne) against 1 rather than 0; the select yields
; %ctlz when they differ and 0 otherwise. All four check prefixes below keep
; a compare against 1 followed by a select.
2088 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
2089 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2091 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2092 ; SI-NEXT: s_mov_b32 s7, 0xf000
2093 ; SI-NEXT: s_mov_b32 s10, 0
2094 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2095 ; SI-NEXT: v_mov_b32_e32 v1, 0
2096 ; SI-NEXT: s_mov_b32 s11, s7
2097 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2098 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2099 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2100 ; SI-NEXT: s_mov_b32 s6, -1
2101 ; SI-NEXT: s_mov_b32 s4, s0
2102 ; SI-NEXT: s_mov_b32 s5, s1
2103 ; SI-NEXT: s_waitcnt vmcnt(0)
2104 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
2105 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
2106 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
2107 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2110 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2112 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2113 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2114 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2115 ; VI-NEXT: v_mov_b32_e32 v1, s3
2116 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2117 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2118 ; VI-NEXT: flat_load_dword v0, v[0:1]
2119 ; VI-NEXT: s_waitcnt vmcnt(0)
2120 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
2121 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
2122 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
2123 ; VI-NEXT: v_mov_b32_e32 v0, s0
2124 ; VI-NEXT: v_mov_b32_e32 v1, s1
2125 ; VI-NEXT: flat_store_dword v[0:1], v2
2128 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2130 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
2132 ; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
2133 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2136 ; EG-NEXT: Fetch clause starting at 6:
2137 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
2138 ; EG-NEXT: ALU clause starting at 8:
2139 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
2140 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2141 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
2142 ; EG-NEXT: ALU clause starting at 11:
2143 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
2144 ; EG-NEXT: SETNE_INT * T1.W, T0.X, 1,
2145 ; EG-NEXT: CNDE_INT T0.X, PS, 0.0, PV.W,
2146 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2147 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2149 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2150 ; GFX9-GISEL: ; %bb.0:
2151 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2152 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2153 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
2154 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
2155 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
2156 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
2157 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
2158 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
2159 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
2160 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
2161 ; GFX9-GISEL-NEXT: s_endpgm
2162 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2163 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
2164 %val = load i32, ptr addrspace(1) %in.gep
2165 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
2166 %cmp = icmp ne i32 %val, 1
2167 %sel = select i1 %cmp, i32 %ctlz, i32 0
2168 store i32 %sel, ptr addrspace(1) %out
; Non-kernel function taking an i7 value. The SI, VI and GFX9-GISEL checks
; shift the input left by 25 (32 - 7) before v_ffbh_u32.
2172 define i7 @v_ctlz_zero_undef_i7(i7 %val) {
2173 ; SI-LABEL: v_ctlz_zero_undef_i7:
2175 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2176 ; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2177 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2178 ; SI-NEXT: s_setpc_b64 s[30:31]
2180 ; VI-LABEL: v_ctlz_zero_undef_i7:
2182 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2183 ; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2184 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2185 ; VI-NEXT: s_setpc_b64 s[30:31]
2187 ; EG-LABEL: v_ctlz_zero_undef_i7:
2192 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7:
2193 ; GFX9-GISEL: ; %bb.0:
2194 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2195 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2196 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2197 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2198 %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true)
; Kernel taking an i18 argument. The SI, VI and GFX9-GISEL checks shift the
; loaded argument left by 14 (32 - 18) before s_flbit_i32_b32, then write
; the result as a 16-bit store plus a byte store two bytes further on.
2202 define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind {
2203 ; SI-LABEL: s_ctlz_zero_undef_i18:
2205 ; SI-NEXT: s_load_dword s2, s[4:5], 0xb
2206 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
2207 ; SI-NEXT: s_mov_b32 s3, 0xf000
2208 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2209 ; SI-NEXT: s_lshl_b32 s2, s2, 14
2210 ; SI-NEXT: s_flbit_i32_b32 s4, s2
2211 ; SI-NEXT: s_mov_b32 s2, -1
2212 ; SI-NEXT: v_mov_b32_e32 v0, s4
2213 ; SI-NEXT: s_bfe_u32 s4, s4, 0x20010
2214 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
2215 ; SI-NEXT: s_waitcnt expcnt(0)
2216 ; SI-NEXT: v_mov_b32_e32 v0, s4
2217 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
2220 ; VI-LABEL: s_ctlz_zero_undef_i18:
2222 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
2223 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2224 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2225 ; VI-NEXT: s_lshl_b32 s2, s2, 14
2226 ; VI-NEXT: v_mov_b32_e32 v0, s0
2227 ; VI-NEXT: s_flbit_i32_b32 s2, s2
2228 ; VI-NEXT: v_mov_b32_e32 v1, s1
2229 ; VI-NEXT: s_add_u32 s0, s0, 2
2230 ; VI-NEXT: v_mov_b32_e32 v2, s2
2231 ; VI-NEXT: s_addc_u32 s1, s1, 0
2232 ; VI-NEXT: flat_store_short v[0:1], v2
2233 ; VI-NEXT: s_bfe_u32 s2, s2, 0x20010
2234 ; VI-NEXT: v_mov_b32_e32 v0, s0
2235 ; VI-NEXT: v_mov_b32_e32 v1, s1
2236 ; VI-NEXT: v_mov_b32_e32 v2, s2
2237 ; VI-NEXT: flat_store_byte v[0:1], v2
2240 ; EG-LABEL: s_ctlz_zero_undef_i18:
2242 ; EG-NEXT: ALU 28, @4, KC0[CB0:0-32], KC1[]
2243 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X
2244 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T2.X
2246 ; EG-NEXT: ALU clause starting at 4:
2247 ; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
2248 ; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00)
2249 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
2250 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
2251 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2252 ; EG-NEXT: AND_INT T2.W, PV.W, literal.x,
2253 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
2254 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
2255 ; EG-NEXT: LSHL T1.X, PV.W, PS,
2256 ; EG-NEXT: LSHL * T1.W, literal.x, PS,
2257 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2258 ; EG-NEXT: MOV T1.Y, 0.0,
2259 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
2260 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2261 ; EG-NEXT: AND_INT T3.W, PV.W, literal.x,
2262 ; EG-NEXT: MOV * T4.W, literal.y,
2263 ; EG-NEXT: 3(4.203895e-45), 2(2.802597e-45)
2264 ; EG-NEXT: BFE_UINT T0.W, T0.W, literal.x, PS,
2265 ; EG-NEXT: LSHL * T3.W, PV.W, literal.y,
2266 ; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
2267 ; EG-NEXT: LSHL T0.X, PV.W, PS,
2268 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
2269 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2270 ; EG-NEXT: MOV T0.Y, 0.0,
2271 ; EG-NEXT: MOV T1.Z, 0.0,
2272 ; EG-NEXT: MOV * T0.Z, 0.0,
2273 ; EG-NEXT: LSHR T2.X, T2.W, literal.x,
2274 ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
2275 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2277 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18:
2278 ; GFX9-GISEL: ; %bb.0:
2279 ; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
2280 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2281 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
2282 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
2283 ; GFX9-GISEL-NEXT: s_lshl_b32 s2, s2, 14
2284 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2
2285 ; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0x3ffff
2286 ; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16
2287 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
2288 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
2289 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
2290 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] offset:2
2291 ; GFX9-GISEL-NEXT: s_endpgm
2292 %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone
2293 store i18 %ctlz, ptr addrspace(1) %out, align 4
; Non-kernel function taking an i18 value. The SI, VI and GFX9-GISEL checks
; shift the input left by 14 (32 - 18) before v_ffbh_u32.
2297 define i18 @v_ctlz_zero_undef_i18(i18 %val) {
2298 ; SI-LABEL: v_ctlz_zero_undef_i18:
2300 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2301 ; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2302 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2303 ; SI-NEXT: s_setpc_b64 s[30:31]
2305 ; VI-LABEL: v_ctlz_zero_undef_i18:
2307 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2308 ; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2309 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2310 ; VI-NEXT: s_setpc_b64 s[30:31]
2312 ; EG-LABEL: v_ctlz_zero_undef_i18:
2317 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18:
2318 ; GFX9-GISEL: ; %bb.0:
2319 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2320 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2321 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2322 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2323 %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true)
; <2 x i18> vector case. The SI, VI and GFX9-GISEL checks handle each
; element in its own register (v0, v1): shift left by 14, then v_ffbh_u32.
2327 define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) {
2328 ; SI-LABEL: v_ctlz_zero_undef_v2i18:
2330 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2331 ; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2332 ; SI-NEXT: v_lshlrev_b32_e32 v1, 14, v1
2333 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2334 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2335 ; SI-NEXT: s_setpc_b64 s[30:31]
2337 ; VI-LABEL: v_ctlz_zero_undef_v2i18:
2339 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2340 ; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2341 ; VI-NEXT: v_lshlrev_b32_e32 v1, 14, v1
2342 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2343 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
2344 ; VI-NEXT: s_setpc_b64 s[30:31]
2346 ; EG-LABEL: v_ctlz_zero_undef_v2i18:
2351 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18:
2352 ; GFX9-GISEL: ; %bb.0:
2353 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2354 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2355 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 14, v1
2356 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2357 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2358 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2359 %ctlz = call <2 x i18> @llvm.ctlz.v2i18(<2 x i18> %val, i1 true)
; <2 x i16> vector case. In the checks below each 16-bit element is moved
; into the high half of a 32-bit register (shift left by 16, or an AND with
; 0xffff0000 on VI) before v_ffbh_u32, and the two results are OR-ed back
; together.
2363 define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
2364 ; SI-LABEL: v_ctlz_zero_undef_v2i16:
2366 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2367 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2368 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2369 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2370 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
2371 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2372 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
2373 ; SI-NEXT: s_setpc_b64 s[30:31]
2375 ; VI-LABEL: v_ctlz_zero_undef_v2i16:
2377 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2378 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
2379 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2380 ; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2381 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2382 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2383 ; VI-NEXT: s_setpc_b64 s[30:31]
2385 ; EG-LABEL: v_ctlz_zero_undef_v2i16:
2390 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16:
2391 ; GFX9-GISEL: ; %bb.0:
2392 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2393 ; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
2394 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2395 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2396 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2397 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2398 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
2399 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
2400 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2401 %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true)
; <3 x i16> vector case (odd element count). The SI, VI and GFX9-GISEL
; checks below each contain three v_ffbh_u32 instructions, one per element.
2405 define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
2406 ; SI-LABEL: v_ctlz_zero_undef_v3i16:
2408 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2409 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2410 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2411 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2412 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2413 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2414 ; SI-NEXT: v_ffbh_u32_e32 v3, v2
2415 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2416 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
2417 ; SI-NEXT: v_or_b32_e32 v2, 0x200000, v3
2418 ; SI-NEXT: v_alignbit_b32 v1, v3, v0, 16
2419 ; SI-NEXT: s_setpc_b64 s[30:31]
2421 ; VI-LABEL: v_ctlz_zero_undef_v3i16:
2423 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2424 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
2425 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2426 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2427 ; VI-NEXT: v_ffbh_u32_e32 v2, v2
2428 ; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2429 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
2430 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2431 ; VI-NEXT: s_setpc_b64 s[30:31]
2433 ; EG-LABEL: v_ctlz_zero_undef_v3i16:
2438 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16:
2439 ; GFX9-GISEL: ; %bb.0:
2440 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2441 ; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2442 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2443 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2444 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2445 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
2446 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2447 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
2448 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2449 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
2450 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2451 %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true)
; <4 x i16> vector case. The SI, VI and GFX9-GISEL checks below each contain
; four v_ffbh_u32 instructions (one per element); the results are then
; combined with shift/or (SI), SDWA or (VI) or v_lshl_or_b32 (GFX9-GISEL).
2455 define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
2456 ; SI-LABEL: v_ctlz_zero_undef_v4i16:
2458 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2459 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2460 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2461 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2462 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2463 ; SI-NEXT: v_ffbh_u32_e32 v3, v3
2464 ; SI-NEXT: v_ffbh_u32_e32 v2, v2
2465 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2466 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2467 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2468 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2469 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
2470 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
2471 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
2472 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2473 ; SI-NEXT: s_setpc_b64 s[30:31]
2475 ; VI-LABEL: v_ctlz_zero_undef_v4i16:
2477 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2478 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
2479 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
2480 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0
2481 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2482 ; VI-NEXT: v_ffbh_u32_e32 v2, v2
2483 ; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2484 ; VI-NEXT: v_ffbh_u32_e32 v3, v3
2485 ; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2486 ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2487 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2488 ; VI-NEXT: s_setpc_b64 s[30:31]
2490 ; EG-LABEL: v_ctlz_zero_undef_v4i16:
2495 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16:
2496 ; GFX9-GISEL: ; %bb.0:
2497 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2498 ; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2499 ; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2500 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2501 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2502 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2503 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2504 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2505 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2506 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
2507 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3
2508 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
2509 ; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
2510 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
2511 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v1
2512 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2513 %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true)
; Codegen test for @llvm.ctlz.v2i8 on a <2 x i8> argument, called with the
; intrinsic's second operand set to true (per the LLVM LangRef, a true flag
; makes a zero input produce a poison result).
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
; In the visible SI, VI and GFX9-GISEL sequences each element is shifted left
; by 24 (32 minus the 8-bit element width) before v_ffbh_u32 is applied.
; The SI and VI sequences additionally OR the second element's count, shifted
; left by 8, into v0; the GFX9-GISEL sequence leaves the two counts in v0 and
; v1 untouched.
2517 define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
2518 ; SI-LABEL: v_ctlz_zero_undef_v2i8:
2520 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2521 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
2522 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
2523 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2524 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
2525 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2526 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
2527 ; SI-NEXT: s_setpc_b64 s[30:31]
2529 ; VI-LABEL: v_ctlz_zero_undef_v2i8:
2531 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2532 ; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
2533 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
2534 ; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
2535 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1
2536 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2537 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
2538 ; VI-NEXT: v_and_b32_e32 v1, 0xff, v1
2539 ; VI-NEXT: s_setpc_b64 s[30:31]
2541 ; EG-LABEL: v_ctlz_zero_undef_v2i8:
2546 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8:
2547 ; GFX9-GISEL: ; %bb.0:
2548 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2549 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
2550 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v1
2551 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2552 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2553 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2554 %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true)
; Codegen test for @llvm.ctlz.v2i7 on a <2 x i7> argument (a non-power-of-two
; element width), called with the intrinsic's second operand set to true (per
; the LLVM LangRef, a true flag makes a zero input produce a poison result).
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script instead of editing by hand.
; The visible SI, VI and GFX9-GISEL sequences are the same four operations:
; each element is shifted left by 25 (32 minus the 7-bit element width) and
; then passed to v_ffbh_u32.
2558 define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) {
2559 ; SI-LABEL: v_ctlz_zero_undef_v2i7:
2561 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2562 ; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2563 ; SI-NEXT: v_lshlrev_b32_e32 v1, 25, v1
2564 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2565 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2566 ; SI-NEXT: s_setpc_b64 s[30:31]
2568 ; VI-LABEL: v_ctlz_zero_undef_v2i7:
2570 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2571 ; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2572 ; VI-NEXT: v_lshlrev_b32_e32 v1, 25, v1
2573 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2574 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
2575 ; VI-NEXT: s_setpc_b64 s[30:31]
2577 ; EG-LABEL: v_ctlz_zero_undef_v2i7:
2582 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7:
2583 ; GFX9-GISEL: ; %bb.0:
2584 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2585 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2586 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 25, v1
2587 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2588 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2589 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2590 %ctlz = call <2 x i7> @llvm.ctlz.v2i7(<2 x i7> %val, i1 true)