1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
5 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
; Declarations of the llvm.ctlz overloads (scalar and vector, i7 through i64)
; and of the workitem-id intrinsic that the test functions in this file call.
; Every ctlz overload takes the value plus an i1 flag as its second operand.
7 declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
8 declare <2 x i7> @llvm.ctlz.v2i7(<2 x i7>, i1) nounwind readnone
9 declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
10 declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone
12 declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
13 declare i18 @llvm.ctlz.i18(i18, i1) nounwind readnone
15 declare <2 x i16> @llvm.ctlz.v2i16(<2 x i16>, i1) nounwind readnone
16 declare <3 x i16> @llvm.ctlz.v3i16(<3 x i16>, i1) nounwind readnone
17 declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
19 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
20 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
21 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
23 declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
24 declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
25 declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
; Used by the v_* tests to index the input pointer per work-item.
27 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Scalar case: %val is an i32 kernel argument. The body calls llvm.ctlz.i32
; with the i1 flag set to true (the zero-undef form) and stores the result to
; %out. The autogenerated checks below expect one s_flbit_i32_b32 on the
; amdgcn targets and one FFBH_UINT on r600, with no compare or select.
29 define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
30 ; SI-LABEL: s_ctlz_zero_undef_i32:
32 ; SI-NEXT: s_load_dword s4, s[2:3], 0xb
33 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
34 ; SI-NEXT: s_mov_b32 s3, 0xf000
35 ; SI-NEXT: s_waitcnt lgkmcnt(0)
36 ; SI-NEXT: s_flbit_i32_b32 s4, s4
37 ; SI-NEXT: s_mov_b32 s2, -1
38 ; SI-NEXT: v_mov_b32_e32 v0, s4
39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
42 ; VI-LABEL: s_ctlz_zero_undef_i32:
44 ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
45 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
46 ; VI-NEXT: s_waitcnt lgkmcnt(0)
47 ; VI-NEXT: s_flbit_i32_b32 s2, s4
48 ; VI-NEXT: v_mov_b32_e32 v0, s0
49 ; VI-NEXT: v_mov_b32_e32 v1, s1
50 ; VI-NEXT: v_mov_b32_e32 v2, s2
51 ; VI-NEXT: flat_store_dword v[0:1], v2
54 ; EG-LABEL: s_ctlz_zero_undef_i32:
56 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
57 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
60 ; EG-NEXT: ALU clause starting at 4:
61 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
62 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
63 ; EG-NEXT: FFBH_UINT * T1.X, KC0[2].Z,
65 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32:
66 ; GFX9-GISEL: ; %bb.0:
67 ; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
68 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
69 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
70 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
71 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4
72 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
73 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
74 ; GFX9-GISEL-NEXT: s_endpgm
75 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
76 store i32 %ctlz, ptr addrspace(1) %out, align 4
; Vector (per-work-item) case: loads an i32 from %valptr indexed by the
; work-item id, calls llvm.ctlz.i32 with the i1 flag set to true, and stores
; the result to %out. The checks below expect a single v_ffbh_u32 on the
; amdgcn targets and a single FFBH_UINT on r600.
80 define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
81 ; SI-LABEL: v_ctlz_zero_undef_i32:
83 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
84 ; SI-NEXT: s_mov_b32 s7, 0xf000
85 ; SI-NEXT: s_mov_b32 s10, 0
86 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
87 ; SI-NEXT: v_mov_b32_e32 v1, 0
88 ; SI-NEXT: s_mov_b32 s11, s7
89 ; SI-NEXT: s_waitcnt lgkmcnt(0)
90 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
91 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
92 ; SI-NEXT: s_mov_b32 s6, -1
93 ; SI-NEXT: s_mov_b32 s4, s0
94 ; SI-NEXT: s_mov_b32 s5, s1
95 ; SI-NEXT: s_waitcnt vmcnt(0)
96 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
97 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
100 ; VI-LABEL: v_ctlz_zero_undef_i32:
102 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
103 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
104 ; VI-NEXT: s_waitcnt lgkmcnt(0)
105 ; VI-NEXT: v_mov_b32_e32 v1, s3
106 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
107 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
108 ; VI-NEXT: flat_load_dword v0, v[0:1]
109 ; VI-NEXT: s_waitcnt vmcnt(0)
110 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
111 ; VI-NEXT: v_mov_b32_e32 v0, s0
112 ; VI-NEXT: v_mov_b32_e32 v1, s1
113 ; VI-NEXT: flat_store_dword v[0:1], v2
116 ; EG-LABEL: v_ctlz_zero_undef_i32:
118 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
120 ; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
121 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
124 ; EG-NEXT: Fetch clause starting at 6:
125 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
126 ; EG-NEXT: ALU clause starting at 8:
127 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
128 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
129 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
130 ; EG-NEXT: ALU clause starting at 11:
131 ; EG-NEXT: FFBH_UINT T0.X, T0.X,
132 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
133 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
135 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
136 ; GFX9-GISEL: ; %bb.0:
137 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
138 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
139 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
140 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
141 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7]
142 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
143 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
144 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
145 ; GFX9-GISEL-NEXT: s_endpgm
146 %tid = call i32 @llvm.amdgcn.workitem.id.x()
147 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
148 %val = load i32, ptr addrspace(1) %in.gep, align 4
149 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
150 store i32 %ctlz, ptr addrspace(1) %out, align 4
; <2 x i32> variant of the per-work-item test: loads a two-element vector
; from %valptr indexed by the work-item id, calls llvm.ctlz.v2i32 with the i1
; flag set to true, and stores the vector result to %out. The checks below
; expect one v_ffbh_u32 / FFBH_UINT per element (two in total).
154 define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
155 ; SI-LABEL: v_ctlz_zero_undef_v2i32:
157 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
158 ; SI-NEXT: s_mov_b32 s7, 0xf000
159 ; SI-NEXT: s_mov_b32 s10, 0
160 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
161 ; SI-NEXT: v_mov_b32_e32 v1, 0
162 ; SI-NEXT: s_mov_b32 s11, s7
163 ; SI-NEXT: s_waitcnt lgkmcnt(0)
164 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
165 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
166 ; SI-NEXT: s_mov_b32 s6, -1
167 ; SI-NEXT: s_mov_b32 s4, s0
168 ; SI-NEXT: s_mov_b32 s5, s1
169 ; SI-NEXT: s_waitcnt vmcnt(0)
170 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
171 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
172 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
175 ; VI-LABEL: v_ctlz_zero_undef_v2i32:
177 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
178 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
179 ; VI-NEXT: s_waitcnt lgkmcnt(0)
180 ; VI-NEXT: v_mov_b32_e32 v1, s3
181 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
182 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
183 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
184 ; VI-NEXT: v_mov_b32_e32 v3, s1
185 ; VI-NEXT: v_mov_b32_e32 v2, s0
186 ; VI-NEXT: s_waitcnt vmcnt(0)
187 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
188 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
189 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
192 ; EG-LABEL: v_ctlz_zero_undef_v2i32:
194 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
196 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
197 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
200 ; EG-NEXT: Fetch clause starting at 6:
201 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
202 ; EG-NEXT: ALU clause starting at 8:
203 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
204 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
205 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
206 ; EG-NEXT: ALU clause starting at 11:
207 ; EG-NEXT: FFBH_UINT * T0.Y, T0.Y,
208 ; EG-NEXT: FFBH_UINT T0.X, T0.X,
209 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
210 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
212 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
213 ; GFX9-GISEL: ; %bb.0:
214 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
215 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
216 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
217 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
218 ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
219 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
220 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
221 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
222 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
223 ; GFX9-GISEL-NEXT: s_endpgm
224 %tid = call i32 @llvm.amdgcn.workitem.id.x()
225 %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
226 %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
227 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
228 store <2 x i32> %ctlz, ptr addrspace(1) %out, align 8
; <4 x i32> variant of the per-work-item test: loads a four-element vector
; from %valptr indexed by the work-item id, calls llvm.ctlz.v4i32 with the i1
; flag set to true, and stores the vector result to %out. The checks below
; expect one v_ffbh_u32 / FFBH_UINT per element (four in total).
232 define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
233 ; SI-LABEL: v_ctlz_zero_undef_v4i32:
235 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
236 ; SI-NEXT: s_mov_b32 s7, 0xf000
237 ; SI-NEXT: s_mov_b32 s10, 0
238 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
239 ; SI-NEXT: v_mov_b32_e32 v1, 0
240 ; SI-NEXT: s_mov_b32 s11, s7
241 ; SI-NEXT: s_waitcnt lgkmcnt(0)
242 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
243 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
244 ; SI-NEXT: s_mov_b32 s6, -1
245 ; SI-NEXT: s_mov_b32 s4, s0
246 ; SI-NEXT: s_mov_b32 s5, s1
247 ; SI-NEXT: s_waitcnt vmcnt(0)
248 ; SI-NEXT: v_ffbh_u32_e32 v3, v3
249 ; SI-NEXT: v_ffbh_u32_e32 v2, v2
250 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
251 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
252 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
255 ; VI-LABEL: v_ctlz_zero_undef_v4i32:
257 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
258 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
259 ; VI-NEXT: s_waitcnt lgkmcnt(0)
260 ; VI-NEXT: v_mov_b32_e32 v1, s3
261 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
262 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
263 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
264 ; VI-NEXT: v_mov_b32_e32 v5, s1
265 ; VI-NEXT: v_mov_b32_e32 v4, s0
266 ; VI-NEXT: s_waitcnt vmcnt(0)
267 ; VI-NEXT: v_ffbh_u32_e32 v3, v3
268 ; VI-NEXT: v_ffbh_u32_e32 v2, v2
269 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
270 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
271 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
274 ; EG-LABEL: v_ctlz_zero_undef_v4i32:
276 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
278 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
279 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
282 ; EG-NEXT: Fetch clause starting at 6:
283 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
284 ; EG-NEXT: ALU clause starting at 8:
285 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
286 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
287 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
288 ; EG-NEXT: ALU clause starting at 11:
289 ; EG-NEXT: FFBH_UINT * T0.W, T0.W,
290 ; EG-NEXT: FFBH_UINT * T0.Z, T0.Z,
291 ; EG-NEXT: FFBH_UINT * T0.Y, T0.Y,
292 ; EG-NEXT: FFBH_UINT T0.X, T0.X,
293 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
294 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
296 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
297 ; GFX9-GISEL: ; %bb.0:
298 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
299 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
300 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0
301 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
302 ; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
303 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
304 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
305 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
306 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
307 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3
308 ; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
309 ; GFX9-GISEL-NEXT: s_endpgm
310 %tid = call i32 @llvm.amdgcn.workitem.id.x()
311 %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
312 %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
313 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
314 store <4 x i32> %ctlz, ptr addrspace(1) %out, align 16
; Scalar i8 case with a compare-and-select around the ctlz: %ret selects the
; ctlz result when %val is non-zero and the constant 32 otherwise. The checks
; below expect a shift left by 24 followed by a 32-bit find-first-bit, with no
; compare or select instructions.
; NOTE(review): the store writes %ctlz rather than %ret, so the icmp/select
; pair is dead and this test does not exercise the select that its name
; advertises. The v_*_with_select tests in this file store %ret. This looks
; unintentional -- confirm before changing, because storing %ret would require
; regenerating the assertions with update_llc_test_checks.py.
318 define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind {
319 ; SI-LABEL: s_ctlz_zero_undef_i8_with_select:
321 ; SI-NEXT: s_load_dword s4, s[2:3], 0xb
322 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
323 ; SI-NEXT: s_mov_b32 s3, 0xf000
324 ; SI-NEXT: s_waitcnt lgkmcnt(0)
325 ; SI-NEXT: s_lshl_b32 s2, s4, 24
326 ; SI-NEXT: s_flbit_i32_b32 s4, s2
327 ; SI-NEXT: s_mov_b32 s2, -1
328 ; SI-NEXT: v_mov_b32_e32 v0, s4
329 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
332 ; VI-LABEL: s_ctlz_zero_undef_i8_with_select:
334 ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
335 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
336 ; VI-NEXT: s_waitcnt lgkmcnt(0)
337 ; VI-NEXT: s_lshl_b32 s2, s4, 24
338 ; VI-NEXT: s_flbit_i32_b32 s2, s2
339 ; VI-NEXT: v_mov_b32_e32 v0, s0
340 ; VI-NEXT: v_mov_b32_e32 v1, s1
341 ; VI-NEXT: v_mov_b32_e32 v2, s2
342 ; VI-NEXT: flat_store_byte v[0:1], v2
345 ; EG-LABEL: s_ctlz_zero_undef_i8_with_select:
347 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
349 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
350 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
353 ; EG-NEXT: Fetch clause starting at 6:
354 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
355 ; EG-NEXT: ALU clause starting at 8:
356 ; EG-NEXT: MOV * T0.X, 0.0,
357 ; EG-NEXT: ALU clause starting at 9:
358 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
359 ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
360 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
361 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
362 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
363 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
364 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
365 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
366 ; EG-NEXT: LSHL T0.X, PV.W, PS,
367 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
368 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
369 ; EG-NEXT: MOV T0.Y, 0.0,
370 ; EG-NEXT: MOV * T0.Z, 0.0,
371 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
372 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
374 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select:
375 ; GFX9-GISEL: ; %bb.0:
376 ; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
377 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
378 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
379 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
380 ; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 24
381 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2
382 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
383 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
384 ; GFX9-GISEL-NEXT: s_endpgm
385 %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
386 %ctlz_ret = icmp ne i8 %val, 0
387 %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32
388 store i8 %ctlz, ptr addrspace(1) %out, align 4
; Scalar i16 case with a compare-and-select around the ctlz: %ret selects the
; ctlz result when %val is non-zero and the constant 32 otherwise. The checks
; below expect a shift left by 16 followed by a 32-bit find-first-bit, with no
; compare or select instructions.
; NOTE(review): the store writes %ctlz rather than %ret, so the icmp/select
; pair is dead and this test does not exercise the select that its name
; advertises. The v_*_with_select tests in this file store %ret. This looks
; unintentional -- confirm before changing, because storing %ret would require
; regenerating the assertions with update_llc_test_checks.py.
392 define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind {
393 ; SI-LABEL: s_ctlz_zero_undef_i16_with_select:
395 ; SI-NEXT: s_load_dword s4, s[2:3], 0xb
396 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
397 ; SI-NEXT: s_mov_b32 s3, 0xf000
398 ; SI-NEXT: s_waitcnt lgkmcnt(0)
399 ; SI-NEXT: s_lshl_b32 s2, s4, 16
400 ; SI-NEXT: s_flbit_i32_b32 s4, s2
401 ; SI-NEXT: s_mov_b32 s2, -1
402 ; SI-NEXT: v_mov_b32_e32 v0, s4
403 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
406 ; VI-LABEL: s_ctlz_zero_undef_i16_with_select:
408 ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
409 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
410 ; VI-NEXT: s_waitcnt lgkmcnt(0)
411 ; VI-NEXT: s_lshl_b32 s2, s4, 16
412 ; VI-NEXT: s_flbit_i32_b32 s2, s2
413 ; VI-NEXT: v_mov_b32_e32 v0, s0
414 ; VI-NEXT: v_mov_b32_e32 v1, s1
415 ; VI-NEXT: v_mov_b32_e32 v2, s2
416 ; VI-NEXT: flat_store_short v[0:1], v2
419 ; EG-LABEL: s_ctlz_zero_undef_i16_with_select:
421 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
423 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
424 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
427 ; EG-NEXT: Fetch clause starting at 6:
428 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
429 ; EG-NEXT: ALU clause starting at 8:
430 ; EG-NEXT: MOV * T0.X, 0.0,
431 ; EG-NEXT: ALU clause starting at 9:
432 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
433 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
434 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
435 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
436 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
437 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
438 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
439 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
440 ; EG-NEXT: LSHL T0.X, PV.W, PS,
441 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
442 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
443 ; EG-NEXT: MOV T0.Y, 0.0,
444 ; EG-NEXT: MOV * T0.Z, 0.0,
445 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
446 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
448 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select:
449 ; GFX9-GISEL: ; %bb.0:
450 ; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
451 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
452 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
453 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
454 ; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 16
455 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2
456 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
457 ; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1]
458 ; GFX9-GISEL-NEXT: s_endpgm
459 %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
460 %ctlz_ret = icmp ne i16 %val, 0
461 %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32
462 store i16 %ctlz, ptr addrspace(1) %out, align 4
; Scalar i32 case with a compare-and-select around the ctlz: %ret selects the
; ctlz result when %val is non-zero and the constant 32 otherwise. The checks
; below expect only a single find-first-bit instruction, with no compare or
; select instructions.
; NOTE(review): the store writes %ctlz rather than %ret, so the icmp/select
; pair is dead and this test does not exercise the select that its name
; advertises. The v_*_with_select tests in this file store %ret. This looks
; unintentional -- confirm before changing, because storing %ret would require
; regenerating the assertions with update_llc_test_checks.py.
466 define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind {
467 ; SI-LABEL: s_ctlz_zero_undef_i32_with_select:
469 ; SI-NEXT: s_load_dword s4, s[2:3], 0xb
470 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
471 ; SI-NEXT: s_mov_b32 s3, 0xf000
472 ; SI-NEXT: s_waitcnt lgkmcnt(0)
473 ; SI-NEXT: s_flbit_i32_b32 s4, s4
474 ; SI-NEXT: s_mov_b32 s2, -1
475 ; SI-NEXT: v_mov_b32_e32 v0, s4
476 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
479 ; VI-LABEL: s_ctlz_zero_undef_i32_with_select:
481 ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
482 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
483 ; VI-NEXT: s_waitcnt lgkmcnt(0)
484 ; VI-NEXT: s_flbit_i32_b32 s2, s4
485 ; VI-NEXT: v_mov_b32_e32 v0, s0
486 ; VI-NEXT: v_mov_b32_e32 v1, s1
487 ; VI-NEXT: v_mov_b32_e32 v2, s2
488 ; VI-NEXT: flat_store_dword v[0:1], v2
491 ; EG-LABEL: s_ctlz_zero_undef_i32_with_select:
493 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
494 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
497 ; EG-NEXT: ALU clause starting at 4:
498 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
499 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
500 ; EG-NEXT: FFBH_UINT * T1.X, KC0[2].Z,
502 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select:
503 ; GFX9-GISEL: ; %bb.0:
504 ; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
505 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
506 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
507 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
508 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4
509 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
510 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
511 ; GFX9-GISEL-NEXT: s_endpgm
512 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
513 %ctlz_ret = icmp ne i32 %val, 0
514 %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32
515 store i32 %ctlz, ptr addrspace(1) %out, align 4
; Scalar i64 case with a compare-and-select around the ctlz: %ret selects the
; ctlz result when %val is non-zero and the constant 32 otherwise. The checks
; below expect a single s_flbit_i32_b64 on the amdgcn targets; r600 builds the
; 64-bit count from two FFBH_UINT, an add of 32 and a CNDE_INT.
; NOTE(review): the store writes %ctlz rather than %ret, so the icmp/select
; pair is dead and this test does not exercise the select that its name
; advertises. The v_*_with_select tests in this file store %ret. This looks
; unintentional -- confirm before changing, because storing %ret would require
; regenerating the assertions with update_llc_test_checks.py.
; NOTE(review): the select fallback is 32 even though the operand is i64;
; confirm whether 64 was intended.
519 define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind {
520 ; SI-LABEL: s_ctlz_zero_undef_i64_with_select:
522 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
523 ; SI-NEXT: s_mov_b32 s7, 0xf000
524 ; SI-NEXT: s_mov_b32 s6, -1
525 ; SI-NEXT: s_waitcnt lgkmcnt(0)
526 ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3]
527 ; SI-NEXT: v_mov_b32_e32 v1, 0
528 ; SI-NEXT: s_mov_b32 s4, s0
529 ; SI-NEXT: s_mov_b32 s5, s1
530 ; SI-NEXT: v_mov_b32_e32 v0, s2
531 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
534 ; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
536 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
537 ; VI-NEXT: v_mov_b32_e32 v1, 0
538 ; VI-NEXT: s_waitcnt lgkmcnt(0)
539 ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
540 ; VI-NEXT: v_mov_b32_e32 v3, s1
541 ; VI-NEXT: v_mov_b32_e32 v0, s2
542 ; VI-NEXT: v_mov_b32_e32 v2, s0
543 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
546 ; EG-LABEL: s_ctlz_zero_undef_i64_with_select:
548 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
549 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
552 ; EG-NEXT: ALU clause starting at 4:
553 ; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W,
554 ; EG-NEXT: FFBH_UINT T1.W, KC0[3].X,
555 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
556 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
557 ; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W,
558 ; EG-NEXT: MOV T0.Y, 0.0,
559 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
560 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
562 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select:
563 ; GFX9-GISEL: ; %bb.0:
564 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
565 ; GFX9-GISEL-NEXT: s_mov_b32 s1, 0
566 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
567 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
568 ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7]
569 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
570 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
571 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
572 ; GFX9-GISEL-NEXT: s_endpgm
573 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
574 %ctlz_ret = icmp ne i64 %val, 0
575 %ret = select i1 %ctlz_ret, i64 %ctlz, i64 32
576 store i64 %ctlz, ptr addrspace(1) %out, align 4
; Loaded i8 case with a live compare-and-select: loads an i8 from %arrayidx,
; calls llvm.ctlz.i8 with the i1 flag set to true, selects the ctlz result
; when the loaded value is non-zero and the constant 32 otherwise, and stores
; the selected value (%ret) to %out.
; The checks below expect the value to be shifted left by 24 before a 32-bit
; find-first-bit, followed by an explicit compare against zero and a
; conditional move of 32 (v_cndmask on amdgcn, CNDE_INT on r600).
580 define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
581 ; SI-LABEL: v_ctlz_zero_undef_i8_with_select:
583 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
584 ; SI-NEXT: s_mov_b32 s7, 0xf000
585 ; SI-NEXT: s_mov_b32 s6, -1
586 ; SI-NEXT: s_mov_b32 s10, s6
587 ; SI-NEXT: s_mov_b32 s11, s7
588 ; SI-NEXT: s_waitcnt lgkmcnt(0)
589 ; SI-NEXT: s_mov_b32 s8, s2
590 ; SI-NEXT: s_mov_b32 s9, s3
591 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
592 ; SI-NEXT: s_mov_b32 s4, s0
593 ; SI-NEXT: s_mov_b32 s5, s1
594 ; SI-NEXT: s_waitcnt vmcnt(0)
595 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
596 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
597 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
598 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
599 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
602 ; VI-LABEL: v_ctlz_zero_undef_i8_with_select:
604 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
605 ; VI-NEXT: s_waitcnt lgkmcnt(0)
606 ; VI-NEXT: v_mov_b32_e32 v0, s2
607 ; VI-NEXT: v_mov_b32_e32 v1, s3
608 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
609 ; VI-NEXT: s_waitcnt vmcnt(0)
610 ; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
611 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
612 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
613 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
614 ; VI-NEXT: v_mov_b32_e32 v0, s0
615 ; VI-NEXT: v_mov_b32_e32 v1, s1
616 ; VI-NEXT: flat_store_byte v[0:1], v2
619 ; EG-LABEL: v_ctlz_zero_undef_i8_with_select:
621 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
623 ; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
624 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
627 ; EG-NEXT: Fetch clause starting at 6:
628 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
629 ; EG-NEXT: ALU clause starting at 8:
630 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
631 ; EG-NEXT: ALU clause starting at 9:
632 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
633 ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
634 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
635 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
636 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
637 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
638 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
639 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
640 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
641 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
642 ; EG-NEXT: LSHL T0.X, PV.W, PS,
643 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
644 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
645 ; EG-NEXT: MOV T0.Y, 0.0,
646 ; EG-NEXT: MOV * T0.Z, 0.0,
647 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
648 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
650 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select:
651 ; GFX9-GISEL: ; %bb.0:
652 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
653 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
654 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
655 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7]
656 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
657 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1
658 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
659 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
660 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
661 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
662 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5]
663 ; GFX9-GISEL-NEXT: s_endpgm
664 %val = load i8, ptr addrspace(1) %arrayidx, align 1
665 %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
666 %ctlz_ret = icmp ne i8 %val, 0
667 %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32
668 store i8 %ret, ptr addrspace(1) %out, align 4
; Loaded i16 case with a live compare-and-select: loads an i16 from %arrayidx
; with align 1, calls llvm.ctlz.i16 with the i1 flag set to true, selects the
; ctlz result when the loaded value is non-zero and the constant 32 otherwise,
; and stores the selected value (%ret) to %out.
; The amdgcn checks below expect the align-1 load to be split into two byte
; loads that are recombined with a shift and an or; r600 uses a single
; VTX_READ_16. All targets then shift left by 16 before a 32-bit
; find-first-bit and keep an explicit compare/conditional-move with 32.
672 define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
673 ; SI-LABEL: v_ctlz_zero_undef_i16_with_select:
675 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
676 ; SI-NEXT: s_mov_b32 s7, 0xf000
677 ; SI-NEXT: s_mov_b32 s6, -1
678 ; SI-NEXT: s_mov_b32 s10, s6
679 ; SI-NEXT: s_mov_b32 s11, s7
680 ; SI-NEXT: s_waitcnt lgkmcnt(0)
681 ; SI-NEXT: s_mov_b32 s8, s2
682 ; SI-NEXT: s_mov_b32 s9, s3
683 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
684 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
685 ; SI-NEXT: s_mov_b32 s4, s0
686 ; SI-NEXT: s_mov_b32 s5, s1
687 ; SI-NEXT: s_waitcnt vmcnt(1)
688 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
689 ; SI-NEXT: s_waitcnt vmcnt(0)
690 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
691 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
692 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
693 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
694 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
695 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
698 ; VI-LABEL: v_ctlz_zero_undef_i16_with_select:
700 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
701 ; VI-NEXT: s_waitcnt lgkmcnt(0)
702 ; VI-NEXT: s_add_u32 s4, s2, 1
703 ; VI-NEXT: s_addc_u32 s5, s3, 0
704 ; VI-NEXT: v_mov_b32_e32 v2, s4
705 ; VI-NEXT: v_mov_b32_e32 v0, s2
706 ; VI-NEXT: v_mov_b32_e32 v3, s5
707 ; VI-NEXT: v_mov_b32_e32 v1, s3
708 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
709 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
710 ; VI-NEXT: s_waitcnt vmcnt(1)
711 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2
712 ; VI-NEXT: s_waitcnt vmcnt(0)
713 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
714 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
715 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
716 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
717 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
718 ; VI-NEXT: v_mov_b32_e32 v0, s0
719 ; VI-NEXT: v_mov_b32_e32 v1, s1
720 ; VI-NEXT: flat_store_short v[0:1], v2
723 ; EG-LABEL: v_ctlz_zero_undef_i16_with_select:
725 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
727 ; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
728 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
731 ; EG-NEXT: Fetch clause starting at 6:
732 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
733 ; EG-NEXT: ALU clause starting at 8:
734 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
735 ; EG-NEXT: ALU clause starting at 9:
736 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
737 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
738 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
739 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
740 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
741 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
742 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
743 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
744 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
745 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
746 ; EG-NEXT: LSHL T0.X, PV.W, PS,
747 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
748 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
749 ; EG-NEXT: MOV T0.Y, 0.0,
750 ; EG-NEXT: MOV * T0.Z, 0.0,
751 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
752 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
754 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select:
755 ; GFX9-GISEL: ; %bb.0:
756 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
757 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
758 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
759 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7]
760 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1
761 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
762 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
763 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1
764 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
765 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
766 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
767 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
768 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5]
769 ; GFX9-GISEL-NEXT: s_endpgm
770 %val = load i16, ptr addrspace(1) %arrayidx, align 1
771 %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
772 %ctlz_ret = icmp ne i16 %val, 0
773 %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32
774 store i16 %ret, ptr addrspace(1) %out, align 4
778 define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
779 ; SI-LABEL: v_ctlz_zero_undef_i32_with_select:
781 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
782 ; SI-NEXT: s_mov_b32 s7, 0xf000
783 ; SI-NEXT: s_mov_b32 s6, -1
784 ; SI-NEXT: s_mov_b32 s10, s6
785 ; SI-NEXT: s_mov_b32 s11, s7
786 ; SI-NEXT: s_waitcnt lgkmcnt(0)
787 ; SI-NEXT: s_mov_b32 s8, s2
788 ; SI-NEXT: s_mov_b32 s9, s3
789 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
790 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3
791 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
792 ; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2
793 ; SI-NEXT: s_mov_b32 s4, s0
794 ; SI-NEXT: s_mov_b32 s5, s1
795 ; SI-NEXT: s_waitcnt vmcnt(3)
796 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
797 ; SI-NEXT: s_waitcnt vmcnt(2)
798 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
799 ; SI-NEXT: s_waitcnt vmcnt(1)
800 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
801 ; SI-NEXT: s_waitcnt vmcnt(0)
802 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
803 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
804 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
805 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
806 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
807 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
810 ; VI-LABEL: v_ctlz_zero_undef_i32_with_select:
812 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
813 ; VI-NEXT: s_waitcnt lgkmcnt(0)
814 ; VI-NEXT: s_add_u32 s4, s2, 3
815 ; VI-NEXT: s_addc_u32 s5, s3, 0
816 ; VI-NEXT: v_mov_b32_e32 v2, s4
817 ; VI-NEXT: v_mov_b32_e32 v3, s5
818 ; VI-NEXT: s_add_u32 s4, s2, 2
819 ; VI-NEXT: v_mov_b32_e32 v0, s2
820 ; VI-NEXT: s_addc_u32 s5, s3, 0
821 ; VI-NEXT: v_mov_b32_e32 v1, s3
822 ; VI-NEXT: s_add_u32 s2, s2, 1
823 ; VI-NEXT: s_addc_u32 s3, s3, 0
824 ; VI-NEXT: v_mov_b32_e32 v4, s4
825 ; VI-NEXT: v_mov_b32_e32 v7, s3
826 ; VI-NEXT: v_mov_b32_e32 v5, s5
827 ; VI-NEXT: v_mov_b32_e32 v6, s2
828 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
829 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
830 ; VI-NEXT: flat_load_ubyte v4, v[6:7]
831 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
832 ; VI-NEXT: s_waitcnt vmcnt(3)
833 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
834 ; VI-NEXT: s_waitcnt vmcnt(2)
835 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
836 ; VI-NEXT: s_waitcnt vmcnt(1)
837 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
838 ; VI-NEXT: s_waitcnt vmcnt(0)
839 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
840 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
841 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
842 ; VI-NEXT: v_min_u32_e32 v2, 32, v0
843 ; VI-NEXT: v_mov_b32_e32 v0, s0
844 ; VI-NEXT: v_mov_b32_e32 v1, s1
845 ; VI-NEXT: flat_store_dword v[0:1], v2
848 ; EG-LABEL: v_ctlz_zero_undef_i32_with_select:
850 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
852 ; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
853 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
856 ; EG-NEXT: Fetch clause starting at 6:
857 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
858 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
859 ; EG-NEXT: ALU clause starting at 10:
860 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
861 ; EG-NEXT: ALU clause starting at 11:
862 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
863 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
864 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
865 ; EG-NEXT: FFBH_UINT * T1.W, PV.W,
866 ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
867 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
868 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
870 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select:
871 ; GFX9-GISEL: ; %bb.0:
872 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
873 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
874 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
875 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7]
876 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1
877 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3
878 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2
879 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
880 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
881 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
882 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
883 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
884 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
885 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
886 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
887 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
888 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
889 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
890 ; GFX9-GISEL-NEXT: s_endpgm
891 %val = load i32, ptr addrspace(1) %arrayidx, align 1
892 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
893 %ctlz_ret = icmp ne i32 %val, 0
894 %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32
895 store i32 %ret, ptr addrspace(1) %out, align 4
; i64 form of the ctlz-plus-select pattern: load an i64 with align 1, call
; llvm.ctlz.i64 with its second operand set to true, and pick the constant 64
; instead of the ctlz result when the loaded value is zero.
; In the expected output for SI and VI the zero case shows up as a final
; v_min_u32 against 64; the EG output uses CNDE_INT, and the GFX9-GISEL output
; keeps an explicit v_cmp_ne_u64 followed by v_cndmask.
; The assertions in this function are autogenerated (see the first line of this
; file); regenerate them with the script named there instead of hand-editing.
899 define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
900 ; SI-LABEL: v_ctlz_zero_undef_i64_with_select:
902 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
903 ; SI-NEXT: s_mov_b32 s3, 0xf000
904 ; SI-NEXT: s_mov_b32 s2, -1
905 ; SI-NEXT: s_mov_b32 s10, s2
906 ; SI-NEXT: s_mov_b32 s11, s3
907 ; SI-NEXT: s_waitcnt lgkmcnt(0)
908 ; SI-NEXT: s_mov_b32 s8, s6
909 ; SI-NEXT: s_mov_b32 s9, s7
910 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:5
911 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:7
912 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
913 ; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:1
914 ; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2
915 ; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:3
916 ; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4
917 ; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6
918 ; SI-NEXT: s_mov_b32 s0, s4
919 ; SI-NEXT: s_mov_b32 s1, s5
920 ; SI-NEXT: s_waitcnt vmcnt(7)
921 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
922 ; SI-NEXT: s_waitcnt vmcnt(6)
923 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
924 ; SI-NEXT: s_waitcnt vmcnt(4)
925 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
926 ; SI-NEXT: s_waitcnt vmcnt(2)
927 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
928 ; SI-NEXT: s_waitcnt vmcnt(1)
929 ; SI-NEXT: v_or_b32_e32 v0, v0, v6
930 ; SI-NEXT: s_waitcnt vmcnt(0)
931 ; SI-NEXT: v_or_b32_e32 v1, v1, v7
932 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
933 ; SI-NEXT: v_or_b32_e32 v3, v5, v4
934 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
935 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
936 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
937 ; SI-NEXT: v_or_b32_e32 v1, v3, v2
938 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
939 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
940 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1
941 ; SI-NEXT: v_min_u32_e32 v0, v1, v0
942 ; SI-NEXT: v_min_u32_e32 v0, 64, v0
943 ; SI-NEXT: v_mov_b32_e32 v1, 0
944 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
947 ; VI-LABEL: v_ctlz_zero_undef_i64_with_select:
949 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
950 ; VI-NEXT: s_waitcnt lgkmcnt(0)
951 ; VI-NEXT: s_add_u32 s4, s2, 5
952 ; VI-NEXT: s_addc_u32 s5, s3, 0
953 ; VI-NEXT: v_mov_b32_e32 v0, s4
954 ; VI-NEXT: v_mov_b32_e32 v1, s5
955 ; VI-NEXT: s_add_u32 s4, s2, 4
956 ; VI-NEXT: s_addc_u32 s5, s3, 0
957 ; VI-NEXT: v_mov_b32_e32 v2, s4
958 ; VI-NEXT: v_mov_b32_e32 v3, s5
959 ; VI-NEXT: s_add_u32 s4, s2, 7
960 ; VI-NEXT: s_addc_u32 s5, s3, 0
961 ; VI-NEXT: v_mov_b32_e32 v4, s4
962 ; VI-NEXT: v_mov_b32_e32 v5, s5
963 ; VI-NEXT: s_add_u32 s4, s2, 6
964 ; VI-NEXT: s_addc_u32 s5, s3, 0
965 ; VI-NEXT: v_mov_b32_e32 v7, s5
966 ; VI-NEXT: v_mov_b32_e32 v6, s4
967 ; VI-NEXT: s_add_u32 s4, s2, 3
968 ; VI-NEXT: s_addc_u32 s5, s3, 0
969 ; VI-NEXT: v_mov_b32_e32 v9, s5
970 ; VI-NEXT: v_mov_b32_e32 v8, s4
971 ; VI-NEXT: s_add_u32 s4, s2, 2
972 ; VI-NEXT: s_addc_u32 s5, s3, 0
973 ; VI-NEXT: v_mov_b32_e32 v11, s5
974 ; VI-NEXT: v_mov_b32_e32 v10, s4
975 ; VI-NEXT: s_add_u32 s4, s2, 1
976 ; VI-NEXT: flat_load_ubyte v12, v[0:1]
977 ; VI-NEXT: flat_load_ubyte v13, v[2:3]
978 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
979 ; VI-NEXT: flat_load_ubyte v5, v[6:7]
980 ; VI-NEXT: s_addc_u32 s5, s3, 0
981 ; VI-NEXT: v_mov_b32_e32 v0, s4
982 ; VI-NEXT: flat_load_ubyte v6, v[8:9]
983 ; VI-NEXT: v_mov_b32_e32 v2, s2
984 ; VI-NEXT: v_mov_b32_e32 v1, s5
985 ; VI-NEXT: v_mov_b32_e32 v3, s3
986 ; VI-NEXT: flat_load_ubyte v7, v[10:11]
987 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
988 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
989 ; VI-NEXT: v_mov_b32_e32 v1, 0
990 ; VI-NEXT: s_waitcnt vmcnt(7)
991 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12
992 ; VI-NEXT: s_waitcnt vmcnt(6)
993 ; VI-NEXT: v_or_b32_e32 v3, v3, v13
994 ; VI-NEXT: s_waitcnt vmcnt(5)
995 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
996 ; VI-NEXT: s_waitcnt vmcnt(4)
997 ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
998 ; VI-NEXT: v_or_b32_e32 v3, v4, v3
999 ; VI-NEXT: v_ffbh_u32_e32 v3, v3
1000 ; VI-NEXT: s_waitcnt vmcnt(3)
1001 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
1002 ; VI-NEXT: s_waitcnt vmcnt(2)
1003 ; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1004 ; VI-NEXT: s_waitcnt vmcnt(1)
1005 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1006 ; VI-NEXT: s_waitcnt vmcnt(0)
1007 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
1008 ; VI-NEXT: v_or_b32_e32 v0, v4, v0
1009 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1010 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1011 ; VI-NEXT: v_min_u32_e32 v0, v0, v3
1012 ; VI-NEXT: v_mov_b32_e32 v3, s1
1013 ; VI-NEXT: v_min_u32_e32 v0, 64, v0
1014 ; VI-NEXT: v_mov_b32_e32 v2, s0
1015 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1018 ; EG-LABEL: v_ctlz_zero_undef_i64_with_select:
1020 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
1022 ; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
1023 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1026 ; EG-NEXT: Fetch clause starting at 6:
1027 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
1028 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 4, #1
1029 ; EG-NEXT: VTX_READ_16 T3.X, T0.X, 6, #1
1030 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1031 ; EG-NEXT: ALU clause starting at 14:
1032 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1033 ; EG-NEXT: ALU clause starting at 15:
1034 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1035 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1036 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
1037 ; EG-NEXT: FFBH_UINT T1.W, PV.W,
1038 ; EG-NEXT: LSHL * T2.W, T3.X, literal.x,
1039 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1040 ; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W,
1041 ; EG-NEXT: OR_INT * T1.W, PS, T2.X,
1042 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1043 ; EG-NEXT: FFBH_UINT T2.W, PS,
1044 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
1045 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1046 ; EG-NEXT: CNDE_INT T0.X, T1.W, PS, PV.W,
1047 ; EG-NEXT: MOV T0.Y, 0.0,
1048 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1049 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1051 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select:
1052 ; GFX9-GISEL: ; %bb.0:
1053 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1054 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1055 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1056 ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7]
1057 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[6:7] offset:1
1058 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[6:7] offset:2
1059 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[6:7] offset:3
1060 ; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[6:7] offset:4
1061 ; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[6:7] offset:5
1062 ; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[6:7] offset:6
1063 ; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[6:7] offset:7
1064 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6)
1065 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0
1066 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5)
1067 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1068 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4)
1069 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
1070 ; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0
1071 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
1072 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5
1073 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
1074 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v7
1075 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1076 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v8, 24, v5
1077 ; GFX9-GISEL-NEXT: v_or3_b32 v3, v0, v4, 0
1078 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v2
1079 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v4, v3
1080 ; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0
1081 ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
1082 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v4, v0
1083 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
1084 ; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
1085 ; GFX9-GISEL-NEXT: s_endpgm
; The load is only byte-aligned; the expected SI, VI and GFX9-GISEL output
; above assembles the value from eight byte loads (four VTX_READ_16 on EG).
1086 %val = load i64, ptr addrspace(1) %arrayidx, align 1
1087 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
; Replace the ctlz result with 64 when the input is zero.
1088 %ctlz_ret = icmp ne i64 %val, 0
1089 %ret = select i1 %ctlz_ret, i64 %ctlz, i64 64
1090 store i64 %ret, ptr addrspace(1) %out, align 4
; Per-workitem i8 ctlz: index %valptr by the workitem id x, load one i8, call
; llvm.ctlz.i8 with its second operand set to true and store the i8 result to
; %out (not indexed by the workitem id).
; For every target checked here, the expected output shifts the loaded byte
; left by 24 before the 32-bit find-first-bit instruction (v_ffbh_u32 / FFBH_UINT).
; Assertions are autogenerated; regenerate instead of hand-editing.
1094 define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1095 ; SI-LABEL: v_ctlz_zero_undef_i8:
1097 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1098 ; SI-NEXT: s_mov_b32 s7, 0xf000
1099 ; SI-NEXT: v_mov_b32_e32 v1, 0
1100 ; SI-NEXT: s_mov_b32 s10, 0
1101 ; SI-NEXT: s_mov_b32 s11, s7
1102 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1103 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1104 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1105 ; SI-NEXT: s_mov_b32 s6, -1
1106 ; SI-NEXT: s_mov_b32 s4, s0
1107 ; SI-NEXT: s_mov_b32 s5, s1
1108 ; SI-NEXT: s_waitcnt vmcnt(0)
1109 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
1110 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1111 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1114 ; VI-LABEL: v_ctlz_zero_undef_i8:
1116 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1117 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1118 ; VI-NEXT: v_mov_b32_e32 v1, s3
1119 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1120 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1121 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1122 ; VI-NEXT: s_waitcnt vmcnt(0)
1123 ; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
1124 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1125 ; VI-NEXT: v_mov_b32_e32 v0, s0
1126 ; VI-NEXT: v_mov_b32_e32 v1, s1
1127 ; VI-NEXT: flat_store_byte v[0:1], v2
1130 ; EG-LABEL: v_ctlz_zero_undef_i8:
1132 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1134 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
1135 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1138 ; EG-NEXT: Fetch clause starting at 6:
1139 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1140 ; EG-NEXT: ALU clause starting at 8:
1141 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1142 ; EG-NEXT: ALU clause starting at 9:
1143 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1144 ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
1145 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
1146 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1147 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1148 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1149 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1150 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1151 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1152 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1153 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1154 ; EG-NEXT: MOV T0.Y, 0.0,
1155 ; EG-NEXT: MOV * T0.Z, 0.0,
1156 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1157 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1159 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8:
1160 ; GFX9-GISEL: ; %bb.0:
1161 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1162 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
1163 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1164 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6
1165 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7
1166 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
1167 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
1168 ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
1169 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1170 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1171 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
1172 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
1173 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5]
1174 ; GFX9-GISEL-NEXT: s_endpgm
1175 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1176 %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
1177 %val = load i8, ptr addrspace(1) %in.gep
1178 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
1179 store i8 %ctlz, ptr addrspace(1) %out
; i64 ctlz of a kernel argument: %val is passed directly as an argument (after
; an unnamed [8 x i32] argument), llvm.ctlz.i64 is called with its second
; operand set to true, and the full i64 result is stored to %out.
; The expected SI, VI and GFX9-GISEL output is a single s_flbit_i32_b64 with a
; zero second dword in the stored pair; the EG output uses two FFBH_UINT
; combined through ADD_INT 32 and CNDE_INT.
; Assertions are autogenerated; regenerate instead of hand-editing.
1183 define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
1184 ; SI-LABEL: s_ctlz_zero_undef_i64:
1186 ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13
1187 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
1188 ; SI-NEXT: s_mov_b32 s3, 0xf000
1189 ; SI-NEXT: s_mov_b32 s2, -1
1190 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1191 ; SI-NEXT: s_flbit_i32_b64 s4, s[4:5]
1192 ; SI-NEXT: v_mov_b32_e32 v1, 0
1193 ; SI-NEXT: v_mov_b32_e32 v0, s4
1194 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1197 ; VI-LABEL: s_ctlz_zero_undef_i64:
1199 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c
1200 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24
1201 ; VI-NEXT: v_mov_b32_e32 v1, 0
1202 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1203 ; VI-NEXT: s_flbit_i32_b64 s0, s[0:1]
1204 ; VI-NEXT: v_mov_b32_e32 v2, s2
1205 ; VI-NEXT: v_mov_b32_e32 v0, s0
1206 ; VI-NEXT: v_mov_b32_e32 v3, s3
1207 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1210 ; EG-LABEL: s_ctlz_zero_undef_i64:
1212 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1213 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1216 ; EG-NEXT: ALU clause starting at 4:
1217 ; EG-NEXT: FFBH_UINT * T0.W, KC0[4].W,
1218 ; EG-NEXT: FFBH_UINT T1.W, KC0[5].X,
1219 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
1220 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1221 ; EG-NEXT: CNDE_INT T0.X, KC0[5].X, PS, PV.W,
1222 ; EG-NEXT: MOV T0.Y, 0.0,
1223 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1224 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1226 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64:
1227 ; GFX9-GISEL: ; %bb.0:
1228 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c
1229 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
1230 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0
1231 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
1232 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1233 ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[0:1]
1234 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
1235 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
1236 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1237 ; GFX9-GISEL-NEXT: s_endpgm
1238 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1239 store i64 %ctlz, ptr addrspace(1) %out
; Same kernel-argument i64 ctlz as above in this file's s_ctlz_zero_undef_i64,
; but without the padding argument and with the i64 result truncated to i32
; before the store, so only one dword is written to %out.
; The expected SI, VI and GFX9-GISEL output is s_flbit_i32_b64 followed by a
; dword store; the EG output has no MOV of a zero second component.
; Assertions are autogenerated; regenerate instead of hand-editing.
1243 define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind {
1244 ; SI-LABEL: s_ctlz_zero_undef_i64_trunc:
1246 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1247 ; SI-NEXT: s_mov_b32 s7, 0xf000
1248 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1249 ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3]
1250 ; SI-NEXT: s_mov_b32 s6, -1
1251 ; SI-NEXT: s_mov_b32 s4, s0
1252 ; SI-NEXT: s_mov_b32 s5, s1
1253 ; SI-NEXT: v_mov_b32_e32 v0, s2
1254 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1257 ; VI-LABEL: s_ctlz_zero_undef_i64_trunc:
1259 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1260 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1261 ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
1262 ; VI-NEXT: v_mov_b32_e32 v0, s0
1263 ; VI-NEXT: v_mov_b32_e32 v1, s1
1264 ; VI-NEXT: v_mov_b32_e32 v2, s2
1265 ; VI-NEXT: flat_store_dword v[0:1], v2
1268 ; EG-LABEL: s_ctlz_zero_undef_i64_trunc:
1270 ; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
1271 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1274 ; EG-NEXT: ALU clause starting at 4:
1275 ; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W,
1276 ; EG-NEXT: FFBH_UINT T1.W, KC0[3].X,
1277 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
1278 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1279 ; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W,
1280 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1281 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1283 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
1284 ; GFX9-GISEL: ; %bb.0:
1285 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1286 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1287 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1288 ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7]
1289 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
1290 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
1291 ; GFX9-GISEL-NEXT: s_endpgm
1292 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1293 %trunc = trunc i64 %ctlz to i32
1294 store i32 %trunc, ptr addrspace(1) %out
; Per-workitem i64 ctlz: both %in and %out are indexed as i64 arrays by the
; workitem id x; the loaded value is passed to llvm.ctlz.i64 with its second
; operand set to true and the i64 result is stored to the matching slot.
; The expected SI, VI and GFX9-GISEL output runs v_ffbh_u32 on each register of
; the loaded pair, adds 32 to the count from the first register, and merges the
; two with v_min_u32; the EG output merges with CNDE_INT instead.
; Assertions are autogenerated; regenerate instead of hand-editing.
1298 define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1299 ; SI-LABEL: v_ctlz_zero_undef_i64:
1301 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1302 ; SI-NEXT: s_mov_b32 s7, 0xf000
1303 ; SI-NEXT: s_mov_b32 s6, 0
1304 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1305 ; SI-NEXT: v_mov_b32_e32 v1, 0
1306 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1307 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1308 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
1309 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1310 ; SI-NEXT: s_waitcnt vmcnt(0)
1311 ; SI-NEXT: v_ffbh_u32_e32 v2, v2
1312 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
1313 ; SI-NEXT: v_ffbh_u32_e32 v3, v3
1314 ; SI-NEXT: v_min_u32_e32 v2, v2, v3
1315 ; SI-NEXT: v_mov_b32_e32 v3, v1
1316 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
1319 ; VI-LABEL: v_ctlz_zero_undef_i64:
1321 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1322 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
1323 ; VI-NEXT: v_mov_b32_e32 v2, 0
1324 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1325 ; VI-NEXT: v_mov_b32_e32 v1, s3
1326 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3
1327 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1328 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1329 ; VI-NEXT: v_mov_b32_e32 v4, s1
1330 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3
1331 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
1332 ; VI-NEXT: s_waitcnt vmcnt(0)
1333 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1334 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1335 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
1336 ; VI-NEXT: v_min_u32_e32 v1, v0, v1
1337 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
1340 ; EG-LABEL: v_ctlz_zero_undef_i64:
1342 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1344 ; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
1345 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1348 ; EG-NEXT: Fetch clause starting at 6:
1349 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
1350 ; EG-NEXT: ALU clause starting at 8:
1351 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1352 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1353 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1354 ; EG-NEXT: ALU clause starting at 11:
1355 ; EG-NEXT: FFBH_UINT * T1.W, T0.X,
1356 ; EG-NEXT: FFBH_UINT T2.W, T0.Y,
1357 ; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
1358 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1359 ; EG-NEXT: CNDE_INT T0.X, T0.Y, PS, PV.W,
1360 ; EG-NEXT: MOV T0.Y, 0.0,
1361 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1362 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
1363 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1365 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
1366 ; GFX9-GISEL: ; %bb.0:
1367 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1368 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1369 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1370 ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
1371 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1372 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
1373 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
1374 ; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0
1375 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0
1376 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1377 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1378 ; GFX9-GISEL-NEXT: s_endpgm
1379 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1380 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
1381 %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
1382 %val = load i64, ptr addrspace(1) %in.gep
1383 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1384 store i64 %ctlz, ptr addrspace(1) %out.gep
; Per-workitem i64 ctlz with a truncated result: %in is indexed as an i64 array
; and %out as an i32 array by the workitem id x; the i64 ctlz result (second
; operand true) is truncated to i32 before being stored.
; The expected output therefore scales the workitem id twice (shift by 3 for
; the load address, shift by 2 for the store address) and stores one dword.
; Assertions are autogenerated; regenerate instead of hand-editing.
1388 define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1389 ; SI-LABEL: v_ctlz_zero_undef_i64_trunc:
1391 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1392 ; SI-NEXT: s_mov_b32 s7, 0xf000
1393 ; SI-NEXT: s_mov_b32 s6, 0
1394 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
1395 ; SI-NEXT: v_mov_b32_e32 v2, 0
1396 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1397 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1398 ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
1399 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1400 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1401 ; SI-NEXT: s_waitcnt vmcnt(0)
1402 ; SI-NEXT: v_ffbh_u32_e32 v0, v3
1403 ; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
1404 ; SI-NEXT: v_ffbh_u32_e32 v3, v4
1405 ; SI-NEXT: v_min_u32_e32 v0, v0, v3
1406 ; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
1409 ; VI-LABEL: v_ctlz_zero_undef_i64_trunc:
1411 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1412 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
1413 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1414 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1415 ; VI-NEXT: v_mov_b32_e32 v2, s3
1416 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
1417 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1418 ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
1419 ; VI-NEXT: v_mov_b32_e32 v4, s1
1420 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
1421 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
1422 ; VI-NEXT: s_waitcnt vmcnt(0)
1423 ; VI-NEXT: v_ffbh_u32_e32 v0, v1
1424 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1425 ; VI-NEXT: v_ffbh_u32_e32 v1, v2
1426 ; VI-NEXT: v_min_u32_e32 v0, v0, v1
1427 ; VI-NEXT: flat_store_dword v[3:4], v0
1430 ; EG-LABEL: v_ctlz_zero_undef_i64_trunc:
1432 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1434 ; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
1435 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1438 ; EG-NEXT: Fetch clause starting at 6:
1439 ; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
1440 ; EG-NEXT: ALU clause starting at 8:
1441 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1442 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1443 ; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W,
1444 ; EG-NEXT: ALU clause starting at 11:
1445 ; EG-NEXT: FFBH_UINT * T0.W, T1.X,
1446 ; EG-NEXT: LSHL T0.Z, T0.X, literal.x,
1447 ; EG-NEXT: FFBH_UINT T1.W, T1.Y,
1448 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.y,
1449 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
1450 ; EG-NEXT: CNDE_INT T0.X, T1.Y, PS, PV.W,
1451 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z,
1452 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
1453 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1455 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
1456 ; GFX9-GISEL: ; %bb.0:
1457 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1458 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
1459 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1460 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1461 ; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7]
1462 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1463 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
1464 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
1465 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 32, v1
1466 ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v2, v1
1467 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
1468 ; GFX9-GISEL-NEXT: s_endpgm
1469 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1470 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
1471 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
1472 %val = load i64, ptr addrspace(1) %in.gep
1473 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1474 %trunc = trunc i64 %ctlz to i32
1475 store i32 %trunc, ptr addrspace(1) %out.gep
; ctlz guarded by "== 0 ? -1 : ctlz": each workitem loads an i32 from %valptr
; (indexed by workitem id x), calls llvm.ctlz.i32 with its second operand set
; to true, and selects -1 when the loaded value equals zero.
; The expected SI and VI output contains only v_ffbh_u32, with no compare or
; select; the EG output uses CNDE_INT with a -1 literal and the GFX9-GISEL
; output keeps v_cmp_eq_u32 plus v_cndmask.
; Assertions are autogenerated; regenerate instead of hand-editing.
1479 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1480 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1482 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1483 ; SI-NEXT: s_mov_b32 s7, 0xf000
1484 ; SI-NEXT: s_mov_b32 s10, 0
1485 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1486 ; SI-NEXT: v_mov_b32_e32 v1, 0
1487 ; SI-NEXT: s_mov_b32 s11, s7
1488 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1489 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1490 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1491 ; SI-NEXT: s_mov_b32 s6, -1
1492 ; SI-NEXT: s_mov_b32 s4, s0
1493 ; SI-NEXT: s_mov_b32 s5, s1
1494 ; SI-NEXT: s_waitcnt vmcnt(0)
1495 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1496 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1499 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1501 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1502 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1503 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1504 ; VI-NEXT: v_mov_b32_e32 v1, s3
1505 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1506 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1507 ; VI-NEXT: flat_load_dword v0, v[0:1]
1508 ; VI-NEXT: s_waitcnt vmcnt(0)
1509 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1510 ; VI-NEXT: v_mov_b32_e32 v0, s0
1511 ; VI-NEXT: v_mov_b32_e32 v1, s1
1512 ; VI-NEXT: flat_store_dword v[0:1], v2
1515 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1517 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1519 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
1520 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1523 ; EG-NEXT: Fetch clause starting at 6:
1524 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1525 ; EG-NEXT: ALU clause starting at 8:
1526 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1527 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1528 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1529 ; EG-NEXT: ALU clause starting at 11:
1530 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1531 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
1532 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1533 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1535 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1536 ; GFX9-GISEL: ; %bb.0:
1537 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1538 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1539 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1540 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7]
1541 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1542 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1543 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1544 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
1545 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1546 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
1547 ; GFX9-GISEL-NEXT: s_endpgm
1548 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1549 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1550 %val = load i32, ptr addrspace(1) %in.gep
1551 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; -1 is the true operand here because the predicate is "eq".
1552 %cmp = icmp eq i32 %val, 0
1553 %sel = select i1 %cmp, i32 -1, i32 %ctlz
1554 store i32 %sel, ptr addrspace(1) %out
; ctlz guarded by "!= 0 ? ctlz : -1": same shape as the "eq" form of this test,
; but with the inverted predicate and swapped select operands, so -1 is still
; chosen exactly when the loaded i32 is zero.
; The expected SI and VI output contains only v_ffbh_u32, with no compare or
; select; the EG output uses CNDE_INT with a -1 literal and the GFX9-GISEL
; output keeps v_cmp_ne_u32 plus v_cndmask.
; Assertions are autogenerated; regenerate instead of hand-editing.
1558 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1559 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1561 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1562 ; SI-NEXT: s_mov_b32 s7, 0xf000
1563 ; SI-NEXT: s_mov_b32 s10, 0
1564 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1565 ; SI-NEXT: v_mov_b32_e32 v1, 0
1566 ; SI-NEXT: s_mov_b32 s11, s7
1567 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1568 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1569 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1570 ; SI-NEXT: s_mov_b32 s6, -1
1571 ; SI-NEXT: s_mov_b32 s4, s0
1572 ; SI-NEXT: s_mov_b32 s5, s1
1573 ; SI-NEXT: s_waitcnt vmcnt(0)
1574 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1575 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1578 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1580 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1581 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1582 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1583 ; VI-NEXT: v_mov_b32_e32 v1, s3
1584 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1585 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1586 ; VI-NEXT: flat_load_dword v0, v[0:1]
1587 ; VI-NEXT: s_waitcnt vmcnt(0)
1588 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1589 ; VI-NEXT: v_mov_b32_e32 v0, s0
1590 ; VI-NEXT: v_mov_b32_e32 v1, s1
1591 ; VI-NEXT: flat_store_dword v[0:1], v2
1594 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1596 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1598 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
1599 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1602 ; EG-NEXT: Fetch clause starting at 6:
1603 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1604 ; EG-NEXT: ALU clause starting at 8:
1605 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1606 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1607 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1608 ; EG-NEXT: ALU clause starting at 11:
1609 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1610 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
1611 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1612 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1614 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1615 ; GFX9-GISEL: ; %bb.0:
1616 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1617 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1618 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1619 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7]
1620 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1621 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1622 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1623 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
1624 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1625 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
1626 ; GFX9-GISEL-NEXT: s_endpgm
1627 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1628 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1629 %val = load i32, ptr addrspace(1) %in.gep
1630 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; -1 is the false operand here because the predicate is "ne".
1631 %cmp = icmp ne i32 %val, 0
1632 %sel = select i1 %cmp, i32 %ctlz, i32 -1
1633 store i32 %sel, ptr addrspace(1) %out
; Per-workitem i8 load from %valptr, then ctlz with the zero-is-undef flag set
; (i1 true). The select substitutes -1 when the loaded byte equals 0, and the
; i8 result is stored to %out.
; In the checks below, SI and VI go straight from v_ffbh_u32 to the byte store
; with no compare/select in between, while GFX9-GISEL shifts left by 24 and
; keeps a v_cmp_eq_u32 + v_cndmask pair.
1637 define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1638 ; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1640 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1641 ; SI-NEXT: s_mov_b32 s7, 0xf000
1642 ; SI-NEXT: v_mov_b32_e32 v1, 0
1643 ; SI-NEXT: s_mov_b32 s10, 0
1644 ; SI-NEXT: s_mov_b32 s11, s7
1645 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1646 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1647 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1648 ; SI-NEXT: s_mov_b32 s6, -1
1649 ; SI-NEXT: s_mov_b32 s4, s0
1650 ; SI-NEXT: s_mov_b32 s5, s1
1651 ; SI-NEXT: s_waitcnt vmcnt(0)
1652 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1653 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1656 ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1658 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1659 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1660 ; VI-NEXT: v_mov_b32_e32 v1, s3
1661 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1662 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1663 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1664 ; VI-NEXT: s_waitcnt vmcnt(0)
1665 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1666 ; VI-NEXT: v_mov_b32_e32 v0, s0
1667 ; VI-NEXT: v_mov_b32_e32 v1, s1
1668 ; VI-NEXT: flat_store_byte v[0:1], v2
1671 ; EG-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1673 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1675 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1676 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1679 ; EG-NEXT: Fetch clause starting at 6:
1680 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1681 ; EG-NEXT: ALU clause starting at 8:
1682 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1683 ; EG-NEXT: ALU clause starting at 9:
1684 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
1685 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1686 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1687 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1688 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1689 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1690 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1691 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1692 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1693 ; EG-NEXT: MOV T0.Y, 0.0,
1694 ; EG-NEXT: MOV * T0.Z, 0.0,
1695 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1696 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1698 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1699 ; GFX9-GISEL: ; %bb.0:
1700 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1701 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
1702 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1703 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6
1704 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7
1705 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
1706 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
1707 ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
1708 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1709 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1710 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v0
1711 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
1712 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[0:1], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
1713 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[0:1]
1714 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5]
1715 ; GFX9-GISEL-NEXT: s_endpgm
1716 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1717 %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
1718 %val = load i8, ptr addrspace(1) %valptr.gep
1719 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
1720 %cmp = icmp eq i8 %val, 0
1721 %sel = select i1 %cmp, i8 -1, i8 %ctlz
1722 store i8 %sel, ptr addrspace(1) %out
; ctlz (zero-is-undef) of an i32 loaded per workitem, with -1 selected when the
; value equals 0. %cmp has a second use here: after the volatile store of the
; select result to %out, the i1 compare result itself is stored (volatile) to
; an undef pointer.
; Every check prefix below still materializes the compare for that second
; store (v_cmp_eq_u32 + v_cndmask 0/1 on SI, VI and GFX9-GISEL; SETE_INT on EG).
1726 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1727 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1729 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1730 ; SI-NEXT: s_mov_b32 s7, 0xf000
1731 ; SI-NEXT: s_mov_b32 s10, 0
1732 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1733 ; SI-NEXT: v_mov_b32_e32 v1, 0
1734 ; SI-NEXT: s_mov_b32 s11, s7
1735 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1736 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1737 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1738 ; SI-NEXT: s_mov_b32 s6, -1
1739 ; SI-NEXT: s_mov_b32 s4, s0
1740 ; SI-NEXT: s_mov_b32 s5, s1
1741 ; SI-NEXT: s_waitcnt vmcnt(0)
1742 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
1743 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1744 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1745 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
1746 ; SI-NEXT: s_waitcnt vmcnt(0)
1747 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1748 ; SI-NEXT: s_waitcnt vmcnt(0)
1751 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1753 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1754 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1755 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1756 ; VI-NEXT: v_mov_b32_e32 v1, s3
1757 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1758 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1759 ; VI-NEXT: flat_load_dword v2, v[0:1]
1760 ; VI-NEXT: v_mov_b32_e32 v0, s0
1761 ; VI-NEXT: v_mov_b32_e32 v1, s1
1762 ; VI-NEXT: s_waitcnt vmcnt(0)
1763 ; VI-NEXT: v_ffbh_u32_e32 v3, v2
1764 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
1765 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1766 ; VI-NEXT: flat_store_dword v[0:1], v3
1767 ; VI-NEXT: s_waitcnt vmcnt(0)
1768 ; VI-NEXT: flat_store_byte v[0:1], v2
1769 ; VI-NEXT: s_waitcnt vmcnt(0)
1772 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1774 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1776 ; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[]
1777 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
1778 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T2.X
1780 ; EG-NEXT: Fetch clause starting at 6:
1781 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1782 ; EG-NEXT: ALU clause starting at 8:
1783 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1784 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1785 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1786 ; EG-NEXT: ALU clause starting at 11:
1787 ; EG-NEXT: SETE_INT * T0.W, T0.X, 0.0,
1788 ; EG-NEXT: AND_INT T1.X, PV.W, 1,
1789 ; EG-NEXT: MOV * T1.W, literal.x,
1790 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1791 ; EG-NEXT: MOV T1.Y, 0.0,
1792 ; EG-NEXT: MOV * T1.Z, 0.0,
1793 ; EG-NEXT: MOV T2.X, literal.x,
1794 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1795 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
1796 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
1797 ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.y,
1798 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1800 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1801 ; GFX9-GISEL: ; %bb.0:
1802 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1803 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1804 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1805 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1806 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7]
1807 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1808 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0
1809 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1810 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
1811 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1812 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
1813 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1814 ; GFX9-GISEL-NEXT: global_store_byte v[0:1], v2, off
1815 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1816 ; GFX9-GISEL-NEXT: s_endpgm
1817 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1818 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1819 %val = load i32, ptr addrspace(1) %in.gep
1820 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
1821 %cmp = icmp eq i32 %val, 0
1822 %sel = select i1 %cmp, i32 -1, i32 %ctlz
1823 store volatile i32 %sel, ptr addrspace(1) %out
1824 store volatile i1 %cmp, ptr addrspace(1) undef
1828 ; Selected on wrong constant: the select yields 0, not -1, when %val == 0.
; All four check prefixes below keep a conditional select after the ffbh
; (v_cndmask on SI, VI and GFX9-GISEL; CNDE_INT on EG).
1829 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1830 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1832 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1833 ; SI-NEXT: s_mov_b32 s7, 0xf000
1834 ; SI-NEXT: s_mov_b32 s10, 0
1835 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1836 ; SI-NEXT: v_mov_b32_e32 v1, 0
1837 ; SI-NEXT: s_mov_b32 s11, s7
1838 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1839 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1840 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1841 ; SI-NEXT: s_mov_b32 s6, -1
1842 ; SI-NEXT: s_mov_b32 s4, s0
1843 ; SI-NEXT: s_mov_b32 s5, s1
1844 ; SI-NEXT: s_waitcnt vmcnt(0)
1845 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
1846 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1847 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1848 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1851 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1853 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1854 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1855 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1856 ; VI-NEXT: v_mov_b32_e32 v1, s3
1857 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1858 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1859 ; VI-NEXT: flat_load_dword v0, v[0:1]
1860 ; VI-NEXT: s_waitcnt vmcnt(0)
1861 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
1862 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1863 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
1864 ; VI-NEXT: v_mov_b32_e32 v0, s0
1865 ; VI-NEXT: v_mov_b32_e32 v1, s1
1866 ; VI-NEXT: flat_store_dword v[0:1], v2
1869 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1871 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1873 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
1874 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1877 ; EG-NEXT: Fetch clause starting at 6:
1878 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1879 ; EG-NEXT: ALU clause starting at 8:
1880 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1881 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1882 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1883 ; EG-NEXT: ALU clause starting at 11:
1884 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1885 ; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W,
1886 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1887 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1889 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1890 ; GFX9-GISEL: ; %bb.0:
1891 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1892 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1893 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1894 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7]
1895 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1896 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1897 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1898 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
1899 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1900 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
1901 ; GFX9-GISEL-NEXT: s_endpgm
1902 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1903 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1904 %val = load i32, ptr addrspace(1) %in.gep
1905 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
1906 %cmp = icmp eq i32 %val, 0
1907 %sel = select i1 %cmp, i32 0, i32 %ctlz
1908 store i32 %sel, ptr addrspace(1) %out
1912 ; Selected on wrong constant: with the icmp ne form, the select's fallback arm
; (taken when %val == 0) is 0 rather than -1.
; All four check prefixes below keep a conditional select after the ffbh
; (v_cndmask on SI, VI and GFX9-GISEL; CNDE_INT on EG).
1913 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1914 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1916 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1917 ; SI-NEXT: s_mov_b32 s7, 0xf000
1918 ; SI-NEXT: s_mov_b32 s10, 0
1919 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1920 ; SI-NEXT: v_mov_b32_e32 v1, 0
1921 ; SI-NEXT: s_mov_b32 s11, s7
1922 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1923 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1924 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1925 ; SI-NEXT: s_mov_b32 s6, -1
1926 ; SI-NEXT: s_mov_b32 s4, s0
1927 ; SI-NEXT: s_mov_b32 s5, s1
1928 ; SI-NEXT: s_waitcnt vmcnt(0)
1929 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
1930 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1931 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1932 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1935 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1937 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1938 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1939 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1940 ; VI-NEXT: v_mov_b32_e32 v1, s3
1941 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1942 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1943 ; VI-NEXT: flat_load_dword v0, v[0:1]
1944 ; VI-NEXT: s_waitcnt vmcnt(0)
1945 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
1946 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1947 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
1948 ; VI-NEXT: v_mov_b32_e32 v0, s0
1949 ; VI-NEXT: v_mov_b32_e32 v1, s1
1950 ; VI-NEXT: flat_store_dword v[0:1], v2
1953 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1955 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1957 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
1958 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1961 ; EG-NEXT: Fetch clause starting at 6:
1962 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1963 ; EG-NEXT: ALU clause starting at 8:
1964 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1965 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1966 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1967 ; EG-NEXT: ALU clause starting at 11:
1968 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1969 ; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W,
1970 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1971 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1973 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1974 ; GFX9-GISEL: ; %bb.0:
1975 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1976 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1977 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1978 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7]
1979 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1980 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1981 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1982 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1983 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1984 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
1985 ; GFX9-GISEL-NEXT: s_endpgm
1986 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1987 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1988 %val = load i32, ptr addrspace(1) %in.gep
1989 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
1990 %cmp = icmp ne i32 %val, 0
1991 %sel = select i1 %cmp, i32 %ctlz, i32 0
1992 store i32 %sel, ptr addrspace(1) %out
1996 ; Compare on wrong constant: the icmp eq tests %val against 1 rather than 0
; (the select's constant arm is 0).
; All four check prefixes below keep an explicit compare against 1 followed by
; a conditional select of the ffbh result.
1997 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1998 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2000 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2001 ; SI-NEXT: s_mov_b32 s7, 0xf000
2002 ; SI-NEXT: s_mov_b32 s10, 0
2003 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2004 ; SI-NEXT: v_mov_b32_e32 v1, 0
2005 ; SI-NEXT: s_mov_b32 s11, s7
2006 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2007 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2008 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2009 ; SI-NEXT: s_mov_b32 s6, -1
2010 ; SI-NEXT: s_mov_b32 s4, s0
2011 ; SI-NEXT: s_mov_b32 s5, s1
2012 ; SI-NEXT: s_waitcnt vmcnt(0)
2013 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
2014 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
2015 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
2016 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2019 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2021 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2022 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2023 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2024 ; VI-NEXT: v_mov_b32_e32 v1, s3
2025 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2026 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2027 ; VI-NEXT: flat_load_dword v0, v[0:1]
2028 ; VI-NEXT: s_waitcnt vmcnt(0)
2029 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
2030 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
2031 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
2032 ; VI-NEXT: v_mov_b32_e32 v0, s0
2033 ; VI-NEXT: v_mov_b32_e32 v1, s1
2034 ; VI-NEXT: flat_store_dword v[0:1], v2
2037 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2039 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
2041 ; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
2042 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2045 ; EG-NEXT: Fetch clause starting at 6:
2046 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
2047 ; EG-NEXT: ALU clause starting at 8:
2048 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
2049 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2050 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
2051 ; EG-NEXT: ALU clause starting at 11:
2052 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
2053 ; EG-NEXT: SETE_INT * T1.W, T0.X, 1,
2054 ; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
2055 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2056 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2058 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2059 ; GFX9-GISEL: ; %bb.0:
2060 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2061 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2062 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
2063 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7]
2064 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
2065 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
2066 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
2067 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
2068 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
2069 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
2070 ; GFX9-GISEL-NEXT: s_endpgm
2071 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2072 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
2073 %val = load i32, ptr addrspace(1) %in.gep
2074 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
2075 %cmp = icmp eq i32 %val, 1
2076 %sel = select i1 %cmp, i32 0, i32 %ctlz
2077 store i32 %sel, ptr addrspace(1) %out
2081 ; Compare on wrong constant: the icmp ne tests %val against 1 rather than 0
; (the select's constant arm is 0).
; All four check prefixes below keep an explicit compare against 1 followed by
; a conditional select of the ffbh result.
2082 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
2083 ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2085 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2086 ; SI-NEXT: s_mov_b32 s7, 0xf000
2087 ; SI-NEXT: s_mov_b32 s10, 0
2088 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2089 ; SI-NEXT: v_mov_b32_e32 v1, 0
2090 ; SI-NEXT: s_mov_b32 s11, s7
2091 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2092 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
2093 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2094 ; SI-NEXT: s_mov_b32 s6, -1
2095 ; SI-NEXT: s_mov_b32 s4, s0
2096 ; SI-NEXT: s_mov_b32 s5, s1
2097 ; SI-NEXT: s_waitcnt vmcnt(0)
2098 ; SI-NEXT: v_ffbh_u32_e32 v1, v0
2099 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
2100 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
2101 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
2104 ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2106 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2107 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2108 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2109 ; VI-NEXT: v_mov_b32_e32 v1, s3
2110 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2111 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2112 ; VI-NEXT: flat_load_dword v0, v[0:1]
2113 ; VI-NEXT: s_waitcnt vmcnt(0)
2114 ; VI-NEXT: v_ffbh_u32_e32 v1, v0
2115 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
2116 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
2117 ; VI-NEXT: v_mov_b32_e32 v0, s0
2118 ; VI-NEXT: v_mov_b32_e32 v1, s1
2119 ; VI-NEXT: flat_store_dword v[0:1], v2
2122 ; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2124 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
2126 ; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
2127 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2130 ; EG-NEXT: Fetch clause starting at 6:
2131 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
2132 ; EG-NEXT: ALU clause starting at 8:
2133 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
2134 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2135 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
2136 ; EG-NEXT: ALU clause starting at 11:
2137 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
2138 ; EG-NEXT: SETNE_INT * T1.W, T0.X, 1,
2139 ; EG-NEXT: CNDE_INT T0.X, PS, 0.0, PV.W,
2140 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2141 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2143 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2144 ; GFX9-GISEL: ; %bb.0:
2145 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2146 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2147 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
2148 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7]
2149 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
2150 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
2151 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
2152 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
2153 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
2154 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
2155 ; GFX9-GISEL-NEXT: s_endpgm
2156 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2157 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
2158 %val = load i32, ptr addrspace(1) %in.gep
2159 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
2160 %cmp = icmp ne i32 %val, 1
2161 %sel = select i1 %cmp, i32 %ctlz, i32 0
2162 store i32 %sel, ptr addrspace(1) %out
; Non-kernel function: ctlz with the zero-is-undef flag on an odd-width i7
; argument. The SI, VI and GFX9-GISEL checks below all shift the input left by
; 25 before v_ffbh_u32.
2166 define i7 @v_ctlz_zero_undef_i7(i7 %val) {
2167 ; SI-LABEL: v_ctlz_zero_undef_i7:
2169 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2170 ; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2171 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2172 ; SI-NEXT: s_setpc_b64 s[30:31]
2174 ; VI-LABEL: v_ctlz_zero_undef_i7:
2176 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2177 ; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2178 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2179 ; VI-NEXT: s_setpc_b64 s[30:31]
2181 ; EG-LABEL: v_ctlz_zero_undef_i7:
2186 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7:
2187 ; GFX9-GISEL: ; %bb.0:
2188 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2189 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2190 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2191 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2192 %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true)
; Kernel taking an i18 argument directly: ctlz with the zero-is-undef flag,
; result stored as i18 to %out with align 4.
; In the checks below the input is shifted left by 14 before the
; find-first-bit instruction, and the i18 store is emitted as a 16-bit store
; plus a byte store at offset 2 on SI, VI and GFX9-GISEL (two MEM_RAT MSKOR
; writes on EG).
2196 define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind {
2197 ; SI-LABEL: s_ctlz_zero_undef_i18:
2199 ; SI-NEXT: s_load_dword s4, s[2:3], 0xb
2200 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
2201 ; SI-NEXT: s_mov_b32 s3, 0xf000
2202 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2203 ; SI-NEXT: s_lshl_b32 s2, s4, 14
2204 ; SI-NEXT: s_flbit_i32_b32 s4, s2
2205 ; SI-NEXT: s_mov_b32 s2, -1
2206 ; SI-NEXT: v_mov_b32_e32 v0, s4
2207 ; SI-NEXT: s_bfe_u32 s4, s4, 0x20010
2208 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
2209 ; SI-NEXT: s_waitcnt expcnt(0)
2210 ; SI-NEXT: v_mov_b32_e32 v0, s4
2211 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
2214 ; VI-LABEL: s_ctlz_zero_undef_i18:
2216 ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
2217 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2218 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2219 ; VI-NEXT: s_lshl_b32 s2, s4, 14
2220 ; VI-NEXT: v_mov_b32_e32 v0, s0
2221 ; VI-NEXT: s_flbit_i32_b32 s2, s2
2222 ; VI-NEXT: v_mov_b32_e32 v1, s1
2223 ; VI-NEXT: s_add_u32 s0, s0, 2
2224 ; VI-NEXT: v_mov_b32_e32 v2, s2
2225 ; VI-NEXT: s_addc_u32 s1, s1, 0
2226 ; VI-NEXT: flat_store_short v[0:1], v2
2227 ; VI-NEXT: s_bfe_u32 s2, s2, 0x20010
2228 ; VI-NEXT: v_mov_b32_e32 v0, s0
2229 ; VI-NEXT: v_mov_b32_e32 v1, s1
2230 ; VI-NEXT: v_mov_b32_e32 v2, s2
2231 ; VI-NEXT: flat_store_byte v[0:1], v2
2234 ; EG-LABEL: s_ctlz_zero_undef_i18:
2236 ; EG-NEXT: ALU 28, @4, KC0[CB0:0-32], KC1[]
2237 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X
2238 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T2.X
2240 ; EG-NEXT: ALU clause starting at 4:
2241 ; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
2242 ; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00)
2243 ; EG-NEXT: FFBH_UINT T0.W, PV.W,
2244 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
2245 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2246 ; EG-NEXT: AND_INT T2.W, PV.W, literal.x,
2247 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
2248 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
2249 ; EG-NEXT: LSHL T1.X, PV.W, PS,
2250 ; EG-NEXT: LSHL * T1.W, literal.x, PS,
2251 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2252 ; EG-NEXT: MOV T1.Y, 0.0,
2253 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
2254 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2255 ; EG-NEXT: AND_INT T3.W, PV.W, literal.x,
2256 ; EG-NEXT: MOV * T4.W, literal.y,
2257 ; EG-NEXT: 3(4.203895e-45), 2(2.802597e-45)
2258 ; EG-NEXT: BFE_UINT T0.W, T0.W, literal.x, PS,
2259 ; EG-NEXT: LSHL * T3.W, PV.W, literal.y,
2260 ; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
2261 ; EG-NEXT: LSHL T0.X, PV.W, PS,
2262 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
2263 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2264 ; EG-NEXT: MOV T0.Y, 0.0,
2265 ; EG-NEXT: MOV T1.Z, 0.0,
2266 ; EG-NEXT: MOV * T0.Z, 0.0,
2267 ; EG-NEXT: LSHR T2.X, T2.W, literal.x,
2268 ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
2269 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2271 ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18:
2272 ; GFX9-GISEL: ; %bb.0:
2273 ; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c
2274 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2275 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
2276 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
2277 ; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 14
2278 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2
2279 ; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0x3ffff
2280 ; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16
2281 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
2282 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
2283 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
2284 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] offset:2
2285 ; GFX9-GISEL-NEXT: s_endpgm
2286 %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone
2287 store i18 %ctlz, ptr addrspace(1) %out, align 4
; Non-kernel function: ctlz with the zero-is-undef flag on an i18 argument.
; The SI, VI and GFX9-GISEL checks below all shift the input left by 14 before
; v_ffbh_u32.
2291 define i18 @v_ctlz_zero_undef_i18(i18 %val) {
2292 ; SI-LABEL: v_ctlz_zero_undef_i18:
2294 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2295 ; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2296 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2297 ; SI-NEXT: s_setpc_b64 s[30:31]
2299 ; VI-LABEL: v_ctlz_zero_undef_i18:
2301 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2302 ; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2303 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2304 ; VI-NEXT: s_setpc_b64 s[30:31]
2306 ; EG-LABEL: v_ctlz_zero_undef_i18:
2311 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18:
2312 ; GFX9-GISEL: ; %bb.0:
2313 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2314 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2315 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2316 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2317 %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true)
; Non-kernel function: ctlz with the zero-is-undef flag on a <2 x i18> vector.
; The SI, VI and GFX9-GISEL checks below handle the two elements in v0 and v1
; independently, each with a left shift by 14 followed by v_ffbh_u32.
2321 define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) {
2322 ; SI-LABEL: v_ctlz_zero_undef_v2i18:
2324 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2325 ; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2326 ; SI-NEXT: v_lshlrev_b32_e32 v1, 14, v1
2327 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2328 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2329 ; SI-NEXT: s_setpc_b64 s[30:31]
2331 ; VI-LABEL: v_ctlz_zero_undef_v2i18:
2333 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2334 ; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2335 ; VI-NEXT: v_lshlrev_b32_e32 v1, 14, v1
2336 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2337 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
2338 ; VI-NEXT: s_setpc_b64 s[30:31]
2340 ; EG-LABEL: v_ctlz_zero_undef_v2i18:
2345 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18:
2346 ; GFX9-GISEL: ; %bb.0:
2347 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2348 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0
2349 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 14, v1
2350 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2351 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2352 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2353 %ctlz = call <2 x i18> @llvm.ctlz.v2i18(<2 x i18> %val, i1 true)
; Non-kernel function: ctlz with the zero-is-undef flag on a <2 x i16> vector.
; In the checks below each 16-bit element is moved into the upper half of a
; 32-bit register (shift left by 16, or an and with 0xffff0000 for the high
; element on VI) before v_ffbh_u32, and the two results are then combined into
; v0.
2357 define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
2358 ; SI-LABEL: v_ctlz_zero_undef_v2i16:
2360 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2361 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2362 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2363 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2364 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
2365 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2366 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
2367 ; SI-NEXT: s_setpc_b64 s[30:31]
2369 ; VI-LABEL: v_ctlz_zero_undef_v2i16:
2371 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2372 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
2373 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2374 ; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2375 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2376 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2377 ; VI-NEXT: s_setpc_b64 s[30:31]
2379 ; EG-LABEL: v_ctlz_zero_undef_v2i16:
2384 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16:
2385 ; GFX9-GISEL: ; %bb.0:
2386 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2387 ; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
2388 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2389 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2390 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2391 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2392 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
2393 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
2394 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2395 %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true)
; Non-kernel function: ctlz with the zero-is-undef flag on an odd-length
; <3 x i16> vector.
; In the checks below each 16-bit element is moved into the upper half of a
; 32-bit register (shift left by 16, or an and with 0xffff0000 on VI) before
; v_ffbh_u32.
2399 define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
2400 ; SI-LABEL: v_ctlz_zero_undef_v3i16:
2402 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2403 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2404 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2405 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2406 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2407 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2408 ; SI-NEXT: v_ffbh_u32_e32 v3, v2
2409 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2410 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
2411 ; SI-NEXT: v_or_b32_e32 v2, 0x200000, v3
2412 ; SI-NEXT: v_alignbit_b32 v1, v3, v0, 16
2413 ; SI-NEXT: s_setpc_b64 s[30:31]
2415 ; VI-LABEL: v_ctlz_zero_undef_v3i16:
2417 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2418 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
2419 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2420 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2421 ; VI-NEXT: v_ffbh_u32_e32 v2, v2
2422 ; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2423 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
2424 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2425 ; VI-NEXT: s_setpc_b64 s[30:31]
2427 ; EG-LABEL: v_ctlz_zero_undef_v3i16:
2432 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16:
2433 ; GFX9-GISEL: ; %bb.0:
2434 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2435 ; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2436 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2437 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2438 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2439 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
2440 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2441 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
2442 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2443 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
2444 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2445 %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true)
; Non-kernel function: ctlz with the zero-is-undef flag on a <4 x i16> vector.
; In the checks below each 16-bit element is moved into the upper half of a
; 32-bit register (shift left by 16, or an and with 0xffff0000 on VI) before
; v_ffbh_u32, and the per-element results are then combined pairwise.
2449 define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
2450 ; SI-LABEL: v_ctlz_zero_undef_v4i16:
2452 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2453 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2454 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2455 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2456 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2457 ; SI-NEXT: v_ffbh_u32_e32 v3, v3
2458 ; SI-NEXT: v_ffbh_u32_e32 v2, v2
2459 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2460 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2461 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2462 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2463 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
2464 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
2465 ; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
2466 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2467 ; SI-NEXT: s_setpc_b64 s[30:31]
2469 ; VI-LABEL: v_ctlz_zero_undef_v4i16:
2471 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2472 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
2473 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
2474 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0
2475 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2476 ; VI-NEXT: v_ffbh_u32_e32 v2, v2
2477 ; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2478 ; VI-NEXT: v_ffbh_u32_e32 v3, v3
2479 ; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2480 ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2481 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2482 ; VI-NEXT: s_setpc_b64 s[30:31]
2484 ; EG-LABEL: v_ctlz_zero_undef_v4i16:
2489 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16:
2490 ; GFX9-GISEL: ; %bb.0:
2491 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2492 ; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2493 ; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2494 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2495 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2496 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2497 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2498 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2499 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2500 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
2501 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3
2502 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
2503 ; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
2504 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
2505 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v1
2506 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2507 %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true)
; Non-kernel test of llvm.ctlz.v2i8 on a <2 x i8> argument with the i1 flag set
; to true (the "zero_undef" variant in the test name; per the LLVM LangRef this
; makes the result for a zero input poison).
;
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; note at the top of the file); regenerate them rather than editing by hand.
;
; Expected code, per check prefix:
;  - SI checks (amdgcn, no -mcpu): v0 and v1 are shifted left by 24 and passed
;    to the 32-bit v_ffbh_u32; the v1 count is additionally shifted left by 8
;    into v2 and or'ed into v0.
;  - VI checks (tonga): the same shape, except that the shift by 8 is the
;    16-bit v_lshlrev_b16 and v1 is finally masked with 0xff.
;  - EG checks (r600, cypress): a label check for this function.
;  - GFX9-GISEL checks (gfx900, GlobalISel): just the two shift-by-24 /
;    v_ffbh_u32 pairs, with no extra combining instructions.
2511 define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
2512 ; SI-LABEL: v_ctlz_zero_undef_v2i8:
2514 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2515 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
2516 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
2517 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2518 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
2519 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2520 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
2521 ; SI-NEXT: s_setpc_b64 s[30:31]
2523 ; VI-LABEL: v_ctlz_zero_undef_v2i8:
2525 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2526 ; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
2527 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
2528 ; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
2529 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1
2530 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2531 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
2532 ; VI-NEXT: v_and_b32_e32 v1, 0xff, v1
2533 ; VI-NEXT: s_setpc_b64 s[30:31]
2535 ; EG-LABEL: v_ctlz_zero_undef_v2i8:
2540 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8:
2541 ; GFX9-GISEL: ; %bb.0:
2542 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2543 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
2544 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v1
2545 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2546 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2547 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2548 %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true)
; Non-kernel test of llvm.ctlz.v2i7 on a <2 x i7> argument (an element width
; that is not a power of two) with the i1 flag set to true (the "zero_undef"
; variant in the test name; per the LLVM LangRef this makes the result for a
; zero input poison).
;
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; note at the top of the file); regenerate them rather than editing by hand.
;
; Expected code: the SI, VI and GFX9-GISEL checks all show the same sequence --
; v0 and v1 are shifted left by 25 (32 - 7) and then passed to the 32-bit
; v_ffbh_u32, with no further combining instructions before the return.
; The EG checks (r600, cypress) have a label check for this function.
2552 define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) {
2553 ; SI-LABEL: v_ctlz_zero_undef_v2i7:
2555 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2556 ; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2557 ; SI-NEXT: v_lshlrev_b32_e32 v1, 25, v1
2558 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
2559 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
2560 ; SI-NEXT: s_setpc_b64 s[30:31]
2562 ; VI-LABEL: v_ctlz_zero_undef_v2i7:
2564 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2565 ; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2566 ; VI-NEXT: v_lshlrev_b32_e32 v1, 25, v1
2567 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
2568 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
2569 ; VI-NEXT: s_setpc_b64 s[30:31]
2571 ; EG-LABEL: v_ctlz_zero_undef_v2i7:
2576 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7:
2577 ; GFX9-GISEL: ; %bb.0:
2578 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2579 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0
2580 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 25, v1
2581 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
2582 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
2583 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
2584 %ctlz = call <2 x i7> @llvm.ctlz.v2i7(<2 x i7> %val, i1 true)