1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
3 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
4 ; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
5 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
6 ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
7 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX11
; Declarations of the intrinsics called by the kernels in this test: the
; llvm.ctlz family at several scalar and vector widths, plus the AMDGPU
; work-item id query that the per-work-item kernels use to index their loads.
9 declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
10 declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
11 declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
13 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
14 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
15 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
17 declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
18 declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
19 declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
21 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Count the leading zeros of a 32-bit kernel argument and store the result to
; %out. The ctlz call passes i1 false as its second operand, so a zero input
; has a defined result and the expected code clamps the find-first-bit result
; to 32. The amdgcn runs pair s_flbit_i32_b32 (s_clz_i32_u32 on gfx1100) with
; s_min_u32 against 32. The r600 run pairs FFBH_UINT with a CNDE_INT that
; takes the literal 32 as an operand.
23 define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
24 ; SI-LABEL: s_ctlz_i32:
26 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
27 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
28 ; SI-NEXT: s_mov_b32 s3, 0xf000
29 ; SI-NEXT: s_waitcnt lgkmcnt(0)
30 ; SI-NEXT: s_flbit_i32_b32 s2, s2
31 ; SI-NEXT: s_min_u32 s4, s2, 32
32 ; SI-NEXT: s_mov_b32 s2, -1
33 ; SI-NEXT: v_mov_b32_e32 v0, s4
34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
37 ; VI-LABEL: s_ctlz_i32:
39 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
40 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
41 ; VI-NEXT: s_mov_b32 s3, 0xf000
42 ; VI-NEXT: s_mov_b32 s2, -1
43 ; VI-NEXT: s_waitcnt lgkmcnt(0)
44 ; VI-NEXT: s_flbit_i32_b32 s4, s4
45 ; VI-NEXT: s_min_u32 s4, s4, 32
46 ; VI-NEXT: v_mov_b32_e32 v0, s4
47 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
50 ; EG-LABEL: s_ctlz_i32:
52 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
53 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
56 ; EG-NEXT: ALU clause starting at 4:
57 ; EG-NEXT: FFBH_UINT * T0.W, KC0[2].Z,
58 ; EG-NEXT: CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
59 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
60 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
62 ; GFX10-LABEL: s_ctlz_i32:
64 ; GFX10-NEXT: s_clause 0x1
65 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
66 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
67 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
68 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX10-NEXT: s_flbit_i32_b32 s0, s4
70 ; GFX10-NEXT: s_min_u32 s0, s0, 32
71 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
72 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
73 ; GFX10-NEXT: s_endpgm
75 ; GFX10-GISEL-LABEL: s_ctlz_i32:
76 ; GFX10-GISEL: ; %bb.0:
77 ; GFX10-GISEL-NEXT: s_clause 0x1
78 ; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
79 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
80 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
81 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX10-GISEL-NEXT: s_flbit_i32_b32 s0, s4
83 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32
84 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
85 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
86 ; GFX10-GISEL-NEXT: s_endpgm
88 ; GFX11-LABEL: s_ctlz_i32:
90 ; GFX11-NEXT: s_clause 0x1
91 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
92 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
93 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
94 ; GFX11-NEXT: s_clz_i32_u32 s2, s2
95 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
96 ; GFX11-NEXT: s_min_u32 s2, s2, 32
97 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
98 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
100 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
101 ; GFX11-NEXT: s_endpgm
; The i1 false operand means a zero %val must yield 32 rather than poison,
; which is why every run above clamps or selects against 32.
102 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
103 store i32 %ctlz, ptr addrspace(1) %out, align 4
; Per-work-item variant of the 32-bit test. Each work item loads one i32 from
; %valptr indexed by workitem.id.x, takes ctlz with a defined result for zero
; (i1 false), and stores to %out; the store address is not indexed by the
; work-item id. The amdgcn runs expect the vector ALU form, v_ffbh_u32
; (v_clz_i32_u32 on gfx1100) followed by v_min_u32 with 32. The r600 run
; expects FFBH_UINT plus a CNDE_INT with the literal 32.
107 define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
108 ; SI-LABEL: v_ctlz_i32:
110 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
111 ; SI-NEXT: s_mov_b32 s7, 0xf000
112 ; SI-NEXT: s_mov_b32 s10, 0
113 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
114 ; SI-NEXT: v_mov_b32_e32 v1, 0
115 ; SI-NEXT: s_mov_b32 s11, s7
116 ; SI-NEXT: s_waitcnt lgkmcnt(0)
117 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
118 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
119 ; SI-NEXT: s_mov_b32 s6, -1
120 ; SI-NEXT: s_mov_b32 s4, s0
121 ; SI-NEXT: s_mov_b32 s5, s1
122 ; SI-NEXT: s_waitcnt vmcnt(0)
123 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
124 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
125 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
128 ; VI-LABEL: v_ctlz_i32:
130 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
131 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
132 ; VI-NEXT: s_waitcnt lgkmcnt(0)
133 ; VI-NEXT: v_mov_b32_e32 v1, s3
134 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
135 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
136 ; VI-NEXT: flat_load_dword v0, v[0:1]
137 ; VI-NEXT: s_mov_b32 s3, 0xf000
138 ; VI-NEXT: s_mov_b32 s2, -1
139 ; VI-NEXT: s_waitcnt vmcnt(0)
140 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
141 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
142 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
145 ; EG-LABEL: v_ctlz_i32:
147 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
149 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
150 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
153 ; EG-NEXT: Fetch clause starting at 6:
154 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
155 ; EG-NEXT: ALU clause starting at 8:
156 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
157 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
158 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
159 ; EG-NEXT: ALU clause starting at 11:
160 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
161 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
162 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
163 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
165 ; GFX10-LABEL: v_ctlz_i32:
167 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
168 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
169 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
170 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
171 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
172 ; GFX10-NEXT: s_waitcnt vmcnt(0)
173 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
174 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
175 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
176 ; GFX10-NEXT: s_endpgm
178 ; GFX10-GISEL-LABEL: v_ctlz_i32:
179 ; GFX10-GISEL: ; %bb.0:
180 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
181 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
182 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
183 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
184 ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
185 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
186 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
187 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
188 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
189 ; GFX10-GISEL-NEXT: s_endpgm
191 ; GFX11-LABEL: v_ctlz_i32:
193 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
194 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
195 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
196 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
197 ; GFX11-NEXT: s_waitcnt vmcnt(0)
198 ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
199 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
200 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
201 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
202 ; GFX11-NEXT: s_nop 0
203 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
204 ; GFX11-NEXT: s_endpgm
; The input address depends on the work-item id, so the loaded value lives in
; a vector register and the vector ALU instructions above are expected.
205 %tid = call i32 @llvm.amdgcn.workitem.id.x()
206 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
207 %val = load i32, ptr addrspace(1) %in.gep, align 4
208 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
209 store i32 %ctlz, ptr addrspace(1) %out, align 4
; Two-element vector variant. Each work item loads a <2 x i32> from %valptr
; indexed by workitem.id.x, takes an element-wise ctlz with a defined result
; for zero (i1 false), and stores the vector to %out. The expected code holds
; one find-first-bit plus clamp-to-32 pair per element. The GlobalISel run
; lists the elements in ascending register order (v0 then v1); the other
; amdgcn runs list them in descending order.
213 define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
214 ; SI-LABEL: v_ctlz_v2i32:
216 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
217 ; SI-NEXT: s_mov_b32 s7, 0xf000
218 ; SI-NEXT: s_mov_b32 s10, 0
219 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
220 ; SI-NEXT: v_mov_b32_e32 v1, 0
221 ; SI-NEXT: s_mov_b32 s11, s7
222 ; SI-NEXT: s_waitcnt lgkmcnt(0)
223 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
224 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
225 ; SI-NEXT: s_mov_b32 s6, -1
226 ; SI-NEXT: s_mov_b32 s4, s0
227 ; SI-NEXT: s_mov_b32 s5, s1
228 ; SI-NEXT: s_waitcnt vmcnt(0)
229 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
230 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
231 ; SI-NEXT: v_min_u32_e32 v1, 32, v1
232 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
233 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
236 ; VI-LABEL: v_ctlz_v2i32:
238 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
239 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
240 ; VI-NEXT: s_waitcnt lgkmcnt(0)
241 ; VI-NEXT: v_mov_b32_e32 v1, s3
242 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
243 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
244 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
245 ; VI-NEXT: s_mov_b32 s3, 0xf000
246 ; VI-NEXT: s_mov_b32 s2, -1
247 ; VI-NEXT: s_waitcnt vmcnt(0)
248 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
249 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
250 ; VI-NEXT: v_min_u32_e32 v1, 32, v1
251 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
252 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
255 ; EG-LABEL: v_ctlz_v2i32:
257 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
259 ; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
260 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
263 ; EG-NEXT: Fetch clause starting at 6:
264 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
265 ; EG-NEXT: ALU clause starting at 8:
266 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
267 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
268 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
269 ; EG-NEXT: ALU clause starting at 11:
270 ; EG-NEXT: FFBH_UINT * T0.W, T0.Y,
271 ; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
272 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
273 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
274 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
275 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
276 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
278 ; GFX10-LABEL: v_ctlz_v2i32:
280 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
281 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
282 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
283 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
284 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
285 ; GFX10-NEXT: s_waitcnt vmcnt(0)
286 ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
287 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
288 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
289 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
290 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
291 ; GFX10-NEXT: s_endpgm
293 ; GFX10-GISEL-LABEL: v_ctlz_v2i32:
294 ; GFX10-GISEL: ; %bb.0:
295 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
296 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
297 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
298 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
299 ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
300 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
301 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
302 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
303 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
304 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
305 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
306 ; GFX10-GISEL-NEXT: s_endpgm
308 ; GFX11-LABEL: v_ctlz_v2i32:
310 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
311 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
312 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
313 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
314 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
315 ; GFX11-NEXT: s_waitcnt vmcnt(0)
316 ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
317 ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
318 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
319 ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
320 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
321 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
322 ; GFX11-NEXT: s_nop 0
323 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
324 ; GFX11-NEXT: s_endpgm
; One <2 x i32> is loaded per work item; the single vector intrinsic call
; corresponds to the two per-element instruction pairs checked above.
325 %tid = call i32 @llvm.amdgcn.workitem.id.x()
326 %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
327 %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
328 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
329 store <2 x i32> %ctlz, ptr addrspace(1) %out, align 8
; Four-element vector variant. Each work item loads a <4 x i32> from %valptr
; indexed by workitem.id.x, takes an element-wise ctlz with a defined result
; for zero (i1 false), and stores the vector to %out. The expected code holds
; four find-first-bit plus clamp-to-32 pairs, one per element, fed by a
; single 128-bit load and written back with a single 128-bit store.
333 define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
334 ; SI-LABEL: v_ctlz_v4i32:
336 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
337 ; SI-NEXT: s_mov_b32 s7, 0xf000
338 ; SI-NEXT: s_mov_b32 s10, 0
339 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
340 ; SI-NEXT: v_mov_b32_e32 v1, 0
341 ; SI-NEXT: s_mov_b32 s11, s7
342 ; SI-NEXT: s_waitcnt lgkmcnt(0)
343 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
344 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
345 ; SI-NEXT: s_mov_b32 s6, -1
346 ; SI-NEXT: s_mov_b32 s4, s0
347 ; SI-NEXT: s_mov_b32 s5, s1
348 ; SI-NEXT: s_waitcnt vmcnt(0)
349 ; SI-NEXT: v_ffbh_u32_e32 v3, v3
350 ; SI-NEXT: v_ffbh_u32_e32 v2, v2
351 ; SI-NEXT: v_ffbh_u32_e32 v1, v1
352 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
353 ; SI-NEXT: v_min_u32_e32 v3, 32, v3
354 ; SI-NEXT: v_min_u32_e32 v2, 32, v2
355 ; SI-NEXT: v_min_u32_e32 v1, 32, v1
356 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
357 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
360 ; VI-LABEL: v_ctlz_v4i32:
362 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
363 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
364 ; VI-NEXT: s_waitcnt lgkmcnt(0)
365 ; VI-NEXT: v_mov_b32_e32 v1, s3
366 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
367 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
368 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
369 ; VI-NEXT: s_mov_b32 s3, 0xf000
370 ; VI-NEXT: s_mov_b32 s2, -1
371 ; VI-NEXT: s_waitcnt vmcnt(0)
372 ; VI-NEXT: v_ffbh_u32_e32 v3, v3
373 ; VI-NEXT: v_ffbh_u32_e32 v2, v2
374 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
375 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
376 ; VI-NEXT: v_min_u32_e32 v3, 32, v3
377 ; VI-NEXT: v_min_u32_e32 v2, 32, v2
378 ; VI-NEXT: v_min_u32_e32 v1, 32, v1
379 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
380 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
383 ; EG-LABEL: v_ctlz_v4i32:
385 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
387 ; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
388 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
391 ; EG-NEXT: Fetch clause starting at 6:
392 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
393 ; EG-NEXT: ALU clause starting at 8:
394 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
395 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
396 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
397 ; EG-NEXT: ALU clause starting at 11:
398 ; EG-NEXT: FFBH_UINT * T1.W, T0.W,
399 ; EG-NEXT: FFBH_UINT T2.W, T0.Z,
400 ; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
401 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
402 ; EG-NEXT: CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
403 ; EG-NEXT: FFBH_UINT * T1.W, T0.Y,
404 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
405 ; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
406 ; EG-NEXT: FFBH_UINT * T1.W, T0.X,
407 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
408 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
409 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
410 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
412 ; GFX10-LABEL: v_ctlz_v4i32:
414 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
415 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
416 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
417 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
418 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
419 ; GFX10-NEXT: s_waitcnt vmcnt(0)
420 ; GFX10-NEXT: v_ffbh_u32_e32 v3, v3
421 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2
422 ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
423 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
424 ; GFX10-NEXT: v_min_u32_e32 v3, 32, v3
425 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
426 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
427 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
428 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
429 ; GFX10-NEXT: s_endpgm
431 ; GFX10-GISEL-LABEL: v_ctlz_v4i32:
432 ; GFX10-GISEL: ; %bb.0:
433 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
434 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
435 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0
436 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
437 ; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
438 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
439 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
440 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
441 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
442 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v3, v3
443 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
444 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
445 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
446 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3
447 ; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
448 ; GFX10-GISEL-NEXT: s_endpgm
450 ; GFX11-LABEL: v_ctlz_v4i32:
452 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
453 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
454 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
455 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
456 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
457 ; GFX11-NEXT: s_waitcnt vmcnt(0)
458 ; GFX11-NEXT: v_clz_i32_u32_e32 v3, v3
459 ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2
460 ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
461 ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
462 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
463 ; GFX11-NEXT: v_min_u32_e32 v3, 32, v3
464 ; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
465 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
466 ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
467 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
468 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
469 ; GFX11-NEXT: s_nop 0
470 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
471 ; GFX11-NEXT: s_endpgm
; One <4 x i32> is loaded per work item; the single vector intrinsic call
; corresponds to the four per-element instruction pairs checked above.
472 %tid = call i32 @llvm.amdgcn.workitem.id.x()
473 %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
474 %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
475 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
476 store <4 x i32> %ctlz, ptr addrspace(1) %out, align 16
; 8-bit variant. Loads a single i8 directly from %valptr (no work-item
; indexing), takes ctlz with a defined result for zero (i1 false), and stores
; the i8 result to %out. The expected code does an unsigned byte load, counts
; leading zeros on the 32-bit register, clamps to 32, and then lowers the
; count by 24 to express it in 8-bit terms:
;   - the SI run and the GlobalISel run subtract 24 in one instruction;
;   - the VI, gfx1010 SelectionDAG and gfx1100 runs add -16 and then do a
;     16-bit add of -8;
;   - the r600 run adds the literal -24, masks with 255 and uses a MSKOR
;     store.
480 define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
481 ; SI-LABEL: v_ctlz_i8:
483 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
484 ; SI-NEXT: s_mov_b32 s7, 0xf000
485 ; SI-NEXT: s_mov_b32 s6, -1
486 ; SI-NEXT: s_mov_b32 s10, s6
487 ; SI-NEXT: s_mov_b32 s11, s7
488 ; SI-NEXT: s_waitcnt lgkmcnt(0)
489 ; SI-NEXT: s_mov_b32 s8, s2
490 ; SI-NEXT: s_mov_b32 s9, s3
491 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
492 ; SI-NEXT: s_mov_b32 s4, s0
493 ; SI-NEXT: s_mov_b32 s5, s1
494 ; SI-NEXT: s_waitcnt vmcnt(0)
495 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
496 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
497 ; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
498 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
501 ; VI-LABEL: v_ctlz_i8:
503 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
504 ; VI-NEXT: s_mov_b32 s7, 0xf000
505 ; VI-NEXT: s_mov_b32 s6, -1
506 ; VI-NEXT: s_mov_b32 s10, s6
507 ; VI-NEXT: s_mov_b32 s11, s7
508 ; VI-NEXT: s_waitcnt lgkmcnt(0)
509 ; VI-NEXT: s_mov_b32 s8, s2
510 ; VI-NEXT: s_mov_b32 s9, s3
511 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
512 ; VI-NEXT: s_mov_b32 s4, s0
513 ; VI-NEXT: s_mov_b32 s5, s1
514 ; VI-NEXT: s_waitcnt vmcnt(0)
515 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
516 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
517 ; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
518 ; VI-NEXT: v_add_u16_e32 v0, -8, v0
519 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
522 ; EG-LABEL: v_ctlz_i8:
524 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
526 ; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
527 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
530 ; EG-NEXT: Fetch clause starting at 6:
531 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
532 ; EG-NEXT: ALU clause starting at 8:
533 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
534 ; EG-NEXT: ALU clause starting at 9:
535 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
536 ; EG-NEXT: CNDE_INT T0.W, T0.X, literal.x, PV.W,
537 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
538 ; EG-NEXT: 32(4.484155e-44), 3(4.203895e-45)
539 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
540 ; EG-NEXT: -24(nan), 0(0.000000e+00)
541 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
542 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
543 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
544 ; EG-NEXT: LSHL T0.X, PV.W, PS,
545 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
546 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
547 ; EG-NEXT: MOV T0.Y, 0.0,
548 ; EG-NEXT: MOV * T0.Z, 0.0,
549 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
550 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
552 ; GFX10-LABEL: v_ctlz_i8:
554 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
555 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
556 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
557 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
558 ; GFX10-NEXT: s_waitcnt vmcnt(0)
559 ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
560 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
561 ; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1
562 ; GFX10-NEXT: v_add_nc_u16 v1, v1, -8
563 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
564 ; GFX10-NEXT: s_endpgm
566 ; GFX10-GISEL-LABEL: v_ctlz_i8:
567 ; GFX10-GISEL: ; %bb.0:
568 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
569 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
570 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
571 ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
572 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
573 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
574 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
575 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1
576 ; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
577 ; GFX10-GISEL-NEXT: s_endpgm
579 ; GFX11-LABEL: v_ctlz_i8:
581 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
582 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
583 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
584 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3]
585 ; GFX11-NEXT: s_waitcnt vmcnt(0)
586 ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
587 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
588 ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
589 ; GFX11-NEXT: v_add_nc_u32_e32 v1, -16, v1
590 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
591 ; GFX11-NEXT: v_add_nc_u16 v1, v1, -8
592 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
593 ; GFX11-NEXT: s_nop 0
594 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
595 ; GFX11-NEXT: s_endpgm
; Unlike the 32-bit per-work-item kernels, the load address here is %valptr
; itself, and neither the load nor the store states an alignment.
596 %val = load i8, ptr addrspace(1) %valptr
597 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
598 store i8 %ctlz, ptr addrspace(1) %out
; 64-bit variant on a kernel argument. Takes ctlz of %val with a defined
; result for zero (i1 false) and stores the i64 result to %out. An unnamed
; [8 x i32] argument sits between %out and %val; the checks load %val from
; offset 0x13 (SI run) or 0x4c (other amdgcn runs).
; NOTE(review): the [8 x i32] argument is presumably padding to move %val to a
; later kernarg offset - confirm against the test's history.
;
; Expected SelectionDAG code works on the two 32-bit halves: find-first-bit on
; each half, add 32 to one count with saturation (min with 0xffffffdf followed
; by an add in the SI run, a clamped add in the others), then a three-way
; minimum of that sum, the other half's count and 64. The GlobalISel run
; instead uses the 64-bit s_flbit_i32_b64 followed by s_min_u32 with 64. In
; every amdgcn run the upper dword of the stored result is a zero constant.
602 define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
603 ; SI-LABEL: s_ctlz_i64:
605 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
606 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
607 ; SI-NEXT: s_mov_b32 s3, 0xf000
608 ; SI-NEXT: s_mov_b32 s2, -1
609 ; SI-NEXT: s_waitcnt lgkmcnt(0)
610 ; SI-NEXT: s_flbit_i32_b32 s4, s4
611 ; SI-NEXT: s_flbit_i32_b32 s5, s5
612 ; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf
613 ; SI-NEXT: v_mov_b32_e32 v0, s5
614 ; SI-NEXT: s_add_i32 s4, s4, 32
615 ; SI-NEXT: v_min3_u32 v0, s4, v0, 64
616 ; SI-NEXT: v_mov_b32_e32 v1, 0
617 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
620 ; VI-LABEL: s_ctlz_i64:
622 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
623 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
624 ; VI-NEXT: s_mov_b32 s3, 0xf000
625 ; VI-NEXT: s_mov_b32 s2, -1
626 ; VI-NEXT: v_mov_b32_e32 v1, 0
627 ; VI-NEXT: s_waitcnt lgkmcnt(0)
628 ; VI-NEXT: s_flbit_i32_b32 s4, s4
629 ; VI-NEXT: v_add_u32_e64 v0, s[6:7], s4, 32 clamp
630 ; VI-NEXT: s_flbit_i32_b32 s4, s5
631 ; VI-NEXT: v_min3_u32 v0, v0, s4, 64
632 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
635 ; EG-LABEL: s_ctlz_i64:
637 ; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
638 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
641 ; EG-NEXT: ALU clause starting at 4:
642 ; EG-NEXT: FFBH_UINT * T0.W, KC0[4].W,
643 ; EG-NEXT: CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W,
644 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
645 ; EG-NEXT: FFBH_UINT T1.W, KC0[5].X,
646 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
647 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
648 ; EG-NEXT: CNDE_INT T0.X, KC0[5].X, PS, PV.W,
649 ; EG-NEXT: MOV T0.Y, 0.0,
650 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
651 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
653 ; GFX10-LABEL: s_ctlz_i64:
655 ; GFX10-NEXT: s_clause 0x1
656 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
657 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
658 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
659 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
660 ; GFX10-NEXT: s_flbit_i32_b32 s0, s2
661 ; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp
662 ; GFX10-NEXT: s_flbit_i32_b32 s0, s3
663 ; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64
664 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
665 ; GFX10-NEXT: s_endpgm
667 ; GFX10-GISEL-LABEL: s_ctlz_i64:
668 ; GFX10-GISEL: ; %bb.0:
669 ; GFX10-GISEL-NEXT: s_clause 0x1
670 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
671 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
672 ; GFX10-GISEL-NEXT: s_mov_b32 s1, 0
673 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
674 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
675 ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3]
676 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64
677 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
678 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
679 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
680 ; GFX10-GISEL-NEXT: s_endpgm
682 ; GFX11-LABEL: s_ctlz_i64:
684 ; GFX11-NEXT: s_clause 0x1
685 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
686 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
687 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
688 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
689 ; GFX11-NEXT: s_clz_i32_u32 s2, s2
690 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
691 ; GFX11-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp
692 ; GFX11-NEXT: s_clz_i32_u32 s2, s3
693 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
694 ; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64
695 ; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1]
696 ; GFX11-NEXT: s_nop 0
697 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
698 ; GFX11-NEXT: s_endpgm
; i1 false: a zero %val must yield 64, which is the final clamp value in the
; amdgcn checks above.
699 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
700 store i64 %ctlz, ptr addrspace(1) %out
; 64-bit ctlz whose result is truncated to i32 before being stored to %out.
; %val directly follows %out in the argument list, and the amdgcn checks fetch
; both with a single four-dword scalar load. The expected arithmetic on the
; two 32-bit halves is find-first-bit on each, a saturating +32 on one count,
; and a three-way minimum with 64 (the GlobalISel run uses s_flbit_i32_b64
; plus s_min_u32 with 64). Only one dword is stored, so no zero upper half is
; written.
704 define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind {
705 ; SI-LABEL: s_ctlz_i64_trunc:
707 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
708 ; SI-NEXT: s_mov_b32 s7, 0xf000
709 ; SI-NEXT: s_mov_b32 s6, -1
710 ; SI-NEXT: s_waitcnt lgkmcnt(0)
711 ; SI-NEXT: s_mov_b32 s4, s0
712 ; SI-NEXT: s_mov_b32 s5, s1
713 ; SI-NEXT: s_flbit_i32_b32 s0, s2
714 ; SI-NEXT: s_min_u32 s0, s0, 0xffffffdf
715 ; SI-NEXT: s_flbit_i32_b32 s1, s3
716 ; SI-NEXT: s_add_i32 s0, s0, 32
717 ; SI-NEXT: v_mov_b32_e32 v0, s1
718 ; SI-NEXT: v_min3_u32 v0, s0, v0, 64
719 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
722 ; VI-LABEL: s_ctlz_i64_trunc:
724 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
725 ; VI-NEXT: s_mov_b32 s7, 0xf000
726 ; VI-NEXT: s_mov_b32 s6, -1
727 ; VI-NEXT: s_waitcnt lgkmcnt(0)
728 ; VI-NEXT: s_mov_b32 s4, s0
729 ; VI-NEXT: s_flbit_i32_b32 s0, s2
730 ; VI-NEXT: s_mov_b32 s5, s1
731 ; VI-NEXT: v_add_u32_e64 v0, s[0:1], s0, 32 clamp
732 ; VI-NEXT: s_flbit_i32_b32 s0, s3
733 ; VI-NEXT: v_min3_u32 v0, v0, s0, 64
734 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
737 ; EG-LABEL: s_ctlz_i64_trunc:
739 ; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
740 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
743 ; EG-NEXT: ALU clause starting at 4:
744 ; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W,
745 ; EG-NEXT: CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W,
746 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
747 ; EG-NEXT: FFBH_UINT T1.W, KC0[3].X,
748 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
749 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
750 ; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W,
751 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
752 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
754 ; GFX10-LABEL: s_ctlz_i64_trunc:
756 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
757 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
758 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
759 ; GFX10-NEXT: s_flbit_i32_b32 s2, s2
760 ; GFX10-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp
761 ; GFX10-NEXT: s_flbit_i32_b32 s2, s3
762 ; GFX10-NEXT: v_min3_u32 v0, v0, s2, 64
763 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
764 ; GFX10-NEXT: s_endpgm
766 ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc:
767 ; GFX10-GISEL: ; %bb.0:
768 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
769 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
770 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
771 ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3]
772 ; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64
773 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
774 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
775 ; GFX10-GISEL-NEXT: s_endpgm
777 ; GFX11-LABEL: s_ctlz_i64_trunc:
779 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
780 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
781 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
782 ; GFX11-NEXT: s_clz_i32_u32 s2, s2
783 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
784 ; GFX11-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp
785 ; GFX11-NEXT: s_clz_i32_u32 s2, s3
786 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
787 ; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64
788 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
789 ; GFX11-NEXT: s_nop 0
790 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
791 ; GFX11-NEXT: s_endpgm
; The trunc discards the upper 32 bits of the i64 count before the store,
; which matches the single-dword stores checked above.
792 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
793 %trunc = trunc i64 %ctlz to i32
794 store i32 %trunc, ptr addrspace(1) %out
; v_ctlz_i64: each work-item reads one i64 from %in (indexed by the value
; returned from llvm.amdgcn.workitem.id.x), calls llvm.ctlz.i64 with its
; second argument set to i1 false, and stores the full i64 result to the
; slot with the same index in %out.
;
; The amdgcn expectations below run a 32-bit find-first-bit instruction on
; each register of the loaded pair, add 32 to one of the two counts, and
; then take the minimum with 64. The GFX10-GISEL output uses two v_min_u32
; where the other amdgcn outputs use one v_min3_u32. The r600 (EG) output
; uses FFBH_UINT together with CNDE_INT instead.
798 define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
799 ; SI-LABEL: v_ctlz_i64:
801 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
802 ; SI-NEXT: s_mov_b32 s7, 0xf000
803 ; SI-NEXT: s_mov_b32 s6, 0
804 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
805 ; SI-NEXT: v_mov_b32_e32 v1, 0
806 ; SI-NEXT: s_waitcnt lgkmcnt(0)
807 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
808 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
809 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
810 ; SI-NEXT: s_waitcnt vmcnt(0)
811 ; SI-NEXT: v_ffbh_u32_e32 v2, v2
812 ; SI-NEXT: v_min_u32_e32 v2, 0xffffffdf, v2
813 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
814 ; SI-NEXT: v_ffbh_u32_e32 v3, v3
815 ; SI-NEXT: v_min3_u32 v2, v2, v3, 64
816 ; SI-NEXT: v_mov_b32_e32 v3, v1
817 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
820 ; VI-LABEL: v_ctlz_i64:
822 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
823 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
824 ; VI-NEXT: v_mov_b32_e32 v2, 0
825 ; VI-NEXT: s_waitcnt lgkmcnt(0)
826 ; VI-NEXT: v_mov_b32_e32 v1, s3
827 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3
828 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
829 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
830 ; VI-NEXT: v_mov_b32_e32 v4, s1
831 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3
832 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
833 ; VI-NEXT: s_waitcnt vmcnt(0)
834 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
835 ; VI-NEXT: v_add_u32_e64 v0, s[0:1], v0, 32 clamp
836 ; VI-NEXT: v_ffbh_u32_e32 v1, v1
837 ; VI-NEXT: v_min3_u32 v1, v0, v1, 64
838 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
841 ; EG-LABEL: v_ctlz_i64:
843 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
845 ; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
846 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
849 ; EG-NEXT: Fetch clause starting at 6:
850 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
851 ; EG-NEXT: ALU clause starting at 8:
852 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
853 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
854 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
855 ; EG-NEXT: ALU clause starting at 11:
856 ; EG-NEXT: FFBH_UINT * T1.W, T0.X,
857 ; EG-NEXT: CNDE_INT * T1.W, T0.X, literal.x, PV.W,
858 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
859 ; EG-NEXT: FFBH_UINT T2.W, T0.Y,
860 ; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
861 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
862 ; EG-NEXT: CNDE_INT T0.X, T0.Y, PS, PV.W,
863 ; EG-NEXT: MOV T0.Y, 0.0,
864 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
865 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
866 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
868 ; GFX10-LABEL: v_ctlz_i64:
870 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
871 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
872 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
873 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
874 ; GFX10-NEXT: s_waitcnt vmcnt(0)
875 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
876 ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
877 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
878 ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64
879 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
880 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
881 ; GFX10-NEXT: s_endpgm
883 ; GFX10-GISEL-LABEL: v_ctlz_i64:
884 ; GFX10-GISEL: ; %bb.0:
885 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
886 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
887 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
888 ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
889 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
890 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
891 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
892 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
893 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0
894 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
895 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0
896 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
897 ; GFX10-GISEL-NEXT: s_endpgm
899 ; GFX11-LABEL: v_ctlz_i64:
901 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
902 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
903 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
904 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
905 ; GFX11-NEXT: s_waitcnt vmcnt(0)
906 ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
907 ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
908 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
909 ; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
910 ; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64
911 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
912 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
913 ; GFX11-NEXT: s_nop 0
914 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
915 ; GFX11-NEXT: s_endpgm
916 %tid = call i32 @llvm.amdgcn.workitem.id.x()
917 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
918 %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
919 %val = load i64, ptr addrspace(1) %in.gep
920 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
921 store i64 %ctlz, ptr addrspace(1) %out.gep
; v_ctlz_i64_trunc: same input pattern as a per-work-item i64 load from
; %in followed by llvm.ctlz.i64 (second argument i1 false), but the result
; is truncated to i32 and stored to an i32-indexed slot of %out.
;
; Because the element sizes differ, the expectations compute two offsets
; from the work-item id: a shift by 3 for the i64 load address and a shift
; by 2 for the i32 store address. The count itself is formed the same way
; on the amdgcn targets (find-first-bit on each register of the pair, add
; 32 to one count, minimum with 64); only a single dword is stored.
925 define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
926 ; SI-LABEL: v_ctlz_i64_trunc:
928 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
929 ; SI-NEXT: s_mov_b32 s7, 0xf000
930 ; SI-NEXT: s_mov_b32 s6, 0
931 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
932 ; SI-NEXT: v_mov_b32_e32 v2, 0
933 ; SI-NEXT: s_waitcnt lgkmcnt(0)
934 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
935 ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
936 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
937 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
938 ; SI-NEXT: s_waitcnt vmcnt(0)
939 ; SI-NEXT: v_ffbh_u32_e32 v0, v3
940 ; SI-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0
941 ; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
942 ; SI-NEXT: v_ffbh_u32_e32 v3, v4
943 ; SI-NEXT: v_min3_u32 v0, v0, v3, 64
944 ; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
947 ; VI-LABEL: v_ctlz_i64_trunc:
949 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
950 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
951 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
952 ; VI-NEXT: s_waitcnt lgkmcnt(0)
953 ; VI-NEXT: v_mov_b32_e32 v2, s3
954 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
955 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
956 ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
957 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
958 ; VI-NEXT: v_mov_b32_e32 v4, s1
959 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
960 ; VI-NEXT: s_waitcnt vmcnt(0)
961 ; VI-NEXT: v_ffbh_u32_e32 v0, v1
962 ; VI-NEXT: v_add_u32_e64 v0, s[0:1], v0, 32 clamp
963 ; VI-NEXT: v_ffbh_u32_e32 v1, v2
964 ; VI-NEXT: v_min3_u32 v0, v0, v1, 64
965 ; VI-NEXT: flat_store_dword v[3:4], v0
968 ; EG-LABEL: v_ctlz_i64_trunc:
970 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
972 ; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
973 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
976 ; EG-NEXT: Fetch clause starting at 6:
977 ; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
978 ; EG-NEXT: ALU clause starting at 8:
979 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
980 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
981 ; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W,
982 ; EG-NEXT: ALU clause starting at 11:
983 ; EG-NEXT: FFBH_UINT * T0.W, T1.X,
984 ; EG-NEXT: CNDE_INT * T0.W, T1.X, literal.x, PV.W,
985 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
986 ; EG-NEXT: LSHL T0.Z, T0.X, literal.x,
987 ; EG-NEXT: FFBH_UINT T1.W, T1.Y,
988 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.y,
989 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
990 ; EG-NEXT: CNDE_INT T0.X, T1.Y, PS, PV.W,
991 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z,
992 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
993 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
995 ; GFX10-LABEL: v_ctlz_i64_trunc:
997 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
998 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0
999 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1001 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3]
1002 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1003 ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
1004 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2
1005 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp
1006 ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64
1007 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1008 ; GFX10-NEXT: s_endpgm
1010 ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc:
1011 ; GFX10-GISEL: ; %bb.0:
1012 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1013 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
1014 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1015 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1016 ; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3]
1017 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1018 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
1019 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
1020 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp
1021 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1
1022 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1
1023 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
1024 ; GFX10-GISEL-NEXT: s_endpgm
1026 ; GFX11-LABEL: v_ctlz_i64_trunc:
1028 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1029 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
1030 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1031 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1032 ; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
1033 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1034 ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
1035 ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2
1036 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1037 ; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp
1038 ; GFX11-NEXT: v_min3_u32 v1, v1, v2, 64
1039 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1040 ; GFX11-NEXT: s_nop 0
1041 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1042 ; GFX11-NEXT: s_endpgm
1043 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1044 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
1045 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
1046 %val = load i64, ptr addrspace(1) %in.gep
1047 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
1048 %trunc = trunc i64 %ctlz to i32
1049 store i32 %trunc, ptr addrspace(1) %out.gep
; v_ctlz_i32_sel_eq_neg1: loads an i32 from %valptr (indexed by the
; work-item id) and stores "val == 0 ? -1 : ctlz(val)" to %out. The store
; address is %out itself, with no per-work-item offset.
;
; In the SI, VI, GFX10 and GFX11 expectations the compare and select are
; gone: the loaded value goes through a single find-first-bit instruction
; and is stored directly. Presumably that instruction already yields -1
; for a zero input -- confirm against the ISA documentation.
; The GFX10-GISEL expectations still contain the compare, the minimum with
; 32 and the conditional move; the EG expectations contain two CNDE_INT.
1053 define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1054 ; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
1056 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1057 ; SI-NEXT: s_mov_b32 s7, 0xf000
1058 ; SI-NEXT: s_mov_b32 s10, 0
1059 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1060 ; SI-NEXT: v_mov_b32_e32 v1, 0
1061 ; SI-NEXT: s_mov_b32 s11, s7
1062 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1063 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1064 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1065 ; SI-NEXT: s_mov_b32 s6, -1
1066 ; SI-NEXT: s_mov_b32 s4, s0
1067 ; SI-NEXT: s_mov_b32 s5, s1
1068 ; SI-NEXT: s_waitcnt vmcnt(0)
1069 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1070 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1073 ; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
1075 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1076 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1077 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1078 ; VI-NEXT: v_mov_b32_e32 v1, s3
1079 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1080 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1081 ; VI-NEXT: flat_load_dword v0, v[0:1]
1082 ; VI-NEXT: s_mov_b32 s3, 0xf000
1083 ; VI-NEXT: s_mov_b32 s2, -1
1084 ; VI-NEXT: s_waitcnt vmcnt(0)
1085 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1086 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1089 ; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
1091 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1093 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
1094 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1097 ; EG-NEXT: Fetch clause starting at 6:
1098 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1099 ; EG-NEXT: ALU clause starting at 8:
1100 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1101 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1102 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1103 ; EG-NEXT: ALU clause starting at 11:
1104 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1105 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1106 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1107 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
1108 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1109 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1111 ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
1113 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1114 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1115 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1116 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1117 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1118 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1119 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
1120 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1121 ; GFX10-NEXT: s_endpgm
1123 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1:
1124 ; GFX10-GISEL: ; %bb.0:
1125 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1126 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1127 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1128 ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1129 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1130 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1131 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1132 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
1133 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo
1134 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
1135 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1136 ; GFX10-GISEL-NEXT: s_endpgm
1138 ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1:
1140 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1141 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1142 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1143 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1144 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1145 ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
1146 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1147 ; GFX11-NEXT: s_nop 0
1148 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1149 ; GFX11-NEXT: s_endpgm
1150 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1151 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1152 %val = load i32, ptr addrspace(1) %in.gep
1153 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
1154 %cmp = icmp eq i32 %val, 0
1155 %sel = select i1 %cmp, i32 -1, i32 %ctlz
1156 store i32 %sel, ptr addrspace(1) %out
; v_ctlz_i32_sel_ne_neg1: the inverted-predicate form of the "-1 on zero"
; pattern. It loads an i32 from %valptr (indexed by the work-item id) and
; stores "val != 0 ? ctlz(val) : -1" to %out, with no per-work-item offset
; on the store.
;
; The SI, VI, GFX10 and GFX11 expectations contain only the find-first-bit
; instruction between the load and the store, with no compare or select.
; The GFX10-GISEL expectations keep v_cmp_ne_u32, the minimum with 32 and
; v_cndmask_b32; the EG expectations contain two CNDE_INT.
1160 define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1161 ; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
1163 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1164 ; SI-NEXT: s_mov_b32 s7, 0xf000
1165 ; SI-NEXT: s_mov_b32 s10, 0
1166 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1167 ; SI-NEXT: v_mov_b32_e32 v1, 0
1168 ; SI-NEXT: s_mov_b32 s11, s7
1169 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1170 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1171 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1172 ; SI-NEXT: s_mov_b32 s6, -1
1173 ; SI-NEXT: s_mov_b32 s4, s0
1174 ; SI-NEXT: s_mov_b32 s5, s1
1175 ; SI-NEXT: s_waitcnt vmcnt(0)
1176 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1177 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1180 ; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
1182 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1183 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1184 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1185 ; VI-NEXT: v_mov_b32_e32 v1, s3
1186 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1187 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1188 ; VI-NEXT: flat_load_dword v0, v[0:1]
1189 ; VI-NEXT: s_mov_b32 s3, 0xf000
1190 ; VI-NEXT: s_mov_b32 s2, -1
1191 ; VI-NEXT: s_waitcnt vmcnt(0)
1192 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1193 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1196 ; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
1198 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1200 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
1201 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1204 ; EG-NEXT: Fetch clause starting at 6:
1205 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1206 ; EG-NEXT: ALU clause starting at 8:
1207 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1208 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1209 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1210 ; EG-NEXT: ALU clause starting at 11:
1211 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1212 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1213 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1214 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
1215 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1216 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1218 ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
1220 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1221 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1222 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1223 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1224 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1225 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1226 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
1227 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1228 ; GFX10-NEXT: s_endpgm
1230 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1:
1231 ; GFX10-GISEL: ; %bb.0:
1232 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1233 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1234 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1235 ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1236 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1237 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1238 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
1239 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
1240 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo
1241 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
1242 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1243 ; GFX10-GISEL-NEXT: s_endpgm
1245 ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1:
1247 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1248 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1249 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1250 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1251 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1252 ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
1253 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1254 ; GFX11-NEXT: s_nop 0
1255 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1256 ; GFX11-NEXT: s_endpgm
1257 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1258 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1259 %val = load i32, ptr addrspace(1) %in.gep
1260 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
1261 %cmp = icmp ne i32 %val, 0
1262 %sel = select i1 %cmp, i32 %ctlz, i32 -1
1263 store i32 %sel, ptr addrspace(1) %out
1267 ; TODO: Should be able to eliminate select here as well.
;
; v_ctlz_i32_sel_eq_bitwidth: loads an i32 from %valptr (indexed by the
; work-item id) and stores "ctlz(val) == 32 ? -1 : ctlz(val)" to %out.
; Here the compare is made on the ctlz result against the bit width (32),
; not on the input value against zero.
;
; As the TODO above records, the select is not removed for this form: every
; amdgcn expectation below still has the minimum with 32, a compare against
; 32 and a v_cndmask_b32 after the find-first-bit instruction, and the EG
; expectations keep SETE_INT followed by CNDE_INT.
1268 define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1269 ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1271 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1272 ; SI-NEXT: s_mov_b32 s7, 0xf000
1273 ; SI-NEXT: s_mov_b32 s10, 0
1274 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1275 ; SI-NEXT: v_mov_b32_e32 v1, 0
1276 ; SI-NEXT: s_mov_b32 s11, s7
1277 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1278 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1279 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1280 ; SI-NEXT: s_mov_b32 s6, -1
1281 ; SI-NEXT: s_mov_b32 s4, s0
1282 ; SI-NEXT: s_mov_b32 s5, s1
1283 ; SI-NEXT: s_waitcnt vmcnt(0)
1284 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1285 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
1286 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1287 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1288 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1291 ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1293 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1294 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1295 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1296 ; VI-NEXT: v_mov_b32_e32 v1, s3
1297 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1298 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1299 ; VI-NEXT: flat_load_dword v0, v[0:1]
1300 ; VI-NEXT: s_mov_b32 s3, 0xf000
1301 ; VI-NEXT: s_mov_b32 s2, -1
1302 ; VI-NEXT: s_waitcnt vmcnt(0)
1303 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1304 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
1305 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1306 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1307 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1310 ; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1312 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1314 ; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
1315 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1318 ; EG-NEXT: Fetch clause starting at 6:
1319 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1320 ; EG-NEXT: ALU clause starting at 8:
1321 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1322 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1323 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1324 ; EG-NEXT: ALU clause starting at 11:
1325 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1326 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1327 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1328 ; EG-NEXT: SETE_INT * T1.W, PV.W, literal.x,
1329 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1330 ; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, literal.x,
1331 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1332 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1334 ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1336 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1337 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1338 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1339 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1340 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1341 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1342 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
1343 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
1344 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
1345 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1346 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1347 ; GFX10-NEXT: s_endpgm
1349 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1350 ; GFX10-GISEL: ; %bb.0:
1351 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1352 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1353 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
1354 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1355 ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1356 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1357 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
1358 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
1359 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0
1360 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
1361 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1362 ; GFX10-GISEL-NEXT: s_endpgm
1364 ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1366 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1367 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1368 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1369 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1370 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1371 ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
1372 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1373 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
1374 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
1375 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1376 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1377 ; GFX11-NEXT: s_nop 0
1378 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1379 ; GFX11-NEXT: s_endpgm
1380 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1381 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1382 %val = load i32, ptr addrspace(1) %in.gep
1383 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
1384 %cmp = icmp eq i32 %ctlz, 32
1385 %sel = select i1 %cmp, i32 -1, i32 %ctlz
1386 store i32 %sel, ptr addrspace(1) %out
; v_ctlz_i32_sel_ne_bitwidth: the inverted-predicate form of the compare
; against the bit width. It loads an i32 from %valptr (indexed by the
; work-item id) and stores "ctlz(val) != 32 ? ctlz(val) : -1" to %out.
;
; The select is not removed in any of the expectations below: each amdgcn
; output has the find-first-bit instruction followed by the minimum with
; 32, v_cmp_ne_u32 against 32 and v_cndmask_b32, and the EG output keeps
; SETNE_INT followed by CNDE_INT.
1390 define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1391 ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1393 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1394 ; SI-NEXT: s_mov_b32 s7, 0xf000
1395 ; SI-NEXT: s_mov_b32 s10, 0
1396 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1397 ; SI-NEXT: v_mov_b32_e32 v1, 0
1398 ; SI-NEXT: s_mov_b32 s11, s7
1399 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1400 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1401 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1402 ; SI-NEXT: s_mov_b32 s6, -1
1403 ; SI-NEXT: s_mov_b32 s4, s0
1404 ; SI-NEXT: s_mov_b32 s5, s1
1405 ; SI-NEXT: s_waitcnt vmcnt(0)
1406 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1407 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
1408 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1409 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1410 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1413 ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1415 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1416 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1417 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1418 ; VI-NEXT: v_mov_b32_e32 v1, s3
1419 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1420 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1421 ; VI-NEXT: flat_load_dword v0, v[0:1]
1422 ; VI-NEXT: s_mov_b32 s3, 0xf000
1423 ; VI-NEXT: s_mov_b32 s2, -1
1424 ; VI-NEXT: s_waitcnt vmcnt(0)
1425 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1426 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
1427 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1428 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1429 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1432 ; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1434 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1436 ; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
1437 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1440 ; EG-NEXT: Fetch clause starting at 6:
1441 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1442 ; EG-NEXT: ALU clause starting at 8:
1443 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1444 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1445 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1446 ; EG-NEXT: ALU clause starting at 11:
1447 ; EG-NEXT: FFBH_UINT * T0.W, T0.X,
1448 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1449 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1450 ; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x,
1451 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1452 ; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W,
1453 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1454 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1456 ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1458 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1459 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1460 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1461 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1462 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1463 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1464 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
1465 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
1466 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
1467 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1468 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1469 ; GFX10-NEXT: s_endpgm
1471 ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1472 ; GFX10-GISEL: ; %bb.0:
1473 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1474 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1475 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
1476 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1477 ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1478 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1479 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
1480 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
1481 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
1482 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1483 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1484 ; GFX10-GISEL-NEXT: s_endpgm
1486 ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1488 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1489 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1490 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1491 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1492 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1493 ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
1494 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1495 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0
1496 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
1497 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1498 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1499 ; GFX11-NEXT: s_nop 0
1500 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1501 ; GFX11-NEXT: s_endpgm
1502 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1503 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1504 %val = load i32, ptr addrspace(1) %in.gep
1505 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
1506 %cmp = icmp ne i32 %ctlz, 32
1507 %sel = select i1 %cmp, i32 %ctlz, i32 -1
1508 store i32 %sel, ptr addrspace(1) %out
; v_ctlz_i8_sel_eq_neg1: the i8 form of the "-1 on zero" pattern. It loads
; one byte from %valptr (indexed by the work-item id) and stores
; "val == 0 ? -1 : ctlz.i8(val)" as a byte to %out, with no per-work-item
; offset on the store.
;
; The SI, VI, GFX10 and GFX11 expectations are an unsigned byte load, one
; 32-bit find-first-bit instruction and a byte store. The GFX10-GISEL
; expectations keep the compare and conditional move and additionally
; subtract 24 (v_subrev_nc_u32) after the minimum with 32.
;
; NOTE(review): unlike the GFX10-GISEL output, the other amdgcn sequences
; contain no subtraction of 24 to account for counting across a 32-bit
; register that holds a zero-extended byte -- confirm that the stored value
; is the intended i8 count for non-zero inputs.
1512 define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1513 ; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
1515 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1516 ; SI-NEXT: s_mov_b32 s7, 0xf000
1517 ; SI-NEXT: v_mov_b32_e32 v1, 0
1518 ; SI-NEXT: s_mov_b32 s10, 0
1519 ; SI-NEXT: s_mov_b32 s11, s7
1520 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1521 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1522 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1523 ; SI-NEXT: s_mov_b32 s6, -1
1524 ; SI-NEXT: s_mov_b32 s4, s0
1525 ; SI-NEXT: s_mov_b32 s5, s1
1526 ; SI-NEXT: s_waitcnt vmcnt(0)
1527 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1528 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1531 ; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
1533 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1534 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1535 ; VI-NEXT: v_mov_b32_e32 v1, s3
1536 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1537 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1538 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1539 ; VI-NEXT: s_mov_b32 s3, 0xf000
1540 ; VI-NEXT: s_mov_b32 s2, -1
1541 ; VI-NEXT: s_waitcnt vmcnt(0)
1542 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1543 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
1546 ; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
1548 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1550 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1551 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1554 ; EG-NEXT: Fetch clause starting at 6:
1555 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1556 ; EG-NEXT: ALU clause starting at 8:
1557 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1558 ; EG-NEXT: ALU clause starting at 9:
1559 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
1560 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1561 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1562 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1563 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1564 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1565 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1566 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1567 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1568 ; EG-NEXT: MOV T0.Y, 0.0,
1569 ; EG-NEXT: MOV * T0.Z, 0.0,
1570 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1571 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1573 ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
1575 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1576 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1577 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1578 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
1579 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1580 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
1581 ; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
1582 ; GFX10-NEXT: s_endpgm
1584 ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1:
1585 ; GFX10-GISEL: ; %bb.0:
1586 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1587 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
1588 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1589 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
1590 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3
1591 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
1592 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1593 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
1594 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1595 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1596 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1597 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
1598 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1
1599 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo
1600 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
1601 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
1602 ; GFX10-GISEL-NEXT: s_endpgm
1604 ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1:
1606 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1607 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1608 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1609 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
1610 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1611 ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
1612 ; GFX11-NEXT: global_store_b8 v1, v0, s[0:1]
1613 ; GFX11-NEXT: s_nop 0
1614 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1615 ; GFX11-NEXT: s_endpgm
1616 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1617 %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
1618 %val = load i8, ptr addrspace(1) %valptr.gep
1619 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
1620 %cmp = icmp eq i8 %val, 0
1621 %sel = select i1 %cmp, i8 -1, i8 %ctlz
1622 store i8 %sel, ptr addrspace(1) %out
; Test: ctlz on an i16 value combined with an explicit "input == 0 -> -1" select.
; The value is loaded straight from %valptr (no workitem-id based indexing in
; this function), passed to @llvm.ctlz.i16 with i1 false as the second argument,
; and the selected i16 result is stored to %out.
;
; The SI/VI/EG/GFX10/GFX10-GISEL/GFX11 check lines below are autogenerated (see
; the NOTE at the top of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the recorded output shows:
;   - SI: v_ffbh_u32 feeds buffer_store_short directly; no compare/select
;     instructions appear in the checks.
;   - VI, GFX10, GFX11: ffbh/clz, then v_min_u32 with 32, an add of -16, and a
;     v_cndmask choosing between that value and 0xffff based on a compare of the
;     loaded value against 0.
;   - GFX10-GISEL: same shape, but with v_cmp_eq, v_subrev 16 and an extra
;     v_and with 0xffff before the v_cndmask.
; NOTE(review): the -16 adjustment presumably converts the 32-bit leading-zero
; count into a 16-bit one (32 - 16) -- confirm against the lowering code.
1626 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1627 ; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
1629 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1630 ; SI-NEXT: s_mov_b32 s7, 0xf000
1631 ; SI-NEXT: s_mov_b32 s6, -1
1632 ; SI-NEXT: s_mov_b32 s10, s6
1633 ; SI-NEXT: s_mov_b32 s11, s7
1634 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1635 ; SI-NEXT: s_mov_b32 s8, s2
1636 ; SI-NEXT: s_mov_b32 s9, s3
1637 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
1638 ; SI-NEXT: s_mov_b32 s4, s0
1639 ; SI-NEXT: s_mov_b32 s5, s1
1640 ; SI-NEXT: s_waitcnt vmcnt(0)
1641 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1642 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
1645 ; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
1647 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1648 ; VI-NEXT: s_mov_b32 s7, 0xf000
1649 ; VI-NEXT: s_mov_b32 s6, -1
1650 ; VI-NEXT: s_mov_b32 s10, s6
1651 ; VI-NEXT: s_mov_b32 s11, s7
1652 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1653 ; VI-NEXT: s_mov_b32 s8, s2
1654 ; VI-NEXT: s_mov_b32 s9, s3
1655 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
1656 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff
1657 ; VI-NEXT: s_mov_b32 s4, s0
1658 ; VI-NEXT: s_mov_b32 s5, s1
1659 ; VI-NEXT: s_waitcnt vmcnt(0)
1660 ; VI-NEXT: v_ffbh_u32_e32 v2, v0
1661 ; VI-NEXT: v_min_u32_e32 v2, 32, v2
1662 ; VI-NEXT: v_add_u32_e32 v2, vcc, -16, v2
1663 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
1664 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
1665 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
1668 ; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
1670 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1672 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1673 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1676 ; EG-NEXT: Fetch clause starting at 6:
1677 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1678 ; EG-NEXT: ALU clause starting at 8:
1679 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1680 ; EG-NEXT: ALU clause starting at 9:
1681 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
1682 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1683 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1684 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1685 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1686 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1687 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1688 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1689 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1690 ; EG-NEXT: MOV T0.Y, 0.0,
1691 ; EG-NEXT: MOV * T0.Z, 0.0,
1692 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1693 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1695 ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
1697 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1698 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1699 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1700 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
1701 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1702 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
1703 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
1704 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
1705 ; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2
1706 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
1707 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
1708 ; GFX10-NEXT: s_endpgm
1710 ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1:
1711 ; GFX10-GISEL: ; %bb.0:
1712 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1713 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
1714 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1715 ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
1716 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1717 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
1718 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
1719 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
1720 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2
1721 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
1722 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
1723 ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
1724 ; GFX10-GISEL-NEXT: s_endpgm
1726 ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1:
1728 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1729 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1730 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1731 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
1732 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1733 ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1
1734 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
1735 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1736 ; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
1737 ; GFX11-NEXT: v_add_nc_u32_e32 v2, -16, v2
1738 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1739 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
1740 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
1741 ; GFX11-NEXT: s_nop 0
1742 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1743 ; GFX11-NEXT: s_endpgm
; IR under test: load i16 -> ctlz -> (val == 0 ? -1 : ctlz) -> store i16.
1744 %val = load i16, ptr addrspace(1) %valptr
1745 %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
1746 %cmp = icmp eq i16 %val, 0
1747 %sel = select i1 %cmp, i16 -1, i16 %ctlz
1748 store i16 %sel, ptr addrspace(1) %out
1752 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Test: ctlz on a non-power-of-two integer width (i7) combined with an explicit
; "input == 0 -> -1" select. The i7 value is loaded from %valptr indexed by the
; workitem id (getelementptr i7 ... %tid), passed to @llvm.ctlz.i7 with i1 false
; as the second argument, and the selected i7 result is stored to %out.
;
; NOTE(review): the FIXME immediately above this function mentions a
; "load without gep", but the IR body here does use a gep on %valptr --
; confirm whether that FIXME is meant for this function.
;
; The SI/VI/EG/GFX10/GFX10-GISEL/GFX11 check lines below are autogenerated (see
; the NOTE at the top of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the recorded output shows:
;   - SI, VI, GFX10, GFX11: ffbh/clz on the loaded byte followed by an AND with
;     0x7f before the byte store; no compare/select instructions appear.
;   - GFX10-GISEL: the loaded byte is masked with 0x7f first, then ffbh,
;     v_min_u32 with 32, v_subrev 25, a v_cndmask selecting 0x7f when the masked
;     value equals 0, and a final AND with 0x7f.
; NOTE(review): the 25 presumably is 32 - 7 (32-bit count adjusted to the i7
; width) -- confirm against the lowering code.
1753 define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1754 ; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
1756 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1757 ; SI-NEXT: s_mov_b32 s7, 0xf000
1758 ; SI-NEXT: v_mov_b32_e32 v1, 0
1759 ; SI-NEXT: s_mov_b32 s10, 0
1760 ; SI-NEXT: s_mov_b32 s11, s7
1761 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1762 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1763 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1764 ; SI-NEXT: s_mov_b32 s6, -1
1765 ; SI-NEXT: s_mov_b32 s4, s0
1766 ; SI-NEXT: s_mov_b32 s5, s1
1767 ; SI-NEXT: s_waitcnt vmcnt(0)
1768 ; SI-NEXT: v_ffbh_u32_e32 v0, v0
1769 ; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
1770 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1773 ; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
1775 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1776 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1777 ; VI-NEXT: v_mov_b32_e32 v1, s3
1778 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1779 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1780 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1781 ; VI-NEXT: s_mov_b32 s3, 0xf000
1782 ; VI-NEXT: s_mov_b32 s2, -1
1783 ; VI-NEXT: s_waitcnt vmcnt(0)
1784 ; VI-NEXT: v_ffbh_u32_e32 v0, v0
1785 ; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
1786 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
1789 ; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
1791 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1793 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1794 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1797 ; EG-NEXT: Fetch clause starting at 6:
1798 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1799 ; EG-NEXT: ALU clause starting at 8:
1800 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1801 ; EG-NEXT: ALU clause starting at 9:
1802 ; EG-NEXT: FFBH_UINT T0.W, T0.X,
1803 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1804 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1805 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1806 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1807 ; EG-NEXT: 127(1.779649e-43), 3(4.203895e-45)
1808 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1809 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1810 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1811 ; EG-NEXT: MOV T0.Y, 0.0,
1812 ; EG-NEXT: MOV * T0.Z, 0.0,
1813 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1814 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1816 ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
1818 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1819 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1820 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1821 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
1822 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1823 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
1824 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0
1825 ; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
1826 ; GFX10-NEXT: s_endpgm
1828 ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
1829 ; GFX10-GISEL: ; %bb.0:
1830 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1831 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
1832 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1833 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
1834 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3
1835 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
1836 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1837 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
1838 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1839 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
1840 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1841 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1842 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
1843 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1
1844 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
1845 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
1846 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
1847 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
1848 ; GFX10-GISEL-NEXT: s_endpgm
1850 ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1:
1852 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1853 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1854 ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
1855 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1856 ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
1857 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1858 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0
1859 ; GFX11-NEXT: global_store_b8 v1, v0, s[0:1]
1860 ; GFX11-NEXT: s_nop 0
1861 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1862 ; GFX11-NEXT: s_endpgm
; IR under test: per-workitem i7 load -> ctlz -> (val == 0 ? -1 : ctlz) -> store i7.
1863 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1864 %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
1865 %val = load i7, ptr addrspace(1) %valptr.gep
1866 %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
1867 %cmp = icmp eq i7 %val, 0
1868 %sel = select i1 %cmp, i7 -1, i7 %ctlz
1869 store i7 %sel, ptr addrspace(1) %out