1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
3 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
4 ; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
5 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
6 ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
; Declarations of the llvm.cttz overloads (scalar and vector) and of the
; workitem-id intrinsic used to form per-workitem addresses.
; NOTE(review): the i7, i16, v2i64 and v4i64 overloads are not called by any
; kernel visible in this part of the file - presumably used further down; confirm.
8 declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
9 declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
10 declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
12 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
13 declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
14 declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
16 declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
17 declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) nounwind readnone
18 declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1) nounwind readnone
20 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Scalar i32 case: %val arrives as a kernel argument, llvm.cttz.i32 is called
; with its i1 flag set to false, and the i32 result is stored to %out.
; The amdgcn check blocks expect s_ff1_i32_b32 followed by s_min_u32 against 32;
; the r600 check block expects FFBL_INT whose result is selected against the
; literal 32 with CNDE_INT keyed on the input value.
22 define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
23 ; SI-LABEL: s_cttz_i32:
25 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
26 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
27 ; SI-NEXT: s_mov_b32 s3, 0xf000
28 ; SI-NEXT: s_waitcnt lgkmcnt(0)
29 ; SI-NEXT: s_ff1_i32_b32 s2, s2
30 ; SI-NEXT: s_min_u32 s4, s2, 32
31 ; SI-NEXT: s_mov_b32 s2, -1
32 ; SI-NEXT: v_mov_b32_e32 v0, s4
33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
36 ; VI-LABEL: s_cttz_i32:
38 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
39 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
40 ; VI-NEXT: s_mov_b32 s7, 0xf000
41 ; VI-NEXT: s_mov_b32 s6, -1
42 ; VI-NEXT: s_waitcnt lgkmcnt(0)
43 ; VI-NEXT: s_ff1_i32_b32 s0, s0
44 ; VI-NEXT: s_min_u32 s0, s0, 32
45 ; VI-NEXT: v_mov_b32_e32 v0, s0
46 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
49 ; EG-LABEL: s_cttz_i32:
51 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
52 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
55 ; EG-NEXT: ALU clause starting at 4:
56 ; EG-NEXT: FFBL_INT * T0.W, KC0[2].Z,
57 ; EG-NEXT: CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
58 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
59 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
61 ; GFX10-LABEL: s_cttz_i32:
63 ; GFX10-NEXT: s_clause 0x1
64 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
65 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
66 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
67 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX10-NEXT: s_ff1_i32_b32 s0, s4
69 ; GFX10-NEXT: s_min_u32 s0, s0, 32
70 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
71 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
72 ; GFX10-NEXT: s_endpgm
74 ; GFX10-GISEL-LABEL: s_cttz_i32:
75 ; GFX10-GISEL: ; %bb.0:
76 ; GFX10-GISEL-NEXT: s_clause 0x1
77 ; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
78 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
79 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
80 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
81 ; GFX10-GISEL-NEXT: s_ff1_i32_b32 s0, s4
82 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32
83 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
84 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
85 ; GFX10-GISEL-NEXT: s_endpgm
86 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
87 store i32 %cttz, i32 addrspace(1)* %out, align 4
; Per-workitem i32 case: the input is loaded from %valptr indexed by
; workitem.id.x, passed to llvm.cttz.i32 with the i1 flag false, and the result
; is stored to %out (the store address is not indexed by the workitem id).
; The amdgcn check blocks expect the vector forms v_ffbl_b32 + v_min_u32 with 32;
; the r600 check block expects FFBL_INT + CNDE_INT against the literal 32.
91 define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
92 ; SI-LABEL: v_cttz_i32:
94 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
95 ; SI-NEXT: s_mov_b32 s3, 0xf000
96 ; SI-NEXT: s_mov_b32 s6, 0
97 ; SI-NEXT: s_mov_b32 s7, s3
98 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
99 ; SI-NEXT: v_mov_b32_e32 v1, 0
100 ; SI-NEXT: s_waitcnt lgkmcnt(0)
101 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
102 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
103 ; SI-NEXT: s_mov_b32 s2, -1
104 ; SI-NEXT: s_waitcnt vmcnt(0)
105 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
106 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
107 ; SI-NEXT: s_waitcnt lgkmcnt(0)
108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
111 ; VI-LABEL: v_cttz_i32:
113 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
114 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
115 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
116 ; VI-NEXT: s_mov_b32 s7, 0xf000
117 ; VI-NEXT: s_mov_b32 s6, -1
118 ; VI-NEXT: s_waitcnt lgkmcnt(0)
119 ; VI-NEXT: v_mov_b32_e32 v1, s1
120 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
121 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
122 ; VI-NEXT: flat_load_dword v0, v[0:1]
123 ; VI-NEXT: s_waitcnt vmcnt(0)
124 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
125 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
126 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
129 ; EG-LABEL: v_cttz_i32:
131 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
133 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
134 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
137 ; EG-NEXT: Fetch clause starting at 6:
138 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
139 ; EG-NEXT: ALU clause starting at 8:
140 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
141 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
142 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
143 ; EG-NEXT: ALU clause starting at 11:
144 ; EG-NEXT: FFBL_INT * T0.W, T0.X,
145 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
146 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
147 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
149 ; GFX10-LABEL: v_cttz_i32:
151 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
152 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
153 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
154 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
155 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
156 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
157 ; GFX10-NEXT: s_waitcnt vmcnt(0)
158 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
159 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
160 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
161 ; GFX10-NEXT: s_endpgm
163 ; GFX10-GISEL-LABEL: v_cttz_i32:
164 ; GFX10-GISEL: ; %bb.0:
165 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
166 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
167 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
168 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
169 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
170 ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
171 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
172 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
173 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
174 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
175 ; GFX10-GISEL-NEXT: s_endpgm
176 %tid = call i32 @llvm.amdgcn.workitem.id.x()
177 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
178 %val = load i32, i32 addrspace(1)* %in.gep, align 4
179 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
180 store i32 %cttz, i32 addrspace(1)* %out, align 4
; <2 x i32> case: one vector is loaded from %valptr indexed by workitem.id.x,
; passed to llvm.cttz.v2i32 with the i1 flag false, and stored to %out.
; Every check block expects the scalar-i32 sequence once per element (two
; find-first-bit operations and two clamps/selects against 32) around a single
; 64-bit load and a single 64-bit store.
184 define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
185 ; SI-LABEL: v_cttz_v2i32:
187 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
188 ; SI-NEXT: s_mov_b32 s3, 0xf000
189 ; SI-NEXT: s_mov_b32 s6, 0
190 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
191 ; SI-NEXT: v_mov_b32_e32 v1, 0
192 ; SI-NEXT: s_mov_b32 s7, s3
193 ; SI-NEXT: s_waitcnt lgkmcnt(0)
194 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
195 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
196 ; SI-NEXT: s_mov_b32 s2, -1
197 ; SI-NEXT: s_waitcnt vmcnt(0)
198 ; SI-NEXT: v_ffbl_b32_e32 v1, v1
199 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
200 ; SI-NEXT: v_min_u32_e32 v1, 32, v1
201 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
202 ; SI-NEXT: s_waitcnt lgkmcnt(0)
203 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
206 ; VI-LABEL: v_cttz_v2i32:
208 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
209 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
210 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
211 ; VI-NEXT: s_mov_b32 s7, 0xf000
212 ; VI-NEXT: s_mov_b32 s6, -1
213 ; VI-NEXT: s_waitcnt lgkmcnt(0)
214 ; VI-NEXT: v_mov_b32_e32 v1, s1
215 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
216 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
217 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
218 ; VI-NEXT: s_waitcnt vmcnt(0)
219 ; VI-NEXT: v_ffbl_b32_e32 v1, v1
220 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
221 ; VI-NEXT: v_min_u32_e32 v1, 32, v1
222 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
223 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
226 ; EG-LABEL: v_cttz_v2i32:
228 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
230 ; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
231 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
234 ; EG-NEXT: Fetch clause starting at 6:
235 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
236 ; EG-NEXT: ALU clause starting at 8:
237 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
238 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
239 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
240 ; EG-NEXT: ALU clause starting at 11:
241 ; EG-NEXT: FFBL_INT * T0.W, T0.Y,
242 ; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
243 ; EG-NEXT: FFBL_INT * T0.W, T0.X,
244 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
245 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
246 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
247 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
249 ; GFX10-LABEL: v_cttz_v2i32:
251 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
252 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
253 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
254 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
255 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
256 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
257 ; GFX10-NEXT: s_waitcnt vmcnt(0)
258 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
259 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
260 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
261 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
262 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
263 ; GFX10-NEXT: s_endpgm
265 ; GFX10-GISEL-LABEL: v_cttz_v2i32:
266 ; GFX10-GISEL: ; %bb.0:
267 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
268 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
269 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
270 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
271 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
272 ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
273 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
274 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
275 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
276 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
277 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
278 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
279 ; GFX10-GISEL-NEXT: s_endpgm
280 %tid = call i32 @llvm.amdgcn.workitem.id.x()
281 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
282 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
283 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
284 store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
; <4 x i32> case: one vector is loaded from %valptr indexed by workitem.id.x,
; passed to llvm.cttz.v4i32 with the i1 flag false, and stored to %out.
; Every check block expects four find-first-bit operations and four
; clamps/selects against 32 around a single 128-bit load and a single 128-bit
; store.
288 define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
289 ; SI-LABEL: v_cttz_v4i32:
291 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
292 ; SI-NEXT: s_mov_b32 s3, 0xf000
293 ; SI-NEXT: s_mov_b32 s6, 0
294 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
295 ; SI-NEXT: v_mov_b32_e32 v1, 0
296 ; SI-NEXT: s_mov_b32 s7, s3
297 ; SI-NEXT: s_waitcnt lgkmcnt(0)
298 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
299 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
300 ; SI-NEXT: s_mov_b32 s2, -1
301 ; SI-NEXT: s_waitcnt vmcnt(0)
302 ; SI-NEXT: v_ffbl_b32_e32 v3, v3
303 ; SI-NEXT: v_ffbl_b32_e32 v2, v2
304 ; SI-NEXT: v_ffbl_b32_e32 v1, v1
305 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
306 ; SI-NEXT: v_min_u32_e32 v3, 32, v3
307 ; SI-NEXT: v_min_u32_e32 v2, 32, v2
308 ; SI-NEXT: v_min_u32_e32 v1, 32, v1
309 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
310 ; SI-NEXT: s_waitcnt lgkmcnt(0)
311 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
314 ; VI-LABEL: v_cttz_v4i32:
316 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
317 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
318 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
319 ; VI-NEXT: s_mov_b32 s7, 0xf000
320 ; VI-NEXT: s_mov_b32 s6, -1
321 ; VI-NEXT: s_waitcnt lgkmcnt(0)
322 ; VI-NEXT: v_mov_b32_e32 v1, s1
323 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
324 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
325 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
326 ; VI-NEXT: s_waitcnt vmcnt(0)
327 ; VI-NEXT: v_ffbl_b32_e32 v3, v3
328 ; VI-NEXT: v_ffbl_b32_e32 v2, v2
329 ; VI-NEXT: v_ffbl_b32_e32 v1, v1
330 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
331 ; VI-NEXT: v_min_u32_e32 v3, 32, v3
332 ; VI-NEXT: v_min_u32_e32 v2, 32, v2
333 ; VI-NEXT: v_min_u32_e32 v1, 32, v1
334 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
335 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
338 ; EG-LABEL: v_cttz_v4i32:
340 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
342 ; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
343 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
346 ; EG-NEXT: Fetch clause starting at 6:
347 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
348 ; EG-NEXT: ALU clause starting at 8:
349 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
350 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
351 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
352 ; EG-NEXT: ALU clause starting at 11:
353 ; EG-NEXT: FFBL_INT * T1.W, T0.W,
354 ; EG-NEXT: FFBL_INT T2.W, T0.Z,
355 ; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
356 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
357 ; EG-NEXT: CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
358 ; EG-NEXT: FFBL_INT * T1.W, T0.Y,
359 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
360 ; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
361 ; EG-NEXT: FFBL_INT * T1.W, T0.X,
362 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
363 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
364 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
365 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
367 ; GFX10-LABEL: v_cttz_v4i32:
369 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
370 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
371 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
372 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
373 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
374 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
375 ; GFX10-NEXT: s_waitcnt vmcnt(0)
376 ; GFX10-NEXT: v_ffbl_b32_e32 v3, v3
377 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
378 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
379 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
380 ; GFX10-NEXT: v_min_u32_e32 v3, 32, v3
381 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
382 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
383 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
384 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
385 ; GFX10-NEXT: s_endpgm
387 ; GFX10-GISEL-LABEL: v_cttz_v4i32:
388 ; GFX10-GISEL: ; %bb.0:
389 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
390 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
391 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
392 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0
393 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
394 ; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
395 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
396 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
397 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
398 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
399 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v3, v3
400 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
401 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
402 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
403 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3
404 ; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
405 ; GFX10-GISEL-NEXT: s_endpgm
406 %tid = call i32 @llvm.amdgcn.workitem.id.x()
407 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
408 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
409 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
410 store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
; i8 case: a byte is loaded directly from %valptr (no workitem index), passed to
; llvm.cttz.i8 with the i1 flag false, and the i8 result is stored to %out.
; Unlike the i32 kernels, the check blocks expect the loaded byte to be ORed
; with 0x100 (256) before the 32-bit find-first-bit, and no min/select against
; the bit width follows. The r600 check block additionally expects the byte
; store to be emitted as a masked MEM_RAT MSKOR write.
414 define amdgpu_kernel void @v_cttz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
415 ; SI-LABEL: v_cttz_i8:
417 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
418 ; SI-NEXT: s_mov_b32 s3, 0xf000
419 ; SI-NEXT: s_mov_b32 s2, -1
420 ; SI-NEXT: s_mov_b32 s6, s2
421 ; SI-NEXT: s_mov_b32 s7, s3
422 ; SI-NEXT: s_waitcnt lgkmcnt(0)
423 ; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
424 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
425 ; SI-NEXT: s_waitcnt vmcnt(0)
426 ; SI-NEXT: v_or_b32_e32 v0, 0x100, v0
427 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
428 ; SI-NEXT: s_waitcnt lgkmcnt(0)
429 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
432 ; VI-LABEL: v_cttz_i8:
434 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
435 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
436 ; VI-NEXT: s_mov_b32 s7, 0xf000
437 ; VI-NEXT: s_mov_b32 s6, -1
438 ; VI-NEXT: s_mov_b32 s2, s6
439 ; VI-NEXT: s_mov_b32 s3, s7
440 ; VI-NEXT: s_waitcnt lgkmcnt(0)
441 ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
442 ; VI-NEXT: s_waitcnt vmcnt(0)
443 ; VI-NEXT: v_or_b32_e32 v0, 0x100, v0
444 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
445 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
448 ; EG-LABEL: v_cttz_i8:
450 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
452 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
453 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
456 ; EG-NEXT: Fetch clause starting at 6:
457 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
458 ; EG-NEXT: ALU clause starting at 8:
459 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
460 ; EG-NEXT: ALU clause starting at 9:
461 ; EG-NEXT: OR_INT * T0.W, T0.X, literal.x,
462 ; EG-NEXT: 256(3.587324e-43), 0(0.000000e+00)
463 ; EG-NEXT: FFBL_INT T0.W, PV.W,
464 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
465 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
466 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
467 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
468 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
469 ; EG-NEXT: LSHL T0.X, PV.W, PS,
470 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
471 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
472 ; EG-NEXT: MOV T0.Y, 0.0,
473 ; EG-NEXT: MOV * T0.Z, 0.0,
474 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
475 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
477 ; GFX10-LABEL: v_cttz_i8:
479 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
480 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
481 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
482 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
483 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
484 ; GFX10-NEXT: s_waitcnt vmcnt(0)
485 ; GFX10-NEXT: v_or_b32_e32 v1, 0x100, v1
486 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
487 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
488 ; GFX10-NEXT: s_endpgm
490 ; GFX10-GISEL-LABEL: v_cttz_i8:
491 ; GFX10-GISEL: ; %bb.0:
492 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
493 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
494 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
495 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
496 ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
497 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
498 ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v1
499 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
500 ; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
501 ; GFX10-GISEL-NEXT: s_endpgm
502 %val = load i8, i8 addrspace(1)* %valptr
503 %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
504 store i8 %cttz, i8 addrspace(1)* %out
; Scalar i64 case: %val is a kernel argument placed after an unnamed [8 x i32]
; argument (the checks load it from offset 0x13 / 0x4c rather than directly
; after %out), llvm.cttz.i64 is called with the i1 flag false, and the i64
; result is stored to %out.
; The SelectionDAG amdgcn check blocks expect two 32-bit s_ff1_i32_b32
; operations: the high half's result gets 32 added with saturation (an explicit
; min against 0xffffffdf on the oldest target, a clamped add elsewhere) and is
; combined with the low half and 64 by v_min3_u32. The GlobalISel check block
; instead expects a single s_ff1_i32_b64 followed by s_min_u32 against 64.
508 define amdgpu_kernel void @s_cttz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
509 ; SI-LABEL: s_cttz_i64:
511 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
512 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
513 ; SI-NEXT: s_mov_b32 s3, 0xf000
514 ; SI-NEXT: s_mov_b32 s2, -1
515 ; SI-NEXT: s_waitcnt lgkmcnt(0)
516 ; SI-NEXT: s_ff1_i32_b32 s5, s5
517 ; SI-NEXT: s_min_u32 s5, s5, 0xffffffdf
518 ; SI-NEXT: s_add_i32 s5, s5, 32
519 ; SI-NEXT: s_ff1_i32_b32 s4, s4
520 ; SI-NEXT: v_mov_b32_e32 v0, s5
521 ; SI-NEXT: v_min3_u32 v0, s4, v0, 64
522 ; SI-NEXT: v_mov_b32_e32 v1, 0
523 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
526 ; VI-LABEL: s_cttz_i64:
528 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
529 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
530 ; VI-NEXT: s_mov_b32 s7, 0xf000
531 ; VI-NEXT: s_mov_b32 s6, -1
532 ; VI-NEXT: v_mov_b32_e32 v1, 0
533 ; VI-NEXT: s_waitcnt lgkmcnt(0)
534 ; VI-NEXT: s_ff1_i32_b32 s1, s1
535 ; VI-NEXT: v_add_u32_e64 v0, s[2:3], s1, 32 clamp
536 ; VI-NEXT: s_ff1_i32_b32 s0, s0
537 ; VI-NEXT: v_min3_u32 v0, s0, v0, 64
538 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
541 ; EG-LABEL: s_cttz_i64:
543 ; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
544 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
547 ; EG-NEXT: ALU clause starting at 4:
548 ; EG-NEXT: FFBL_INT * T0.W, KC0[5].X,
549 ; EG-NEXT: CNDE_INT * T0.W, KC0[5].X, literal.x, PV.W,
550 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
551 ; EG-NEXT: FFBL_INT T1.W, KC0[4].W,
552 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
553 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
554 ; EG-NEXT: CNDE_INT T0.X, KC0[4].W, PS, PV.W,
555 ; EG-NEXT: MOV T0.Y, 0.0,
556 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
557 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
559 ; GFX10-LABEL: s_cttz_i64:
561 ; GFX10-NEXT: s_clause 0x1
562 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
563 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
564 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
565 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
566 ; GFX10-NEXT: s_ff1_i32_b32 s0, s3
567 ; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp
568 ; GFX10-NEXT: s_ff1_i32_b32 s0, s2
569 ; GFX10-NEXT: v_min3_u32 v0, s0, v0, 64
570 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
571 ; GFX10-NEXT: s_endpgm
573 ; GFX10-GISEL-LABEL: s_cttz_i64:
574 ; GFX10-GISEL: ; %bb.0:
575 ; GFX10-GISEL-NEXT: s_clause 0x1
576 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
577 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
578 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
579 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
580 ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3]
581 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64
582 ; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
583 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
584 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
585 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
586 ; GFX10-GISEL-NEXT: s_endpgm
587 %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
588 store i64 %cttz, i64 addrspace(1)* %out
; Scalar i64 case with a truncated result: llvm.cttz.i64 is called on the
; kernel argument %val with the i1 flag false, the i64 result is truncated to
; i32, and that i32 is stored to %out.
; The check blocks expect the same split/min computation as the untruncated
; scalar i64 kernel, but only a single dword is stored and no zero high half is
; materialised; the GlobalISel check block has no s_bfe_u64 here.
592 define amdgpu_kernel void @s_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
593 ; SI-LABEL: s_cttz_i64_trunc:
595 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
596 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
597 ; SI-NEXT: s_mov_b32 s3, 0xf000
598 ; SI-NEXT: s_mov_b32 s2, -1
599 ; SI-NEXT: s_waitcnt lgkmcnt(0)
600 ; SI-NEXT: s_ff1_i32_b32 s5, s5
601 ; SI-NEXT: s_min_u32 s5, s5, 0xffffffdf
602 ; SI-NEXT: s_add_i32 s5, s5, 32
603 ; SI-NEXT: s_ff1_i32_b32 s4, s4
604 ; SI-NEXT: v_mov_b32_e32 v0, s5
605 ; SI-NEXT: v_min3_u32 v0, s4, v0, 64
606 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
609 ; VI-LABEL: s_cttz_i64_trunc:
611 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
612 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
613 ; VI-NEXT: s_mov_b32 s7, 0xf000
614 ; VI-NEXT: s_mov_b32 s6, -1
615 ; VI-NEXT: s_waitcnt lgkmcnt(0)
616 ; VI-NEXT: s_ff1_i32_b32 s1, s1
617 ; VI-NEXT: v_add_u32_e64 v0, s[2:3], s1, 32 clamp
618 ; VI-NEXT: s_ff1_i32_b32 s0, s0
619 ; VI-NEXT: v_min3_u32 v0, s0, v0, 64
620 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
623 ; EG-LABEL: s_cttz_i64_trunc:
625 ; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
626 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
629 ; EG-NEXT: ALU clause starting at 4:
630 ; EG-NEXT: FFBL_INT * T0.W, KC0[3].X,
631 ; EG-NEXT: CNDE_INT * T0.W, KC0[3].X, literal.x, PV.W,
632 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
633 ; EG-NEXT: FFBL_INT T1.W, KC0[2].W,
634 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
635 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
636 ; EG-NEXT: CNDE_INT T0.X, KC0[2].W, PS, PV.W,
637 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
638 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
640 ; GFX10-LABEL: s_cttz_i64_trunc:
642 ; GFX10-NEXT: s_clause 0x1
643 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
644 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
645 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
646 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
647 ; GFX10-NEXT: s_ff1_i32_b32 s0, s3
648 ; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp
649 ; GFX10-NEXT: s_ff1_i32_b32 s0, s2
650 ; GFX10-NEXT: v_min3_u32 v0, s0, v0, 64
651 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
652 ; GFX10-NEXT: s_endpgm
654 ; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
655 ; GFX10-GISEL: ; %bb.0:
656 ; GFX10-GISEL-NEXT: s_clause 0x1
657 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
658 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
659 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
660 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
661 ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3]
662 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64
663 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
664 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
665 ; GFX10-GISEL-NEXT: s_endpgm
666 %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
667 %trunc = trunc i64 %cttz to i32
668 store i32 %trunc, i32 addrspace(1)* %out
; Per-workitem i64 case: both the load address (%in) and the store address
; (%out) are indexed by workitem.id.x; llvm.cttz.i64 is called on the loaded
; value with the i1 flag false and the i64 result is stored.
; The amdgcn check blocks expect two v_ffbl_b32 operations (one per 32-bit
; half), a saturating add of 32 on the high half's result, and a clamp to 64:
; a single v_min3_u32 in the SelectionDAG blocks, two v_min_u32 in the
; GlobalISel block. The high dword of the stored value is a zero move.
672 define amdgpu_kernel void @v_cttz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
673 ; SI-LABEL: v_cttz_i64:
675 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
676 ; SI-NEXT: s_mov_b32 s7, 0xf000
677 ; SI-NEXT: s_mov_b32 s6, 0
678 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
679 ; SI-NEXT: v_mov_b32_e32 v1, 0
680 ; SI-NEXT: s_waitcnt lgkmcnt(0)
681 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
682 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
683 ; SI-NEXT: s_waitcnt vmcnt(0)
684 ; SI-NEXT: v_ffbl_b32_e32 v3, v3
685 ; SI-NEXT: v_min_u32_e32 v3, 0xffffffdf, v3
686 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v3
687 ; SI-NEXT: v_ffbl_b32_e32 v2, v2
688 ; SI-NEXT: v_min3_u32 v2, v2, v3, 64
689 ; SI-NEXT: v_mov_b32_e32 v3, v1
690 ; SI-NEXT: s_waitcnt lgkmcnt(0)
691 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
694 ; VI-LABEL: v_cttz_i64:
696 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
697 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
698 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
699 ; VI-NEXT: v_mov_b32_e32 v2, 0
700 ; VI-NEXT: s_waitcnt lgkmcnt(0)
701 ; VI-NEXT: v_mov_b32_e32 v4, s3
702 ; VI-NEXT: v_mov_b32_e32 v1, s1
703 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v3
704 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
705 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
706 ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3
707 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
708 ; VI-NEXT: s_waitcnt vmcnt(0)
709 ; VI-NEXT: v_ffbl_b32_e32 v1, v1
710 ; VI-NEXT: v_add_u32_e64 v1, s[0:1], v1, 32 clamp
711 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
712 ; VI-NEXT: v_min3_u32 v1, v0, v1, 64
713 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
716 ; EG-LABEL: v_cttz_i64:
718 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
720 ; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
721 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
724 ; EG-NEXT: Fetch clause starting at 6:
725 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
726 ; EG-NEXT: ALU clause starting at 8:
727 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
728 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
729 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
730 ; EG-NEXT: ALU clause starting at 11:
731 ; EG-NEXT: FFBL_INT * T1.W, T0.Y,
732 ; EG-NEXT: CNDE_INT * T1.W, T0.Y, literal.x, PV.W,
733 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
734 ; EG-NEXT: FFBL_INT T2.W, T0.X,
735 ; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
736 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
737 ; EG-NEXT: CNDE_INT T0.X, T0.X, PS, PV.W,
738 ; EG-NEXT: MOV T0.Y, 0.0,
739 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
740 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
741 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
743 ; GFX10-LABEL: v_cttz_i64:
745 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
746 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
747 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
748 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
749 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
750 ; GFX10-NEXT: s_waitcnt vmcnt(0)
751 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
752 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
753 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp
754 ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64
755 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
756 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
757 ; GFX10-NEXT: s_endpgm
759 ; GFX10-GISEL-LABEL: v_cttz_i64:
760 ; GFX10-GISEL: ; %bb.0:
761 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
762 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
763 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
764 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
765 ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
766 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
767 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
768 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
769 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp
770 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1
771 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
772 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0
773 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
774 ; GFX10-GISEL-NEXT: s_endpgm
775 %tid = call i32 @llvm.amdgcn.workitem.id.x()
776 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
777 %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
778 %val = load i64, i64 addrspace(1)* %in.gep
779 %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
780 store i64 %cttz, i64 addrspace(1)* %out.gep
; Per-workitem 64-bit cttz whose result is truncated to i32 before the store.
; Each workitem loads in[tid], counts trailing zeros with a defined zero-input
; result (i1 false), and writes the low 32 bits of the count to out[tid].
; The amdgcn checks expect the count to be assembled from two 32-bit v_ffbl_b32
; results: the high-half count gets 32 added (after a min against 0xffffffdf on
; the default-cpu run, with a clamped add on the other runs) and is then reduced
; against the low-half count and 64.
784 define amdgpu_kernel void @v_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
785 ; SI-LABEL: v_cttz_i64_trunc:
787 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
788 ; SI-NEXT: s_mov_b32 s7, 0xf000
789 ; SI-NEXT: s_mov_b32 s6, 0
790 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
791 ; SI-NEXT: v_mov_b32_e32 v2, 0
792 ; SI-NEXT: s_waitcnt lgkmcnt(0)
793 ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
794 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
795 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
796 ; SI-NEXT: s_waitcnt vmcnt(0)
797 ; SI-NEXT: v_ffbl_b32_e32 v0, v4
798 ; SI-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0
799 ; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
800 ; SI-NEXT: v_ffbl_b32_e32 v3, v3
801 ; SI-NEXT: v_min3_u32 v0, v3, v0, 64
802 ; SI-NEXT: s_waitcnt lgkmcnt(0)
803 ; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
806 ; VI-LABEL: v_cttz_i64_trunc:
808 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
809 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
810 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
811 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
812 ; VI-NEXT: s_waitcnt lgkmcnt(0)
813 ; VI-NEXT: v_mov_b32_e32 v4, s3
814 ; VI-NEXT: v_mov_b32_e32 v2, s1
815 ; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
816 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
817 ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
818 ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
819 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
820 ; VI-NEXT: s_waitcnt vmcnt(0)
821 ; VI-NEXT: v_ffbl_b32_e32 v0, v2
822 ; VI-NEXT: v_add_u32_e64 v0, s[0:1], v0, 32 clamp
823 ; VI-NEXT: v_ffbl_b32_e32 v1, v1
824 ; VI-NEXT: v_min3_u32 v0, v1, v0, 64
825 ; VI-NEXT: flat_store_dword v[3:4], v0
828 ; EG-LABEL: v_cttz_i64_trunc:
830 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
832 ; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
833 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
836 ; EG-NEXT: Fetch clause starting at 6:
837 ; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
838 ; EG-NEXT: ALU clause starting at 8:
839 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
840 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
841 ; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W,
842 ; EG-NEXT: ALU clause starting at 11:
843 ; EG-NEXT: FFBL_INT * T0.W, T1.Y,
844 ; EG-NEXT: CNDE_INT * T0.W, T1.Y, literal.x, PV.W,
845 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
846 ; EG-NEXT: LSHL T0.Z, T0.X, literal.x,
847 ; EG-NEXT: FFBL_INT T1.W, T1.X, BS:VEC_120/SCL_212
848 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.y,
849 ; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
850 ; EG-NEXT: CNDE_INT T0.X, T1.X, PS, PV.W,
851 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z,
852 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
853 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
855 ; GFX10-LABEL: v_cttz_i64_trunc:
857 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
858 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0
859 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
860 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
861 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
862 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3]
863 ; GFX10-NEXT: s_waitcnt vmcnt(0)
864 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
865 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
866 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
867 ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64
868 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
869 ; GFX10-NEXT: s_endpgm
871 ; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
872 ; GFX10-GISEL: ; %bb.0:
873 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
874 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
875 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
876 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
877 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
878 ; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3]
879 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
880 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
881 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
882 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
883 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2
884 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1
885 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
886 ; GFX10-GISEL-NEXT: s_endpgm
887 %tid = call i32 @llvm.amdgcn.workitem.id.x()
888 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
889 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
890 %val = load i64, i64 addrspace(1)* %in.gep
891 %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
; The count is at most 64, so truncating it to i32 drops no information.
892 %trunc = trunc i64 %cttz to i32
893 store i32 %trunc, i32 addrspace(1)* %out.gep
; cttz of an i32 loaded from valptr[tid], with a zero input remapped to -1:
;   result = (val == 0) ? -1 : cttz(val)
; The result is stored to %out itself, not to a per-workitem slot.
; The SelectionDAG amdgcn checks (default-cpu, tonga and gfx1010 runs) expect the
; compare and select to disappear, leaving a bare v_ffbl_b32; the GlobalISel
; checks still expect v_cmp_eq_u32, v_min_u32 and v_cndmask_b32.
897 define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
898 ; SI-LABEL: v_cttz_i32_sel_eq_neg1:
900 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
901 ; SI-NEXT: s_mov_b32 s3, 0xf000
902 ; SI-NEXT: s_mov_b32 s6, 0
903 ; SI-NEXT: s_mov_b32 s7, s3
904 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
905 ; SI-NEXT: v_mov_b32_e32 v1, 0
906 ; SI-NEXT: s_waitcnt lgkmcnt(0)
907 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
908 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
909 ; SI-NEXT: s_mov_b32 s2, -1
910 ; SI-NEXT: s_waitcnt vmcnt(0)
911 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
912 ; SI-NEXT: s_waitcnt lgkmcnt(0)
913 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
916 ; VI-LABEL: v_cttz_i32_sel_eq_neg1:
918 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
919 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
920 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
921 ; VI-NEXT: s_mov_b32 s7, 0xf000
922 ; VI-NEXT: s_mov_b32 s6, -1
923 ; VI-NEXT: s_waitcnt lgkmcnt(0)
924 ; VI-NEXT: v_mov_b32_e32 v1, s1
925 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
926 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
927 ; VI-NEXT: flat_load_dword v0, v[0:1]
928 ; VI-NEXT: s_waitcnt vmcnt(0)
929 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
930 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
933 ; EG-LABEL: v_cttz_i32_sel_eq_neg1:
935 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
937 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
938 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
941 ; EG-NEXT: Fetch clause starting at 6:
942 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
943 ; EG-NEXT: ALU clause starting at 8:
944 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
945 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
946 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
947 ; EG-NEXT: ALU clause starting at 11:
948 ; EG-NEXT: FFBL_INT * T0.W, T0.X,
949 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
950 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
951 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
952 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
953 ; EG-NEXT: -1(nan), 2(2.802597e-45)
955 ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1:
957 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
958 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
959 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
960 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
961 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
962 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
963 ; GFX10-NEXT: s_waitcnt vmcnt(0)
964 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
965 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
966 ; GFX10-NEXT: s_endpgm
968 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
969 ; GFX10-GISEL: ; %bb.0:
970 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
971 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
972 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
973 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
974 ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
975 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
976 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0
977 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
978 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
979 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo
980 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
981 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
982 ; GFX10-GISEL-NEXT: s_endpgm
983 %tid = call i32 @llvm.amdgcn.workitem.id.x()
984 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
985 %val = load i32, i32 addrspace(1)* %in.gep
986 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
; The compare is on the loaded value, not on the count.
987 %cmp = icmp eq i32 %val, 0
988 %sel = select i1 %cmp, i32 -1, i32 %cttz
989 store i32 %sel, i32 addrspace(1)* %out
; Same zero-input-to-minus-one pattern as an i32 cttz select, written with the
; inverted predicate and swapped select operands:
;   result = (val != 0) ? cttz(val) : -1
; The input is loaded from valptr[tid]; the result is stored to %out itself.
; The SelectionDAG amdgcn checks (default-cpu, tonga and gfx1010 runs) expect a
; bare v_ffbl_b32 with no compare or select; the GlobalISel checks still expect
; v_cmp_ne_u32, v_min_u32 and v_cndmask_b32.
993 define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
994 ; SI-LABEL: v_cttz_i32_sel_ne_neg1:
996 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
997 ; SI-NEXT: s_mov_b32 s3, 0xf000
998 ; SI-NEXT: s_mov_b32 s6, 0
999 ; SI-NEXT: s_mov_b32 s7, s3
1000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1001 ; SI-NEXT: v_mov_b32_e32 v1, 0
1002 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1003 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1004 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1005 ; SI-NEXT: s_mov_b32 s2, -1
1006 ; SI-NEXT: s_waitcnt vmcnt(0)
1007 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1008 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1009 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1012 ; VI-LABEL: v_cttz_i32_sel_ne_neg1:
1014 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1015 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1016 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1017 ; VI-NEXT: s_mov_b32 s7, 0xf000
1018 ; VI-NEXT: s_mov_b32 s6, -1
1019 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1020 ; VI-NEXT: v_mov_b32_e32 v1, s1
1021 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1022 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1023 ; VI-NEXT: flat_load_dword v0, v[0:1]
1024 ; VI-NEXT: s_waitcnt vmcnt(0)
1025 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
1026 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1029 ; EG-LABEL: v_cttz_i32_sel_ne_neg1:
1031 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1033 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
1034 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1037 ; EG-NEXT: Fetch clause starting at 6:
1038 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1039 ; EG-NEXT: ALU clause starting at 8:
1040 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1041 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1042 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1043 ; EG-NEXT: ALU clause starting at 11:
1044 ; EG-NEXT: FFBL_INT * T0.W, T0.X,
1045 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1046 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1047 ; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W,
1048 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1049 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1051 ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1:
1053 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1054 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1055 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1056 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1057 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1058 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1059 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1060 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
1061 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1062 ; GFX10-NEXT: s_endpgm
1064 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
1065 ; GFX10-GISEL: ; %bb.0:
1066 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1067 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1068 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1069 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1070 ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1071 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1072 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0
1073 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
1074 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
1075 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo
1076 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
1077 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1078 ; GFX10-GISEL-NEXT: s_endpgm
1079 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1080 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1081 %val = load i32, i32 addrspace(1)* %in.gep
1082 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
; Inverted form: a non-zero input keeps the count, a zero input yields -1.
1083 %cmp = icmp ne i32 %val, 0
1084 %sel = select i1 %cmp, i32 %cttz, i32 -1
1085 store i32 %sel, i32 addrspace(1)* %out
1089 ; TODO: Should be able to eliminate select here as well.
; cttz of an i32 loaded from valptr[tid], remapping the zero-input result to -1
; by comparing the count (rather than the input) against the bit width:
;   result = (cttz(val) == 32) ? -1 : cttz(val)
; The result is stored to %out itself.
; All amdgcn checks currently keep a v_min_u32 / v_cmp / v_cndmask_b32 sequence
; after v_ffbl_b32, i.e. the select is not folded for this form.
1090 define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1091 ; SI-LABEL: v_cttz_i32_sel_eq_bitwidth:
1093 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
1094 ; SI-NEXT: s_mov_b32 s3, 0xf000
1095 ; SI-NEXT: s_mov_b32 s6, 0
1096 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1097 ; SI-NEXT: v_mov_b32_e32 v1, 0
1098 ; SI-NEXT: s_mov_b32 s7, s3
1099 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1100 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1101 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1102 ; SI-NEXT: s_mov_b32 s2, -1
1103 ; SI-NEXT: s_waitcnt vmcnt(0)
1104 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1105 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
1106 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1107 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1108 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1109 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1112 ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth:
1114 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1115 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1116 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1117 ; VI-NEXT: s_mov_b32 s7, 0xf000
1118 ; VI-NEXT: s_mov_b32 s6, -1
1119 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1120 ; VI-NEXT: v_mov_b32_e32 v1, s1
1121 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1122 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1123 ; VI-NEXT: flat_load_dword v0, v[0:1]
1124 ; VI-NEXT: s_waitcnt vmcnt(0)
1125 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
1126 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
1127 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1128 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1129 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1132 ; EG-LABEL: v_cttz_i32_sel_eq_bitwidth:
1134 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1136 ; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
1137 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1140 ; EG-NEXT: Fetch clause starting at 6:
1141 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1142 ; EG-NEXT: ALU clause starting at 8:
1143 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1144 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1145 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1146 ; EG-NEXT: ALU clause starting at 11:
1147 ; EG-NEXT: FFBL_INT * T0.W, T0.X,
1148 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1149 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1150 ; EG-NEXT: SETE_INT * T1.W, PV.W, literal.x,
1151 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1152 ; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, literal.x,
1153 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1154 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1156 ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
1158 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1159 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1160 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1161 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1162 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1163 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1164 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1165 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
1166 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
1167 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
1168 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1169 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1170 ; GFX10-NEXT: s_endpgm
1172 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
1173 ; GFX10-GISEL: ; %bb.0:
1174 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1175 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1176 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1177 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
1178 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1179 ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1180 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1181 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
1182 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
1183 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0
1184 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
1185 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1186 ; GFX10-GISEL-NEXT: s_endpgm
1187 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1188 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1189 %val = load i32, i32 addrspace(1)* %in.gep
1190 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
; 32 is the bit width, which is what cttz yields for a zero input when the
; i1 flag is false.
1191 %cmp = icmp eq i32 %cttz, 32
1192 %sel = select i1 %cmp, i32 -1, i32 %cttz
1193 store i32 %sel, i32 addrspace(1)* %out
; Inverted-predicate form of the bit-width comparison on an i32 cttz:
;   result = (cttz(val) != 32) ? cttz(val) : -1
; The input is loaded from valptr[tid]; the result is stored to %out itself.
; All amdgcn checks keep v_min_u32, v_cmp_ne_u32 and v_cndmask_b32 after
; v_ffbl_b32, i.e. the select is not folded for this form.
1197 define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
1198 ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1200 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
1201 ; SI-NEXT: s_mov_b32 s3, 0xf000
1202 ; SI-NEXT: s_mov_b32 s6, 0
1203 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1204 ; SI-NEXT: v_mov_b32_e32 v1, 0
1205 ; SI-NEXT: s_mov_b32 s7, s3
1206 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1207 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1208 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1209 ; SI-NEXT: s_mov_b32 s2, -1
1210 ; SI-NEXT: s_waitcnt vmcnt(0)
1211 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1212 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
1213 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1214 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1215 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1216 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1219 ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1221 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1222 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1223 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1224 ; VI-NEXT: s_mov_b32 s7, 0xf000
1225 ; VI-NEXT: s_mov_b32 s6, -1
1226 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1227 ; VI-NEXT: v_mov_b32_e32 v1, s1
1228 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1229 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1230 ; VI-NEXT: flat_load_dword v0, v[0:1]
1231 ; VI-NEXT: s_waitcnt vmcnt(0)
1232 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
1233 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
1234 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1235 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1236 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1239 ; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
1241 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1243 ; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
1244 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1247 ; EG-NEXT: Fetch clause starting at 6:
1248 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1249 ; EG-NEXT: ALU clause starting at 8:
1250 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1251 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1252 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1253 ; EG-NEXT: ALU clause starting at 11:
1254 ; EG-NEXT: FFBL_INT * T0.W, T0.X,
1255 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1256 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1257 ; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x,
1258 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1259 ; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W,
1260 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1261 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1263 ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
1265 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1266 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1267 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1268 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1269 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1270 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1271 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1272 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
1273 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
1274 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
1275 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1276 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1277 ; GFX10-NEXT: s_endpgm
1279 ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
1280 ; GFX10-GISEL: ; %bb.0:
1281 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1282 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1283 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1284 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
1285 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1286 ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
1287 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1288 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
1289 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
1290 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
1291 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1292 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
1293 ; GFX10-GISEL-NEXT: s_endpgm
1294 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1295 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
1296 %val = load i32, i32 addrspace(1)* %in.gep
1297 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
; A count different from the bit width (32) is kept; otherwise -1 is stored.
1298 %cmp = icmp ne i32 %cttz, 32
1299 %sel = select i1 %cmp, i32 %cttz, i32 -1
1300 store i32 %sel, i32 addrspace(1)* %out
; i8 variant of the zero-input-to-minus-one select: loads a byte from
; valptr[tid] and stores (val == 0) ? -1 : cttz(val) as a byte to %out.
; The SelectionDAG amdgcn checks expect just v_ffbl_b32 on the ubyte-loaded
; value; the GlobalISel checks OR in 0x100 before v_ffbl_b32 and keep an explicit
; compare and v_cndmask_b32.
1304 define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
1305 ; SI-LABEL: v_cttz_i8_sel_eq_neg1:
1307 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
1308 ; SI-NEXT: s_mov_b32 s3, 0xf000
1309 ; SI-NEXT: v_mov_b32_e32 v1, 0
1310 ; SI-NEXT: s_mov_b32 s6, 0
1311 ; SI-NEXT: s_mov_b32 s7, s3
1312 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1313 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1314 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1315 ; SI-NEXT: s_mov_b32 s2, -1
1316 ; SI-NEXT: s_waitcnt vmcnt(0)
1317 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1318 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1319 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
1322 ; VI-LABEL: v_cttz_i8_sel_eq_neg1:
1324 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1325 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1326 ; VI-NEXT: s_mov_b32 s7, 0xf000
1327 ; VI-NEXT: s_mov_b32 s6, -1
1328 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1329 ; VI-NEXT: v_mov_b32_e32 v1, s1
1330 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1331 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1332 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1333 ; VI-NEXT: s_waitcnt vmcnt(0)
1334 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
1335 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1338 ; EG-LABEL: v_cttz_i8_sel_eq_neg1:
1340 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1342 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1343 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1346 ; EG-NEXT: Fetch clause starting at 6:
1347 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1348 ; EG-NEXT: ALU clause starting at 8:
1349 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1350 ; EG-NEXT: ALU clause starting at 9:
1351 ; EG-NEXT: FFBL_INT T0.W, T0.X,
1352 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1353 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1354 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1355 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1356 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1357 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1358 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1359 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1360 ; EG-NEXT: MOV T0.Y, 0.0,
1361 ; EG-NEXT: MOV * T0.Z, 0.0,
1362 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1363 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1365 ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
1367 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1368 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1369 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1370 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1371 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
1372 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1373 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
1374 ; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
1375 ; GFX10-NEXT: s_endpgm
1377 ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
1378 ; GFX10-GISEL: ; %bb.0:
1379 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1380 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
1381 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1382 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1383 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
1384 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3
1385 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
1386 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1387 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
1388 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
1389 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1390 ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v0
1391 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
1392 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
1393 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s2
1394 ; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1]
1395 ; GFX10-GISEL-NEXT: s_endpgm
1396 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1397 %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
1398 %val = load i8, i8 addrspace(1)* %valptr.gep
1399 %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
; A zero byte selects -1 in place of the count.
1400 %cmp = icmp eq i8 %val, 0
1401 %sel = select i1 %cmp, i8 -1, i8 %cttz
1402 store i8 %sel, i8 addrspace(1)* %out
; i16 variant of the zero-input-to-minus-one select. It loads directly from
; %valptr (no workitem-id gep) and stores (val == 0) ? -1 : cttz(val) as a
; short to %out.
; Among the amdgcn runs, only the default-cpu checks reduce to a bare
; v_ffbl_b32; the tonga and gfx1010 checks OR in 0x10000 ahead of v_ffbl_b32 and
; keep a compare plus v_cndmask_b32.
1406 define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
1407 ; SI-LABEL: v_cttz_i16_sel_eq_neg1:
1409 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
1410 ; SI-NEXT: s_mov_b32 s3, 0xf000
1411 ; SI-NEXT: s_mov_b32 s2, -1
1412 ; SI-NEXT: s_mov_b32 s6, s2
1413 ; SI-NEXT: s_mov_b32 s7, s3
1414 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1415 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
1416 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1417 ; SI-NEXT: s_waitcnt vmcnt(0)
1418 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1419 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1420 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1423 ; VI-LABEL: v_cttz_i16_sel_eq_neg1:
1425 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1426 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1427 ; VI-NEXT: s_mov_b32 s7, 0xf000
1428 ; VI-NEXT: s_mov_b32 s6, -1
1429 ; VI-NEXT: s_mov_b32 s2, s6
1430 ; VI-NEXT: s_mov_b32 s3, s7
1431 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1432 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
1433 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff
1434 ; VI-NEXT: s_waitcnt vmcnt(0)
1435 ; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
1436 ; VI-NEXT: v_ffbl_b32_e32 v2, v2
1437 ; VI-NEXT: v_min_u32_e32 v2, 32, v2
1438 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
1439 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
1440 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
1443 ; EG-LABEL: v_cttz_i16_sel_eq_neg1:
1445 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1447 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1448 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1451 ; EG-NEXT: Fetch clause starting at 6:
1452 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1453 ; EG-NEXT: ALU clause starting at 8:
1454 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1455 ; EG-NEXT: ALU clause starting at 9:
1456 ; EG-NEXT: FFBL_INT T0.W, T0.X,
1457 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1458 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1459 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1460 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1461 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1462 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1463 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1464 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1465 ; EG-NEXT: MOV T0.Y, 0.0,
1466 ; EG-NEXT: MOV * T0.Z, 0.0,
1467 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1468 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1470 ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
1472 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1473 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1474 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1475 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1476 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
1477 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1478 ; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1
1479 ; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
1480 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
1481 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
1482 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
1483 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
1484 ; GFX10-NEXT: s_endpgm
1486 ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
1487 ; GFX10-GISEL: ; %bb.0:
1488 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1489 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
1490 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1491 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1492 ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
1493 ; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
1494 ; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff
1495 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1496 ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
1497 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
1498 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
1499 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2
1500 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo
1501 ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
1502 ; GFX10-GISEL-NEXT: s_endpgm
; No workitem-id indexing here: the load address is %valptr itself.
1503 %val = load i16, i16 addrspace(1)* %valptr
1504 %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
1505 %cmp = icmp eq i16 %val, 0
1506 %sel = select i1 %cmp, i16 -1, i16 %cttz
1507 store i16 %sel, i16 addrspace(1)* %out
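1511 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; NOTE(review): the function directly below indexes %valptr by the workitem id;
; the load without a gep is in v_cttz_i16_sel_eq_neg1. Confirm which test this
; FIXME is meant to cover.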
; i7 variant (a non-power-of-two width) of the zero-input-to-minus-one select:
; loads valptr[tid] and stores (val == 0) ? -1 : cttz(val) to %out.
; The SelectionDAG amdgcn checks expect v_ffbl_b32 followed by a mask with 0x7f;
; the GlobalISel checks OR in 0x80 first and keep a compare plus a
; v_cndmask_b32 that selects 0x7f.
1512 define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
1513 ; SI-LABEL: v_cttz_i7_sel_eq_neg1:
1515 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
1516 ; SI-NEXT: s_mov_b32 s3, 0xf000
1517 ; SI-NEXT: v_mov_b32_e32 v1, 0
1518 ; SI-NEXT: s_mov_b32 s6, 0
1519 ; SI-NEXT: s_mov_b32 s7, s3
1520 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1521 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
1522 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1523 ; SI-NEXT: s_mov_b32 s2, -1
1524 ; SI-NEXT: s_waitcnt vmcnt(0)
1525 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1526 ; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
1527 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1528 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
1531 ; VI-LABEL: v_cttz_i7_sel_eq_neg1:
1533 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1534 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1535 ; VI-NEXT: s_mov_b32 s7, 0xf000
1536 ; VI-NEXT: s_mov_b32 s6, -1
1537 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1538 ; VI-NEXT: v_mov_b32_e32 v1, s1
1539 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1540 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1541 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1542 ; VI-NEXT: s_waitcnt vmcnt(0)
1543 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
1544 ; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
1545 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1548 ; EG-LABEL: v_cttz_i7_sel_eq_neg1:
1550 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1552 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1553 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1556 ; EG-NEXT: Fetch clause starting at 6:
1557 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1558 ; EG-NEXT: ALU clause starting at 8:
1559 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
1560 ; EG-NEXT: ALU clause starting at 9:
1561 ; EG-NEXT: FFBL_INT T0.W, T0.X,
1562 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1563 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1564 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1565 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1566 ; EG-NEXT: 127(1.779649e-43), 3(4.203895e-45)
1567 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1568 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1569 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1570 ; EG-NEXT: MOV T0.Y, 0.0,
1571 ; EG-NEXT: MOV * T0.Z, 0.0,
1572 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1573 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1575 ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
1577 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1578 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1579 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1580 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1581 ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
1582 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1583 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
1584 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0
1585 ; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
1586 ; GFX10-NEXT: s_endpgm
1588 ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
1589 ; GFX10-GISEL: ; %bb.0:
1590 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1591 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
1592 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1593 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1594 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
1595 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3
1596 ; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f
1597 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
1598 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1599 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
1600 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
1601 ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x80, v0
1602 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0
1603 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
1604 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1605 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
1606 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
1607 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0
1608 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
1609 ; GFX10-GISEL-NEXT: s_endpgm
1610 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1611 %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
1612 %val = load i7, i7 addrspace(1)* %valptr.gep
1613 %cttz = call i7 @llvm.cttz.i7(i7 %val, i1 false) nounwind readnone
; A zero i7 input selects -1 (all seven bits set) in place of the count.
1614 %cmp = icmp eq i7 %val, 0
1615 %sel = select i1 %cmp, i7 -1, i7 %cttz
1616 store i7 %sel, i7 addrspace(1)* %out