1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
5 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
; Declarations of the intrinsics used by the test functions: llvm.cttz at
; several scalar and vector widths (value operand plus an i1 flag operand) and
; the AMDGPU workitem-id query used to index per-work-item loads.
; NOTE(review): @llvm.cttz.i7 is not called in the portion of the file
; reviewed here -- confirm it is used further down before removing it.
7 declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
8 declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
9 declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
10 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
11 declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
12 declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
13 declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
14 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Scalar case: count trailing zeros of an i32 kernel argument.
; The intrinsic is called with `i1 true`, i.e. a zero input yields a poison
; result (see llvm.cttz in the LLVM LangRef), so no zero-input fixup appears in
; the expected code.  The assertions below expect a single s_ff1_i32_b32 for
; the amdgcn runs and FFBL_INT for the r600 run.
16 define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
17 ; SI-LABEL: s_cttz_zero_undef_i32:
19 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
20 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
21 ; SI-NEXT: s_mov_b32 s3, 0xf000
22 ; SI-NEXT: s_waitcnt lgkmcnt(0)
23 ; SI-NEXT: s_ff1_i32_b32 s4, s2
24 ; SI-NEXT: s_mov_b32 s2, -1
25 ; SI-NEXT: v_mov_b32_e32 v0, s4
26 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
29 ; VI-LABEL: s_cttz_zero_undef_i32:
31 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
32 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
33 ; VI-NEXT: s_waitcnt lgkmcnt(0)
34 ; VI-NEXT: s_ff1_i32_b32 s2, s2
35 ; VI-NEXT: v_mov_b32_e32 v0, s0
36 ; VI-NEXT: v_mov_b32_e32 v1, s1
37 ; VI-NEXT: v_mov_b32_e32 v2, s2
38 ; VI-NEXT: flat_store_dword v[0:1], v2
41 ; EG-LABEL: s_cttz_zero_undef_i32:
43 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
44 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
47 ; EG-NEXT: ALU clause starting at 4:
48 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
49 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
50 ; EG-NEXT: FFBL_INT * T1.X, KC0[2].Z,
52 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32:
53 ; GFX9-GISEL: ; %bb.0:
54 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
55 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
56 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
57 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
58 ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4
59 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
60 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
61 ; GFX9-GISEL-NEXT: s_endpgm
; %val comes straight from the kernel arguments; the count is stored to %out.
62 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
63 store i32 %cttz, ptr addrspace(1) %out, align 4
; Per-work-item case: each work-item loads an i32 from %valptr indexed by its
; workitem id and counts trailing zeros with `i1 true` (zero input is poison
; per the LLVM LangRef).  The assertions expect v_ffbl_b32 for the amdgcn runs
; and FFBL_INT for the r600 run, with no zero-input fixup.
67 define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
68 ; SI-LABEL: v_cttz_zero_undef_i32:
70 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
71 ; SI-NEXT: s_mov_b32 s7, 0xf000
72 ; SI-NEXT: s_mov_b32 s10, 0
73 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
74 ; SI-NEXT: v_mov_b32_e32 v1, 0
75 ; SI-NEXT: s_mov_b32 s11, s7
76 ; SI-NEXT: s_waitcnt lgkmcnt(0)
77 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
78 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
79 ; SI-NEXT: s_mov_b32 s6, -1
80 ; SI-NEXT: s_mov_b32 s4, s0
81 ; SI-NEXT: s_mov_b32 s5, s1
82 ; SI-NEXT: s_waitcnt vmcnt(0)
83 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
84 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
87 ; VI-LABEL: v_cttz_zero_undef_i32:
89 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
90 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
91 ; VI-NEXT: s_waitcnt lgkmcnt(0)
92 ; VI-NEXT: v_mov_b32_e32 v1, s3
93 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
94 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
95 ; VI-NEXT: flat_load_dword v0, v[0:1]
96 ; VI-NEXT: s_waitcnt vmcnt(0)
97 ; VI-NEXT: v_ffbl_b32_e32 v2, v0
98 ; VI-NEXT: v_mov_b32_e32 v0, s0
99 ; VI-NEXT: v_mov_b32_e32 v1, s1
100 ; VI-NEXT: flat_store_dword v[0:1], v2
103 ; EG-LABEL: v_cttz_zero_undef_i32:
105 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
107 ; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
108 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
111 ; EG-NEXT: Fetch clause starting at 6:
112 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
113 ; EG-NEXT: ALU clause starting at 8:
114 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
115 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
116 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
117 ; EG-NEXT: ALU clause starting at 11:
118 ; EG-NEXT: FFBL_INT T0.X, T0.X,
119 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
120 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
122 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32:
123 ; GFX9-GISEL: ; %bb.0:
124 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
125 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
126 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
127 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
128 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
129 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
130 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
131 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
132 ; GFX9-GISEL-NEXT: s_endpgm
; The input address depends on the workitem id, so the value is per-work-item.
133 %tid = call i32 @llvm.amdgcn.workitem.id.x()
134 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
135 %val = load i32, ptr addrspace(1) %in.gep, align 4
136 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
; Note: the result is stored to %out itself, not to a tid-indexed slot.
137 store i32 %cttz, ptr addrspace(1) %out, align 4
; <2 x i32> variant of the per-work-item test: a two-element vector is loaded
; from %valptr indexed by the workitem id and passed to llvm.cttz.v2i32 with
; `i1 true`.  The assertions expect the operation to be done per element
; (two v_ffbl_b32 for the amdgcn runs, two FFBL_INT for the r600 run).
141 define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
142 ; SI-LABEL: v_cttz_zero_undef_v2i32:
144 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
145 ; SI-NEXT: s_mov_b32 s7, 0xf000
146 ; SI-NEXT: s_mov_b32 s10, 0
147 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
148 ; SI-NEXT: v_mov_b32_e32 v1, 0
149 ; SI-NEXT: s_mov_b32 s11, s7
150 ; SI-NEXT: s_waitcnt lgkmcnt(0)
151 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
152 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
153 ; SI-NEXT: s_mov_b32 s6, -1
154 ; SI-NEXT: s_mov_b32 s4, s0
155 ; SI-NEXT: s_mov_b32 s5, s1
156 ; SI-NEXT: s_waitcnt vmcnt(0)
157 ; SI-NEXT: v_ffbl_b32_e32 v1, v1
158 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
159 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
162 ; VI-LABEL: v_cttz_zero_undef_v2i32:
164 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
165 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
166 ; VI-NEXT: s_waitcnt lgkmcnt(0)
167 ; VI-NEXT: v_mov_b32_e32 v1, s3
168 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
169 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
170 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
171 ; VI-NEXT: v_mov_b32_e32 v3, s1
172 ; VI-NEXT: v_mov_b32_e32 v2, s0
173 ; VI-NEXT: s_waitcnt vmcnt(0)
174 ; VI-NEXT: v_ffbl_b32_e32 v1, v1
175 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
176 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
179 ; EG-LABEL: v_cttz_zero_undef_v2i32:
181 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
183 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
184 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
187 ; EG-NEXT: Fetch clause starting at 6:
188 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
189 ; EG-NEXT: ALU clause starting at 8:
190 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
191 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
192 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
193 ; EG-NEXT: ALU clause starting at 11:
194 ; EG-NEXT: FFBL_INT * T0.Y, T0.Y,
195 ; EG-NEXT: FFBL_INT T0.X, T0.X,
196 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
197 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
199 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32:
200 ; GFX9-GISEL: ; %bb.0:
201 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
202 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
203 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
204 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
205 ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
206 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
207 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
208 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
209 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
210 ; GFX9-GISEL-NEXT: s_endpgm
; Load one <2 x i32> per work-item, then store the element-wise counts to %out.
211 %tid = call i32 @llvm.amdgcn.workitem.id.x()
212 %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
213 %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
214 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
215 store <2 x i32> %cttz, ptr addrspace(1) %out, align 8
; <4 x i32> variant of the per-work-item test: a four-element vector is loaded
; from %valptr indexed by the workitem id and passed to llvm.cttz.v4i32 with
; `i1 true`.  The assertions expect the operation to be done per element
; (four v_ffbl_b32 for the amdgcn runs, four FFBL_INT for the r600 run).
219 define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
220 ; SI-LABEL: v_cttz_zero_undef_v4i32:
222 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
223 ; SI-NEXT: s_mov_b32 s7, 0xf000
224 ; SI-NEXT: s_mov_b32 s10, 0
225 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
226 ; SI-NEXT: v_mov_b32_e32 v1, 0
227 ; SI-NEXT: s_mov_b32 s11, s7
228 ; SI-NEXT: s_waitcnt lgkmcnt(0)
229 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
230 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
231 ; SI-NEXT: s_mov_b32 s6, -1
232 ; SI-NEXT: s_mov_b32 s4, s0
233 ; SI-NEXT: s_mov_b32 s5, s1
234 ; SI-NEXT: s_waitcnt vmcnt(0)
235 ; SI-NEXT: v_ffbl_b32_e32 v3, v3
236 ; SI-NEXT: v_ffbl_b32_e32 v2, v2
237 ; SI-NEXT: v_ffbl_b32_e32 v1, v1
238 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
239 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
242 ; VI-LABEL: v_cttz_zero_undef_v4i32:
244 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
245 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
246 ; VI-NEXT: s_waitcnt lgkmcnt(0)
247 ; VI-NEXT: v_mov_b32_e32 v1, s3
248 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
249 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
250 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
251 ; VI-NEXT: v_mov_b32_e32 v5, s1
252 ; VI-NEXT: v_mov_b32_e32 v4, s0
253 ; VI-NEXT: s_waitcnt vmcnt(0)
254 ; VI-NEXT: v_ffbl_b32_e32 v3, v3
255 ; VI-NEXT: v_ffbl_b32_e32 v2, v2
256 ; VI-NEXT: v_ffbl_b32_e32 v1, v1
257 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
258 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
261 ; EG-LABEL: v_cttz_zero_undef_v4i32:
263 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
265 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
266 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
269 ; EG-NEXT: Fetch clause starting at 6:
270 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
271 ; EG-NEXT: ALU clause starting at 8:
272 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
273 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
274 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
275 ; EG-NEXT: ALU clause starting at 11:
276 ; EG-NEXT: FFBL_INT * T0.W, T0.W,
277 ; EG-NEXT: FFBL_INT * T0.Z, T0.Z,
278 ; EG-NEXT: FFBL_INT * T0.Y, T0.Y,
279 ; EG-NEXT: FFBL_INT T0.X, T0.X,
280 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
281 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
283 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32:
284 ; GFX9-GISEL: ; %bb.0:
285 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
286 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
287 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0
288 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
289 ; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
290 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
291 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
292 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
293 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
294 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3
295 ; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
296 ; GFX9-GISEL-NEXT: s_endpgm
; Load one <4 x i32> per work-item, then store the element-wise counts to %out.
297 %tid = call i32 @llvm.amdgcn.workitem.id.x()
298 %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
299 %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
300 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
301 store <4 x i32> %cttz, ptr addrspace(1) %out, align 16
; i8 kernel-argument case that also builds a compare + select around the
; zero-is-poison cttz result.
; NOTE(review): %ret (the select) is computed but never used -- the store
; below writes %cttz.  Consistent with that, the expected code contains no
; compare or select.  If the intent was to store %ret, both the IR and the
; autogenerated assertions need to be updated together; confirm first.
305 define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind {
306 ; SI-LABEL: s_cttz_zero_undef_i8_with_select:
308 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
309 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
310 ; SI-NEXT: s_mov_b32 s3, 0xf000
311 ; SI-NEXT: s_waitcnt lgkmcnt(0)
312 ; SI-NEXT: s_ff1_i32_b32 s4, s2
313 ; SI-NEXT: s_mov_b32 s2, -1
314 ; SI-NEXT: v_mov_b32_e32 v0, s4
315 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
318 ; VI-LABEL: s_cttz_zero_undef_i8_with_select:
320 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
321 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
322 ; VI-NEXT: s_waitcnt lgkmcnt(0)
323 ; VI-NEXT: s_ff1_i32_b32 s2, s2
324 ; VI-NEXT: v_mov_b32_e32 v0, s0
325 ; VI-NEXT: v_mov_b32_e32 v1, s1
326 ; VI-NEXT: v_mov_b32_e32 v2, s2
327 ; VI-NEXT: flat_store_byte v[0:1], v2
330 ; EG-LABEL: s_cttz_zero_undef_i8_with_select:
332 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
334 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
335 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
338 ; EG-NEXT: Fetch clause starting at 6:
339 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
340 ; EG-NEXT: ALU clause starting at 8:
341 ; EG-NEXT: MOV * T0.X, 0.0,
342 ; EG-NEXT: ALU clause starting at 9:
343 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
344 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
345 ; EG-NEXT: FFBL_INT T0.W, PV.W,
346 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
347 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
348 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
349 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
350 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
351 ; EG-NEXT: LSHL T0.X, PV.W, PS,
352 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
353 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
354 ; EG-NEXT: MOV T0.Y, 0.0,
355 ; EG-NEXT: MOV * T0.Z, 0.0,
356 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
357 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
359 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select:
360 ; GFX9-GISEL: ; %bb.0:
361 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
362 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
363 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
364 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
365 ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4
366 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
367 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
368 ; GFX9-GISEL-NEXT: s_endpgm
369 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
; The select picks %cttz for a non-zero input and the constant 32 otherwise
; (32 is not the i8 bit width -- presumably copied from the i32 case).
370 %cttz_ret = icmp ne i8 %val, 0
371 %ret = select i1 %cttz_ret, i8 %cttz, i8 32
; Stores the raw intrinsic result, not %ret (see the review note at the top).
372 store i8 %cttz, ptr addrspace(1) %out, align 4
; i16 kernel-argument case that also builds a compare + select around the
; zero-is-poison cttz result.
; NOTE(review): %ret (the select) is computed but never used -- the store
; below writes %cttz.  Consistent with that, the expected code contains no
; compare or select.  If the intent was to store %ret, both the IR and the
; autogenerated assertions need to be updated together; confirm first.
376 define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind {
377 ; SI-LABEL: s_cttz_zero_undef_i16_with_select:
379 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
380 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
381 ; SI-NEXT: s_mov_b32 s3, 0xf000
382 ; SI-NEXT: s_waitcnt lgkmcnt(0)
383 ; SI-NEXT: s_ff1_i32_b32 s4, s2
384 ; SI-NEXT: s_mov_b32 s2, -1
385 ; SI-NEXT: v_mov_b32_e32 v0, s4
386 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
389 ; VI-LABEL: s_cttz_zero_undef_i16_with_select:
391 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
392 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
393 ; VI-NEXT: s_waitcnt lgkmcnt(0)
394 ; VI-NEXT: s_ff1_i32_b32 s2, s2
395 ; VI-NEXT: v_mov_b32_e32 v0, s0
396 ; VI-NEXT: v_mov_b32_e32 v1, s1
397 ; VI-NEXT: v_mov_b32_e32 v2, s2
398 ; VI-NEXT: flat_store_short v[0:1], v2
401 ; EG-LABEL: s_cttz_zero_undef_i16_with_select:
403 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
405 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
406 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
409 ; EG-NEXT: Fetch clause starting at 6:
410 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
411 ; EG-NEXT: ALU clause starting at 8:
412 ; EG-NEXT: MOV * T0.X, 0.0,
413 ; EG-NEXT: ALU clause starting at 9:
414 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
415 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
416 ; EG-NEXT: FFBL_INT T0.W, PV.W,
417 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
418 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
419 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
420 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
421 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
422 ; EG-NEXT: LSHL T0.X, PV.W, PS,
423 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
424 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
425 ; EG-NEXT: MOV T0.Y, 0.0,
426 ; EG-NEXT: MOV * T0.Z, 0.0,
427 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
428 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
430 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select:
431 ; GFX9-GISEL: ; %bb.0:
432 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
433 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
434 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
435 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
436 ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4
437 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
438 ; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3]
439 ; GFX9-GISEL-NEXT: s_endpgm
440 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
; The select picks %cttz for a non-zero input and the constant 32 otherwise
; (32 is not the i16 bit width -- presumably copied from the i32 case).
441 %cttz_ret = icmp ne i16 %val, 0
442 %ret = select i1 %cttz_ret, i16 %cttz, i16 32
; Stores the raw intrinsic result, not %ret (see the review note at the top).
443 store i16 %cttz, ptr addrspace(1) %out, align 4
; i32 kernel-argument case that also builds a compare + select around the
; zero-is-poison cttz result.
; NOTE(review): %ret (the select) is computed but never used -- the store
; below writes %cttz.  Consistent with that, the expected code contains no
; compare or select, only s_ff1_i32_b32 / FFBL_INT.  If the intent was to
; store %ret, both the IR and the autogenerated assertions need to be updated
; together; confirm first.
447 define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind {
448 ; SI-LABEL: s_cttz_zero_undef_i32_with_select:
450 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
451 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
452 ; SI-NEXT: s_mov_b32 s3, 0xf000
453 ; SI-NEXT: s_waitcnt lgkmcnt(0)
454 ; SI-NEXT: s_ff1_i32_b32 s4, s2
455 ; SI-NEXT: s_mov_b32 s2, -1
456 ; SI-NEXT: v_mov_b32_e32 v0, s4
457 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
460 ; VI-LABEL: s_cttz_zero_undef_i32_with_select:
462 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
463 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
464 ; VI-NEXT: s_waitcnt lgkmcnt(0)
465 ; VI-NEXT: s_ff1_i32_b32 s2, s2
466 ; VI-NEXT: v_mov_b32_e32 v0, s0
467 ; VI-NEXT: v_mov_b32_e32 v1, s1
468 ; VI-NEXT: v_mov_b32_e32 v2, s2
469 ; VI-NEXT: flat_store_dword v[0:1], v2
472 ; EG-LABEL: s_cttz_zero_undef_i32_with_select:
474 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
475 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
478 ; EG-NEXT: ALU clause starting at 4:
479 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
480 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
481 ; EG-NEXT: FFBL_INT * T1.X, KC0[2].Z,
483 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select:
484 ; GFX9-GISEL: ; %bb.0:
485 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
486 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
487 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
488 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
489 ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4
490 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
491 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
492 ; GFX9-GISEL-NEXT: s_endpgm
493 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
; The select picks %cttz for a non-zero input and 32 (the i32 bit width)
; otherwise.
494 %cttz_ret = icmp ne i32 %val, 0
495 %ret = select i1 %cttz_ret, i32 %cttz, i32 32
; Stores the raw intrinsic result, not %ret (see the review note at the top).
496 store i32 %cttz, ptr addrspace(1) %out, align 4
; i64 kernel-argument case that also builds a compare + select around the
; zero-is-poison cttz result.
; The SI and VI assertions expect the 64-bit count to be built from two
; s_ff1_i32_b32 results (high half + 32, combined with s_min_u32); the
; GlobalISel run expects a single s_ff1_i32_b64.
; NOTE(review): %ret (the select) is computed but never used -- the store
; below writes %cttz.  If the intent was to store %ret, both the IR and the
; autogenerated assertions need to be updated together; confirm first.
500 define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind {
501 ; SI-LABEL: s_cttz_zero_undef_i64_with_select:
503 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
504 ; SI-NEXT: s_mov_b32 s7, 0xf000
505 ; SI-NEXT: s_mov_b32 s6, -1
506 ; SI-NEXT: s_waitcnt lgkmcnt(0)
507 ; SI-NEXT: s_ff1_i32_b32 s3, s3
508 ; SI-NEXT: s_ff1_i32_b32 s2, s2
509 ; SI-NEXT: s_add_i32 s3, s3, 32
510 ; SI-NEXT: s_min_u32 s2, s2, s3
511 ; SI-NEXT: v_mov_b32_e32 v1, 0
512 ; SI-NEXT: s_mov_b32 s4, s0
513 ; SI-NEXT: s_mov_b32 s5, s1
514 ; SI-NEXT: v_mov_b32_e32 v0, s2
515 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
518 ; VI-LABEL: s_cttz_zero_undef_i64_with_select:
520 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
521 ; VI-NEXT: v_mov_b32_e32 v1, 0
522 ; VI-NEXT: s_waitcnt lgkmcnt(0)
523 ; VI-NEXT: s_ff1_i32_b32 s3, s3
524 ; VI-NEXT: s_ff1_i32_b32 s2, s2
525 ; VI-NEXT: s_add_i32 s3, s3, 32
526 ; VI-NEXT: s_min_u32 s2, s2, s3
527 ; VI-NEXT: v_mov_b32_e32 v3, s1
528 ; VI-NEXT: v_mov_b32_e32 v0, s2
529 ; VI-NEXT: v_mov_b32_e32 v2, s0
530 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
533 ; EG-LABEL: s_cttz_zero_undef_i64_with_select:
535 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
536 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
539 ; EG-NEXT: ALU clause starting at 4:
540 ; EG-NEXT: FFBL_INT * T0.W, KC0[3].X,
541 ; EG-NEXT: FFBL_INT T1.W, KC0[2].W,
542 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
543 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
544 ; EG-NEXT: CNDE_INT T0.X, KC0[2].W, PS, PV.W,
545 ; EG-NEXT: MOV T0.Y, 0.0,
546 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
547 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
549 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select:
550 ; GFX9-GISEL: ; %bb.0:
551 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
552 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0
553 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
554 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
555 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3]
556 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
557 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
558 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
559 ; GFX9-GISEL-NEXT: s_endpgm
560 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
; The select picks %cttz for a non-zero input and the constant 32 otherwise
; (32 is not the i64 bit width -- presumably copied from the i32 case).
561 %cttz_ret = icmp ne i64 %val, 0
562 %ret = select i1 %cttz_ret, i64 %cttz, i64 32
; Stores the raw intrinsic result, not %ret (see the review note at the top).
563 store i64 %cttz, ptr addrspace(1) %out, align 4
; i8 loaded-from-memory case where the zero-is-poison cttz result is guarded by
; a select: %ret is %cttz for a non-zero input and 32 otherwise, and %ret is
; what gets stored.  The assertions therefore keep a compare + conditional
; select against 32 after the find-first-bit instruction on every run line.
; NOTE(review): the fallback 32 is not the i8 bit width -- presumably
; deliberate for this test; confirm before "fixing" it.
567 define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
568 ; SI-LABEL: v_cttz_zero_undef_i8_with_select:
570 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
571 ; SI-NEXT: s_mov_b32 s7, 0xf000
572 ; SI-NEXT: s_mov_b32 s6, -1
573 ; SI-NEXT: s_mov_b32 s10, s6
574 ; SI-NEXT: s_mov_b32 s11, s7
575 ; SI-NEXT: s_waitcnt lgkmcnt(0)
576 ; SI-NEXT: s_mov_b32 s8, s2
577 ; SI-NEXT: s_mov_b32 s9, s3
578 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
579 ; SI-NEXT: s_mov_b32 s4, s0
580 ; SI-NEXT: s_mov_b32 s5, s1
581 ; SI-NEXT: s_waitcnt vmcnt(0)
582 ; SI-NEXT: v_ffbl_b32_e32 v1, v0
583 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
584 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
585 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
588 ; VI-LABEL: v_cttz_zero_undef_i8_with_select:
590 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
591 ; VI-NEXT: s_waitcnt lgkmcnt(0)
592 ; VI-NEXT: v_mov_b32_e32 v0, s2
593 ; VI-NEXT: v_mov_b32_e32 v1, s3
594 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
595 ; VI-NEXT: s_waitcnt vmcnt(0)
596 ; VI-NEXT: v_ffbl_b32_e32 v1, v0
597 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
598 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
599 ; VI-NEXT: v_mov_b32_e32 v0, s0
600 ; VI-NEXT: v_mov_b32_e32 v1, s1
601 ; VI-NEXT: flat_store_byte v[0:1], v2
604 ; EG-LABEL: v_cttz_zero_undef_i8_with_select:
606 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
608 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
609 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
612 ; EG-NEXT: Fetch clause starting at 6:
613 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
614 ; EG-NEXT: ALU clause starting at 8:
615 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
616 ; EG-NEXT: ALU clause starting at 9:
617 ; EG-NEXT: FFBL_INT T0.W, T0.X,
618 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
619 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
620 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
621 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
622 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
623 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
624 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
625 ; EG-NEXT: LSHL T0.X, PV.W, PS,
626 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
627 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
628 ; EG-NEXT: MOV T0.Y, 0.0,
629 ; EG-NEXT: MOV * T0.Z, 0.0,
630 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
631 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
633 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select:
634 ; GFX9-GISEL: ; %bb.0:
635 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
636 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
637 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
638 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
639 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
640 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
641 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
642 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
643 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
644 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
645 ; GFX9-GISEL-NEXT: s_endpgm
; The value is read from memory through %arrayidx rather than passed directly.
646 %val = load i8, ptr addrspace(1) %arrayidx, align 1
647 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
648 %cttz_ret = icmp ne i8 %val, 0
649 %ret = select i1 %cttz_ret, i8 %cttz, i8 32
; Stores the selected value, so the zero-input guard is observable.
650 store i8 %ret, ptr addrspace(1) %out, align 4
; i16 loaded-from-memory case where the zero-is-poison cttz result is guarded
; by a select: %ret is %cttz for a non-zero input and 32 otherwise, and %ret
; is what gets stored.  The load is only 1-byte aligned; the amdgcn assertions
; expect the value to be assembled from two byte loads, while the r600
; assertions expect a single VTX_READ_16.  Each run line keeps a compare +
; conditional select against 32.
; NOTE(review): the fallback 32 is not the i16 bit width -- presumably
; deliberate for this test; confirm before "fixing" it.
654 define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
655 ; SI-LABEL: v_cttz_zero_undef_i16_with_select:
657 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
658 ; SI-NEXT: s_mov_b32 s7, 0xf000
659 ; SI-NEXT: s_mov_b32 s6, -1
660 ; SI-NEXT: s_mov_b32 s10, s6
661 ; SI-NEXT: s_mov_b32 s11, s7
662 ; SI-NEXT: s_waitcnt lgkmcnt(0)
663 ; SI-NEXT: s_mov_b32 s8, s2
664 ; SI-NEXT: s_mov_b32 s9, s3
665 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
666 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
667 ; SI-NEXT: s_mov_b32 s4, s0
668 ; SI-NEXT: s_mov_b32 s5, s1
669 ; SI-NEXT: s_waitcnt vmcnt(1)
670 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
671 ; SI-NEXT: s_waitcnt vmcnt(0)
672 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
673 ; SI-NEXT: v_ffbl_b32_e32 v1, v0
674 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
675 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
676 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
679 ; VI-LABEL: v_cttz_zero_undef_i16_with_select:
681 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
682 ; VI-NEXT: s_waitcnt lgkmcnt(0)
683 ; VI-NEXT: s_add_u32 s4, s2, 1
684 ; VI-NEXT: s_addc_u32 s5, s3, 0
685 ; VI-NEXT: v_mov_b32_e32 v2, s4
686 ; VI-NEXT: v_mov_b32_e32 v0, s2
687 ; VI-NEXT: v_mov_b32_e32 v3, s5
688 ; VI-NEXT: v_mov_b32_e32 v1, s3
689 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
690 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
691 ; VI-NEXT: s_waitcnt vmcnt(1)
692 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2
693 ; VI-NEXT: s_waitcnt vmcnt(0)
694 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
695 ; VI-NEXT: v_ffbl_b32_e32 v1, v0
696 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
697 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
698 ; VI-NEXT: v_mov_b32_e32 v0, s0
699 ; VI-NEXT: v_mov_b32_e32 v1, s1
700 ; VI-NEXT: flat_store_short v[0:1], v2
703 ; EG-LABEL: v_cttz_zero_undef_i16_with_select:
705 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
707 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
708 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
711 ; EG-NEXT: Fetch clause starting at 6:
712 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
713 ; EG-NEXT: ALU clause starting at 8:
714 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
715 ; EG-NEXT: ALU clause starting at 9:
716 ; EG-NEXT: FFBL_INT T0.W, T0.X,
717 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
718 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
719 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
720 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
721 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
722 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
723 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
724 ; EG-NEXT: LSHL T0.X, PV.W, PS,
725 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
726 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
727 ; EG-NEXT: MOV T0.Y, 0.0,
728 ; EG-NEXT: MOV * T0.Z, 0.0,
729 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
730 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
732 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select:
733 ; GFX9-GISEL: ; %bb.0:
734 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
735 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
736 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
737 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
738 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
739 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
740 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
741 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
742 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
743 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
744 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
745 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
746 ; GFX9-GISEL-NEXT: s_endpgm
; `align 1` on a 2-byte load: the under-aligned access is part of what is tested.
747 %val = load i16, ptr addrspace(1) %arrayidx, align 1
748 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
749 %cttz_ret = icmp ne i16 %val, 0
750 %ret = select i1 %cttz_ret, i16 %cttz, i16 32
; Stores the selected value, so the zero-input guard is observable.
751 store i16 %ret, ptr addrspace(1) %out, align 4
; i32 loaded-from-memory case where the zero-is-poison cttz result is guarded
; by a select: %ret is %cttz for a non-zero input and 32 (the i32 bit width)
; otherwise, and %ret is what gets stored.  The load is only 1-byte aligned;
; the amdgcn assertions expect the value to be assembled from four byte
; loads, the r600 assertions from two VTX_READ_16.
; Expected handling of the guard differs per run line: the SI and VI
; assertions expect v_ffbl_b32 followed by v_min_u32 with 32, the r600
; assertions expect CNDE_INT, and the GlobalISel assertions expect an explicit
; v_cmp_ne_u32 + v_cndmask_b32.
755 define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
756 ; SI-LABEL: v_cttz_zero_undef_i32_with_select:
758 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
759 ; SI-NEXT: s_mov_b32 s7, 0xf000
760 ; SI-NEXT: s_mov_b32 s6, -1
761 ; SI-NEXT: s_mov_b32 s10, s6
762 ; SI-NEXT: s_mov_b32 s11, s7
763 ; SI-NEXT: s_waitcnt lgkmcnt(0)
764 ; SI-NEXT: s_mov_b32 s8, s2
765 ; SI-NEXT: s_mov_b32 s9, s3
766 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
767 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3
768 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
769 ; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2
770 ; SI-NEXT: s_mov_b32 s4, s0
771 ; SI-NEXT: s_mov_b32 s5, s1
772 ; SI-NEXT: s_waitcnt vmcnt(3)
773 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
774 ; SI-NEXT: s_waitcnt vmcnt(2)
775 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
776 ; SI-NEXT: s_waitcnt vmcnt(1)
777 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
778 ; SI-NEXT: s_waitcnt vmcnt(0)
779 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
780 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
781 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
782 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
783 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
784 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
787 ; VI-LABEL: v_cttz_zero_undef_i32_with_select:
789 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
790 ; VI-NEXT: s_waitcnt lgkmcnt(0)
791 ; VI-NEXT: s_add_u32 s4, s2, 3
792 ; VI-NEXT: s_addc_u32 s5, s3, 0
793 ; VI-NEXT: v_mov_b32_e32 v2, s4
794 ; VI-NEXT: v_mov_b32_e32 v3, s5
795 ; VI-NEXT: s_add_u32 s4, s2, 2
796 ; VI-NEXT: v_mov_b32_e32 v0, s2
797 ; VI-NEXT: s_addc_u32 s5, s3, 0
798 ; VI-NEXT: v_mov_b32_e32 v1, s3
799 ; VI-NEXT: s_add_u32 s2, s2, 1
800 ; VI-NEXT: s_addc_u32 s3, s3, 0
801 ; VI-NEXT: v_mov_b32_e32 v4, s4
802 ; VI-NEXT: v_mov_b32_e32 v7, s3
803 ; VI-NEXT: v_mov_b32_e32 v5, s5
804 ; VI-NEXT: v_mov_b32_e32 v6, s2
805 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
806 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
807 ; VI-NEXT: flat_load_ubyte v4, v[6:7]
808 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
809 ; VI-NEXT: s_waitcnt vmcnt(3)
810 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
811 ; VI-NEXT: s_waitcnt vmcnt(2)
812 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
813 ; VI-NEXT: s_waitcnt vmcnt(1)
814 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
815 ; VI-NEXT: s_waitcnt vmcnt(0)
816 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
817 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
818 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
819 ; VI-NEXT: v_min_u32_e32 v2, 32, v0
820 ; VI-NEXT: v_mov_b32_e32 v0, s0
821 ; VI-NEXT: v_mov_b32_e32 v1, s1
822 ; VI-NEXT: flat_store_dword v[0:1], v2
825 ; EG-LABEL: v_cttz_zero_undef_i32_with_select:
827 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
829 ; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
830 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
833 ; EG-NEXT: Fetch clause starting at 6:
834 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
835 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
836 ; EG-NEXT: ALU clause starting at 10:
837 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
838 ; EG-NEXT: ALU clause starting at 11:
839 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
840 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
841 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
842 ; EG-NEXT: FFBL_INT * T1.W, PV.W,
843 ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
844 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
845 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
847 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select:
848 ; GFX9-GISEL: ; %bb.0:
849 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
850 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
851 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
852 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
853 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
854 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
855 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
856 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
857 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
858 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
859 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
860 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
861 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
862 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
863 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
864 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
865 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
866 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
867 ; GFX9-GISEL-NEXT: s_endpgm
; `align 1` on a 4-byte load: the under-aligned access is part of what is tested.
868 %val = load i32, ptr addrspace(1) %arrayidx, align 1
869 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
870 %cttz_ret = icmp ne i32 %val, 0
871 %ret = select i1 %cttz_ret, i32 %cttz, i32 32
; Stores the selected value, so the zero-input guard is observable.
872 store i32 %ret, ptr addrspace(1) %out, align 4
; Loads an i64 through %arrayidx with byte alignment, takes llvm.cttz.i64 with
; its i1 argument set to true, and stores 64 instead of the cttz result when
; the loaded value is zero.
; The SI and VI assertions contain no compare/select for the zero case: the
; two 32-bit v_ffbl_b32 results are combined by v_min3_u32 with the constant 64.
; The EG assertions use CNDE_INT and the GFX9-GISEL assertions use an explicit
; v_cmp_ne_u64 plus v_cndmask_b32.
876 define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
877 ; SI-LABEL: v_cttz_zero_undef_i64_with_select:
879 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
880 ; SI-NEXT: s_mov_b32 s3, 0xf000
881 ; SI-NEXT: s_mov_b32 s2, -1
882 ; SI-NEXT: s_mov_b32 s10, s2
883 ; SI-NEXT: s_mov_b32 s11, s3
884 ; SI-NEXT: s_waitcnt lgkmcnt(0)
885 ; SI-NEXT: s_mov_b32 s0, s4
886 ; SI-NEXT: s_mov_b32 s1, s5
887 ; SI-NEXT: s_mov_b32 s8, s6
888 ; SI-NEXT: s_mov_b32 s9, s7
889 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
890 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1
891 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
892 ; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3
893 ; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:4
894 ; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:5
895 ; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:6
896 ; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:7
897 ; SI-NEXT: s_waitcnt vmcnt(6)
898 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
899 ; SI-NEXT: s_waitcnt vmcnt(4)
900 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
901 ; SI-NEXT: s_waitcnt vmcnt(2)
902 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
903 ; SI-NEXT: s_waitcnt vmcnt(0)
904 ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
905 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
906 ; SI-NEXT: v_or_b32_e32 v1, v3, v2
907 ; SI-NEXT: v_or_b32_e32 v2, v5, v4
908 ; SI-NEXT: v_or_b32_e32 v3, v7, v6
909 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
910 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
911 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
912 ; SI-NEXT: v_or_b32_e32 v1, v3, v2
913 ; SI-NEXT: v_ffbl_b32_e32 v1, v1
914 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
915 ; SI-NEXT: v_min_u32_e32 v1, 0xffffffdf, v1
916 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1
917 ; SI-NEXT: v_min3_u32 v0, v0, v1, 64
918 ; SI-NEXT: v_mov_b32_e32 v1, 0
919 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
922 ; VI-LABEL: v_cttz_zero_undef_i64_with_select:
924 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
925 ; VI-NEXT: s_waitcnt lgkmcnt(0)
926 ; VI-NEXT: s_add_u32 s4, s2, 5
927 ; VI-NEXT: s_addc_u32 s5, s3, 0
928 ; VI-NEXT: v_mov_b32_e32 v0, s4
929 ; VI-NEXT: v_mov_b32_e32 v1, s5
930 ; VI-NEXT: s_add_u32 s4, s2, 4
931 ; VI-NEXT: s_addc_u32 s5, s3, 0
932 ; VI-NEXT: v_mov_b32_e32 v2, s4
933 ; VI-NEXT: v_mov_b32_e32 v3, s5
934 ; VI-NEXT: s_add_u32 s4, s2, 7
935 ; VI-NEXT: s_addc_u32 s5, s3, 0
936 ; VI-NEXT: v_mov_b32_e32 v4, s4
937 ; VI-NEXT: v_mov_b32_e32 v5, s5
938 ; VI-NEXT: s_add_u32 s4, s2, 6
939 ; VI-NEXT: s_addc_u32 s5, s3, 0
940 ; VI-NEXT: v_mov_b32_e32 v7, s5
941 ; VI-NEXT: v_mov_b32_e32 v6, s4
942 ; VI-NEXT: s_add_u32 s4, s2, 3
943 ; VI-NEXT: s_addc_u32 s5, s3, 0
944 ; VI-NEXT: v_mov_b32_e32 v9, s5
945 ; VI-NEXT: v_mov_b32_e32 v8, s4
946 ; VI-NEXT: s_add_u32 s4, s2, 2
947 ; VI-NEXT: s_addc_u32 s5, s3, 0
948 ; VI-NEXT: v_mov_b32_e32 v11, s5
949 ; VI-NEXT: v_mov_b32_e32 v10, s4
950 ; VI-NEXT: flat_load_ubyte v12, v[0:1]
951 ; VI-NEXT: flat_load_ubyte v13, v[2:3]
952 ; VI-NEXT: flat_load_ubyte v4, v[4:5]
953 ; VI-NEXT: flat_load_ubyte v5, v[6:7]
954 ; VI-NEXT: s_add_u32 s4, s2, 1
955 ; VI-NEXT: flat_load_ubyte v6, v[8:9]
956 ; VI-NEXT: s_addc_u32 s5, s3, 0
957 ; VI-NEXT: v_mov_b32_e32 v0, s4
958 ; VI-NEXT: v_mov_b32_e32 v2, s2
959 ; VI-NEXT: v_mov_b32_e32 v1, s5
960 ; VI-NEXT: v_mov_b32_e32 v3, s3
961 ; VI-NEXT: flat_load_ubyte v7, v[10:11]
962 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
963 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
964 ; VI-NEXT: v_mov_b32_e32 v1, 0
965 ; VI-NEXT: s_waitcnt vmcnt(7)
966 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12
967 ; VI-NEXT: s_waitcnt vmcnt(6)
968 ; VI-NEXT: v_or_b32_e32 v3, v3, v13
969 ; VI-NEXT: s_waitcnt vmcnt(5)
970 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
971 ; VI-NEXT: s_waitcnt vmcnt(4)
972 ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
973 ; VI-NEXT: v_or_b32_e32 v3, v4, v3
974 ; VI-NEXT: s_waitcnt vmcnt(3)
975 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
976 ; VI-NEXT: v_ffbl_b32_e32 v3, v3
977 ; VI-NEXT: v_add_u32_e64 v3, s[2:3], v3, 32 clamp
978 ; VI-NEXT: s_waitcnt vmcnt(2)
979 ; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
980 ; VI-NEXT: s_waitcnt vmcnt(1)
981 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
982 ; VI-NEXT: s_waitcnt vmcnt(0)
983 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
984 ; VI-NEXT: v_or_b32_e32 v0, v4, v0
985 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
986 ; VI-NEXT: v_min3_u32 v0, v0, v3, 64
987 ; VI-NEXT: v_mov_b32_e32 v3, s1
988 ; VI-NEXT: v_mov_b32_e32 v2, s0
989 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
992 ; EG-LABEL: v_cttz_zero_undef_i64_with_select:
994 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
996 ; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
997 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1000 ; EG-NEXT: Fetch clause starting at 6:
1001 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 6, #1
1002 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
1003 ; EG-NEXT: VTX_READ_16 T3.X, T0.X, 2, #1
1004 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
1005 ; EG-NEXT: ALU clause starting at 14:
1006 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1007 ; EG-NEXT: ALU clause starting at 15:
1008 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1009 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1010 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
1011 ; EG-NEXT: FFBL_INT T1.W, PV.W,
1012 ; EG-NEXT: LSHL * T2.W, T3.X, literal.x,
1013 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1014 ; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W,
1015 ; EG-NEXT: OR_INT * T1.W, PS, T2.X,
1016 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1017 ; EG-NEXT: FFBL_INT T2.W, PS,
1018 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
1019 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1020 ; EG-NEXT: CNDE_INT T0.X, T1.W, PS, PV.W,
1021 ; EG-NEXT: MOV T0.Y, 0.0,
1022 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1023 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1025 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select:
1026 ; GFX9-GISEL: ; %bb.0:
1027 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1028 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1029 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1030 ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3]
1031 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1
1032 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2
1033 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
1034 ; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
1035 ; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5
1036 ; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6
1037 ; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7
1038 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6)
1039 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0
1040 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5)
1041 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1042 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4)
1043 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
1044 ; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0
1045 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
1046 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5
1047 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
1048 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v7
1049 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1050 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v8, 24, v5
1051 ; GFX9-GISEL-NEXT: v_or3_b32 v3, v0, v4, 0
1052 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v4, v3
1053 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v2
1054 ; GFX9-GISEL-NEXT: v_add_u32_e32 v4, 32, v4
1055 ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
1056 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4
1057 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
1058 ; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
1059 ; GFX9-GISEL-NEXT: s_endpgm
1060 %val = load i64, ptr addrspace(1) %arrayidx, align 1
1061 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
1062 %cttz_ret = icmp ne i64 %val, 0
1063 %ret = select i1 %cttz_ret, i64 %cttz, i64 64
1064 store i64 %ret, ptr addrspace(1) %out, align 4
; Loads an i32 through %arrayidx with byte alignment, takes llvm.cttz.i32 with
; its i1 argument set to false, and stores -1 instead of the cttz result when
; the loaded value compares equal to zero.
; The SI and VI assertions contain only v_ffbl_b32 with no compare or select,
; while the EG and GFX9-GISEL assertions still select the -1 explicitly.
; Only the SSA value names in the IR body are hand-written; the assertion
; lines are autogenerated and do not depend on those names.
1068 define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
1069 ; SI-LABEL: v_cttz_i32_sel_eq_neg1:
1071 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1072 ; SI-NEXT: s_mov_b32 s7, 0xf000
1073 ; SI-NEXT: s_mov_b32 s6, -1
1074 ; SI-NEXT: s_mov_b32 s10, s6
1075 ; SI-NEXT: s_mov_b32 s11, s7
1076 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1077 ; SI-NEXT: s_mov_b32 s8, s2
1078 ; SI-NEXT: s_mov_b32 s9, s3
1079 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1080 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3
1081 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
1082 ; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2
1083 ; SI-NEXT: s_mov_b32 s4, s0
1084 ; SI-NEXT: s_mov_b32 s5, s1
1085 ; SI-NEXT: s_waitcnt vmcnt(3)
1086 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1087 ; SI-NEXT: s_waitcnt vmcnt(2)
1088 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1089 ; SI-NEXT: s_waitcnt vmcnt(1)
1090 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
1091 ; SI-NEXT: s_waitcnt vmcnt(0)
1092 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
1093 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1094 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1095 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1096 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1099 ; VI-LABEL: v_cttz_i32_sel_eq_neg1:
1101 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1102 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1103 ; VI-NEXT: s_add_u32 s4, s2, 3
1104 ; VI-NEXT: s_addc_u32 s5, s3, 0
1105 ; VI-NEXT: v_mov_b32_e32 v2, s4
1106 ; VI-NEXT: v_mov_b32_e32 v3, s5
1107 ; VI-NEXT: s_add_u32 s4, s2, 2
1108 ; VI-NEXT: v_mov_b32_e32 v0, s2
1109 ; VI-NEXT: s_addc_u32 s5, s3, 0
1110 ; VI-NEXT: v_mov_b32_e32 v1, s3
1111 ; VI-NEXT: s_add_u32 s2, s2, 1
1112 ; VI-NEXT: s_addc_u32 s3, s3, 0
1113 ; VI-NEXT: v_mov_b32_e32 v4, s4
1114 ; VI-NEXT: v_mov_b32_e32 v7, s3
1115 ; VI-NEXT: v_mov_b32_e32 v5, s5
1116 ; VI-NEXT: v_mov_b32_e32 v6, s2
1117 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1118 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
1119 ; VI-NEXT: flat_load_ubyte v4, v[6:7]
1120 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1121 ; VI-NEXT: s_waitcnt vmcnt(3)
1122 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1123 ; VI-NEXT: s_waitcnt vmcnt(2)
1124 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1125 ; VI-NEXT: s_waitcnt vmcnt(1)
1126 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
1127 ; VI-NEXT: s_waitcnt vmcnt(0)
1128 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
1129 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1130 ; VI-NEXT: v_ffbl_b32_e32 v2, v0
1131 ; VI-NEXT: v_mov_b32_e32 v0, s0
1132 ; VI-NEXT: v_mov_b32_e32 v1, s1
1133 ; VI-NEXT: flat_store_dword v[0:1], v2
1136 ; EG-LABEL: v_cttz_i32_sel_eq_neg1:
1138 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1140 ; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
1141 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1144 ; EG-NEXT: Fetch clause starting at 6:
1145 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
1146 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1147 ; EG-NEXT: ALU clause starting at 10:
1148 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1149 ; EG-NEXT: ALU clause starting at 11:
1150 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1151 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1152 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
1153 ; EG-NEXT: FFBL_INT * T1.W, PV.W,
1154 ; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W,
1155 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1156 ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
1157 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1158 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1160 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
1161 ; GFX9-GISEL: ; %bb.0:
1162 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1163 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1164 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1165 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
1166 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
1167 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
1168 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
1169 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
1170 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
1171 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
1172 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
1173 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1174 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
1175 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
1176 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
1177 ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
1178 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
1179 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
1180 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
1181 ; GFX9-GISEL-NEXT: s_endpgm
1182 %val = load i32, ptr addrspace(1) %arrayidx, align 1
1183 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1184 %cmp = icmp eq i32 %val, 0
1185 %sel = select i1 %cmp, i32 -1, i32 %cttz
1186 store i32 %sel, ptr addrspace(1) %out
; Same computation as a cttz-or-minus-one select, written with the inverted
; predicate: the loaded i32 is compared "ne 0" and the select operands are
; swapped, so the cttz result (i1 argument false) is stored for a non-zero
; input and -1 otherwise.
; The SI and VI assertions contain only v_ffbl_b32 with no compare or select,
; while the EG and GFX9-GISEL assertions still select the -1 explicitly.
1190 define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
1191 ; SI-LABEL: v_cttz_i32_sel_ne_neg1:
1193 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1194 ; SI-NEXT: s_mov_b32 s7, 0xf000
1195 ; SI-NEXT: s_mov_b32 s6, -1
1196 ; SI-NEXT: s_mov_b32 s10, s6
1197 ; SI-NEXT: s_mov_b32 s11, s7
1198 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1199 ; SI-NEXT: s_mov_b32 s8, s2
1200 ; SI-NEXT: s_mov_b32 s9, s3
1201 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1202 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3
1203 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
1204 ; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2
1205 ; SI-NEXT: s_mov_b32 s4, s0
1206 ; SI-NEXT: s_mov_b32 s5, s1
1207 ; SI-NEXT: s_waitcnt vmcnt(3)
1208 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1209 ; SI-NEXT: s_waitcnt vmcnt(2)
1210 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1211 ; SI-NEXT: s_waitcnt vmcnt(1)
1212 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
1213 ; SI-NEXT: s_waitcnt vmcnt(0)
1214 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
1215 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1216 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1217 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1218 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1221 ; VI-LABEL: v_cttz_i32_sel_ne_neg1:
1223 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1224 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1225 ; VI-NEXT: s_add_u32 s4, s2, 3
1226 ; VI-NEXT: s_addc_u32 s5, s3, 0
1227 ; VI-NEXT: v_mov_b32_e32 v2, s4
1228 ; VI-NEXT: v_mov_b32_e32 v3, s5
1229 ; VI-NEXT: s_add_u32 s4, s2, 2
1230 ; VI-NEXT: v_mov_b32_e32 v0, s2
1231 ; VI-NEXT: s_addc_u32 s5, s3, 0
1232 ; VI-NEXT: v_mov_b32_e32 v1, s3
1233 ; VI-NEXT: s_add_u32 s2, s2, 1
1234 ; VI-NEXT: s_addc_u32 s3, s3, 0
1235 ; VI-NEXT: v_mov_b32_e32 v4, s4
1236 ; VI-NEXT: v_mov_b32_e32 v7, s3
1237 ; VI-NEXT: v_mov_b32_e32 v5, s5
1238 ; VI-NEXT: v_mov_b32_e32 v6, s2
1239 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1240 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
1241 ; VI-NEXT: flat_load_ubyte v4, v[6:7]
1242 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1243 ; VI-NEXT: s_waitcnt vmcnt(3)
1244 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1245 ; VI-NEXT: s_waitcnt vmcnt(2)
1246 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1247 ; VI-NEXT: s_waitcnt vmcnt(1)
1248 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
1249 ; VI-NEXT: s_waitcnt vmcnt(0)
1250 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
1251 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1252 ; VI-NEXT: v_ffbl_b32_e32 v2, v0
1253 ; VI-NEXT: v_mov_b32_e32 v0, s0
1254 ; VI-NEXT: v_mov_b32_e32 v1, s1
1255 ; VI-NEXT: flat_store_dword v[0:1], v2
1258 ; EG-LABEL: v_cttz_i32_sel_ne_neg1:
1260 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1262 ; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
1263 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1266 ; EG-NEXT: Fetch clause starting at 6:
1267 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
1268 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1269 ; EG-NEXT: ALU clause starting at 10:
1270 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1271 ; EG-NEXT: ALU clause starting at 11:
1272 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1273 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1274 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
1275 ; EG-NEXT: FFBL_INT * T1.W, PV.W,
1276 ; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W,
1277 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1278 ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
1279 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1280 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1282 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
1283 ; GFX9-GISEL: ; %bb.0:
1284 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1285 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1286 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1287 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
1288 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
1289 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
1290 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
1291 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
1292 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
1293 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
1294 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
1295 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1296 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
1297 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
1298 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
1299 ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
1300 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
1301 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
1302 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
1303 ; GFX9-GISEL-NEXT: s_endpgm
1304 %val = load i32, ptr addrspace(1) %arrayidx, align 1
1305 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1306 %cmp = icmp ne i32 %val, 0
1307 %sel = select i1 %cmp, i32 %cttz, i32 -1
1308 store i32 %sel, ptr addrspace(1) %out
; Variant where the select condition tests the cttz RESULT rather than the
; input: the i32 cttz (i1 argument false) is compared "ne 32" and -1 is stored
; when the result equals 32.
; Unlike the tests that compare the input against zero, every set of
; assertions here keeps the clamp to 32 followed by an explicit compare against
; 32 and a select (v_min_u32 / v_cmp_ne_u32 / v_cndmask_b32 on SI, VI and
; GFX9-GISEL; CNDE_INT / SETNE_INT / CNDE_INT on EG).
1312 define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
1313 ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1315 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1316 ; SI-NEXT: s_mov_b32 s7, 0xf000
1317 ; SI-NEXT: s_mov_b32 s6, -1
1318 ; SI-NEXT: s_mov_b32 s10, s6
1319 ; SI-NEXT: s_mov_b32 s11, s7
1320 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1321 ; SI-NEXT: s_mov_b32 s8, s2
1322 ; SI-NEXT: s_mov_b32 s9, s3
1323 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1324 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3
1325 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
1326 ; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2
1327 ; SI-NEXT: s_mov_b32 s4, s0
1328 ; SI-NEXT: s_mov_b32 s5, s1
1329 ; SI-NEXT: s_waitcnt vmcnt(3)
1330 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1331 ; SI-NEXT: s_waitcnt vmcnt(2)
1332 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1333 ; SI-NEXT: s_waitcnt vmcnt(1)
1334 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
1335 ; SI-NEXT: s_waitcnt vmcnt(0)
1336 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
1337 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1338 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1339 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1340 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
1341 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1342 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1343 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1346 ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1348 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1349 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1350 ; VI-NEXT: s_add_u32 s4, s2, 3
1351 ; VI-NEXT: s_addc_u32 s5, s3, 0
1352 ; VI-NEXT: v_mov_b32_e32 v2, s4
1353 ; VI-NEXT: v_mov_b32_e32 v3, s5
1354 ; VI-NEXT: s_add_u32 s4, s2, 2
1355 ; VI-NEXT: v_mov_b32_e32 v0, s2
1356 ; VI-NEXT: s_addc_u32 s5, s3, 0
1357 ; VI-NEXT: v_mov_b32_e32 v1, s3
1358 ; VI-NEXT: s_add_u32 s2, s2, 1
1359 ; VI-NEXT: s_addc_u32 s3, s3, 0
1360 ; VI-NEXT: v_mov_b32_e32 v4, s4
1361 ; VI-NEXT: v_mov_b32_e32 v7, s3
1362 ; VI-NEXT: v_mov_b32_e32 v5, s5
1363 ; VI-NEXT: v_mov_b32_e32 v6, s2
1364 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1365 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
1366 ; VI-NEXT: flat_load_ubyte v4, v[6:7]
1367 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1368 ; VI-NEXT: s_waitcnt vmcnt(3)
1369 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1370 ; VI-NEXT: s_waitcnt vmcnt(2)
1371 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1372 ; VI-NEXT: s_waitcnt vmcnt(1)
1373 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
1374 ; VI-NEXT: s_waitcnt vmcnt(0)
1375 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
1376 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1377 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
1378 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
1379 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1380 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
1381 ; VI-NEXT: v_mov_b32_e32 v0, s0
1382 ; VI-NEXT: v_mov_b32_e32 v1, s1
1383 ; VI-NEXT: flat_store_dword v[0:1], v2
1386 ; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
1388 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1390 ; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
1391 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1394 ; EG-NEXT: Fetch clause starting at 6:
1395 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
1396 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1397 ; EG-NEXT: ALU clause starting at 10:
1398 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1399 ; EG-NEXT: ALU clause starting at 11:
1400 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1401 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1402 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
1403 ; EG-NEXT: FFBL_INT * T1.W, PV.W,
1404 ; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W,
1405 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1406 ; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x,
1407 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1408 ; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W,
1409 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1410 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1412 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
1413 ; GFX9-GISEL: ; %bb.0:
1414 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1415 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1416 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1417 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
1418 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
1419 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
1420 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
1421 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
1422 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
1423 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
1424 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
1425 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1426 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
1427 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
1428 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
1429 ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
1430 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 32, v1
1431 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc
1432 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
1433 ; GFX9-GISEL-NEXT: s_endpgm
1434 %val = load i32, ptr addrspace(1) %arrayidx, align 1
1435 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1436 %cmp = icmp ne i32 %cttz, 32
1437 %sel = select i1 %cmp, i32 %cttz, i32 -1
1438 store i32 %sel, ptr addrspace(1) %out
; i8 version of the cttz-or-minus-one select: loads one byte, takes
; llvm.cttz.i8 with its i1 argument set to false, and stores -1 (as an i8)
; when the loaded byte compares equal to zero.
; The SI assertions contain a bare v_ffbl_b32 followed by a byte store. The VI
; and GFX9-GISEL assertions OR 0x100 into the value before v_ffbl_b32 and then
; select 0xff with an explicit compare against zero.
1442 define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
1443 ; SI-LABEL: v_cttz_i8_sel_eq_neg1:
1445 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1446 ; SI-NEXT: s_mov_b32 s7, 0xf000
1447 ; SI-NEXT: s_mov_b32 s6, -1
1448 ; SI-NEXT: s_mov_b32 s10, s6
1449 ; SI-NEXT: s_mov_b32 s11, s7
1450 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1451 ; SI-NEXT: s_mov_b32 s8, s2
1452 ; SI-NEXT: s_mov_b32 s9, s3
1453 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
1454 ; SI-NEXT: s_mov_b32 s4, s0
1455 ; SI-NEXT: s_mov_b32 s5, s1
1456 ; SI-NEXT: s_waitcnt vmcnt(0)
1457 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1458 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1461 ; VI-LABEL: v_cttz_i8_sel_eq_neg1:
1463 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1464 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1465 ; VI-NEXT: v_mov_b32_e32 v0, s2
1466 ; VI-NEXT: v_mov_b32_e32 v1, s3
1467 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1468 ; VI-NEXT: v_mov_b32_e32 v1, 0xff
1469 ; VI-NEXT: s_waitcnt vmcnt(0)
1470 ; VI-NEXT: v_or_b32_e32 v2, 0x100, v0
1471 ; VI-NEXT: v_ffbl_b32_e32 v2, v2
1472 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
1473 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
1474 ; VI-NEXT: v_mov_b32_e32 v0, s0
1475 ; VI-NEXT: v_mov_b32_e32 v1, s1
1476 ; VI-NEXT: flat_store_byte v[0:1], v2
1479 ; EG-LABEL: v_cttz_i8_sel_eq_neg1:
1481 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1483 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1484 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1487 ; EG-NEXT: Fetch clause starting at 6:
1488 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1489 ; EG-NEXT: ALU clause starting at 8:
1490 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1491 ; EG-NEXT: ALU clause starting at 9:
1492 ; EG-NEXT: FFBL_INT T0.W, T0.X,
1493 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1494 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1495 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1496 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1497 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1498 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1499 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1500 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1501 ; EG-NEXT: MOV T0.Y, 0.0,
1502 ; EG-NEXT: MOV * T0.Z, 0.0,
1503 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1504 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1506 ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
1507 ; GFX9-GISEL: ; %bb.0:
1508 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1509 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1510 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff
1511 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1512 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
1513 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1514 ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1
1515 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3
1516 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
1517 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
1518 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
1519 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
1520 ; GFX9-GISEL-NEXT: s_endpgm
1521 %val = load i8, ptr addrspace(1) %arrayidx, align 1
1522 %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
1523 %cmp = icmp eq i8 %val, 0
1524 %sel = select i1 %cmp, i8 -1, i8 %cttz
1525 store i8 %sel, ptr addrspace(1) %out
; i16 version of the cttz-or-minus-one select: loads an i16 with byte
; alignment, takes llvm.cttz.i16 with its i1 argument set to false, and stores
; -1 (as an i16) when the loaded value compares equal to zero.
; The SI assertions contain a bare v_ffbl_b32 followed by a short store. The VI
; and GFX9-GISEL assertions OR 0x10000 into the value before v_ffbl_b32 and
; then select 0xffff with an explicit compare against zero.
1529 define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
1530 ; SI-LABEL: v_cttz_i16_sel_eq_neg1:
1532 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1533 ; SI-NEXT: s_mov_b32 s7, 0xf000
1534 ; SI-NEXT: s_mov_b32 s6, -1
1535 ; SI-NEXT: s_mov_b32 s10, s6
1536 ; SI-NEXT: s_mov_b32 s11, s7
1537 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1538 ; SI-NEXT: s_mov_b32 s8, s2
1539 ; SI-NEXT: s_mov_b32 s9, s3
1540 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1541 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
1542 ; SI-NEXT: s_mov_b32 s4, s0
1543 ; SI-NEXT: s_mov_b32 s5, s1
1544 ; SI-NEXT: s_waitcnt vmcnt(1)
1545 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1546 ; SI-NEXT: s_waitcnt vmcnt(0)
1547 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
1548 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1549 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
1552 ; VI-LABEL: v_cttz_i16_sel_eq_neg1:
1554 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1555 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1556 ; VI-NEXT: s_add_u32 s4, s2, 1
1557 ; VI-NEXT: s_addc_u32 s5, s3, 0
1558 ; VI-NEXT: v_mov_b32_e32 v2, s4
1559 ; VI-NEXT: v_mov_b32_e32 v0, s2
1560 ; VI-NEXT: v_mov_b32_e32 v3, s5
1561 ; VI-NEXT: v_mov_b32_e32 v1, s3
1562 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1563 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1564 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff
1565 ; VI-NEXT: s_waitcnt vmcnt(1)
1566 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
1567 ; VI-NEXT: s_waitcnt vmcnt(0)
1568 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
1569 ; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
1570 ; VI-NEXT: v_ffbl_b32_e32 v2, v2
1571 ; VI-NEXT: v_min_u32_e32 v2, 32, v2
1572 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
1573 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
1574 ; VI-NEXT: v_mov_b32_e32 v0, s0
1575 ; VI-NEXT: v_mov_b32_e32 v1, s1
1576 ; VI-NEXT: flat_store_short v[0:1], v2
1579 ; EG-LABEL: v_cttz_i16_sel_eq_neg1:
1581 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1583 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1584 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1587 ; EG-NEXT: Fetch clause starting at 6:
1588 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1589 ; EG-NEXT: ALU clause starting at 8:
1590 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1591 ; EG-NEXT: ALU clause starting at 9:
1592 ; EG-NEXT: FFBL_INT T0.W, T0.X,
1593 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1594 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1595 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1596 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1597 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1598 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1599 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1600 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1601 ; EG-NEXT: MOV T0.Y, 0.0,
1602 ; EG-NEXT: MOV * T0.Z, 0.0,
1603 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1604 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1606 ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
1607 ; GFX9-GISEL: ; %bb.0:
1608 ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1609 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1610 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff
1611 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1612 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
1613 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
1614 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1615 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
1616 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
1617 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
1618 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
1619 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
1620 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
1621 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
1622 ; GFX9-GISEL-NEXT: s_endpgm
1623 %val = load i16, ptr addrspace(1) %arrayidx, align 1
1624 %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
1625 %cmp = icmp eq i16 %val, 0
1626 %sel = select i1 %cmp, i16 -1, i16 %cttz
1627 store i16 %sel, ptr addrspace(1) %out