1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
5 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
7 declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
8 declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
9 declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
10 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
11 declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
12 declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
13 declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
14 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Scalar i32 case: %val is a by-value kernel argument. It is passed to
; llvm.cttz.i32 with the second operand set to `i1 true`, and the result is
; stored to %out.
; The amdgcn run lines (SI, VI, GFX9-GISEL prefixes) each expect exactly one
; s_ff1_i32_b32; the r600 run line (EG prefix) expects one FFBL_INT.
16 define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
17 ; SI-LABEL: s_cttz_zero_undef_i32:
19 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
20 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
21 ; SI-NEXT: s_mov_b32 s3, 0xf000
22 ; SI-NEXT: s_waitcnt lgkmcnt(0)
23 ; SI-NEXT: s_ff1_i32_b32 s4, s2
24 ; SI-NEXT: s_mov_b32 s2, -1
25 ; SI-NEXT: v_mov_b32_e32 v0, s4
26 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
29 ; VI-LABEL: s_cttz_zero_undef_i32:
31 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
32 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
33 ; VI-NEXT: s_waitcnt lgkmcnt(0)
34 ; VI-NEXT: v_mov_b32_e32 v0, s2
35 ; VI-NEXT: s_ff1_i32_b32 s0, s0
36 ; VI-NEXT: v_mov_b32_e32 v1, s3
37 ; VI-NEXT: v_mov_b32_e32 v2, s0
38 ; VI-NEXT: flat_store_dword v[0:1], v2
41 ; EG-LABEL: s_cttz_zero_undef_i32:
43 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
44 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
47 ; EG-NEXT: ALU clause starting at 4:
48 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
49 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
50 ; EG-NEXT: FFBL_INT * T1.X, KC0[2].Z,
52 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32:
53 ; GFX9-GISEL: ; %bb.0:
54 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
55 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
56 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
57 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
58 ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4
59 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
60 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
61 ; GFX9-GISEL-NEXT: s_endpgm
62 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
63 store i32 %cttz, i32 addrspace(1)* %out, align 4
; Per-workitem i32 case: the input is loaded from %valptr at the index given
; by llvm.amdgcn.workitem.id.x, passed to llvm.cttz.i32 with `i1 true`, and
; the result is stored to %out (not indexed by the workitem id).
; The amdgcn run lines expect one v_ffbl_b32_e32 on the loaded value; the
; r600 run line expects one FFBL_INT.
67 define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
68 ; SI-LABEL: v_cttz_zero_undef_i32:
70 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
71 ; SI-NEXT: s_mov_b32 s3, 0xf000
72 ; SI-NEXT: s_mov_b32 s6, 0
73 ; SI-NEXT: s_mov_b32 s7, s3
74 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
75 ; SI-NEXT: v_mov_b32_e32 v1, 0
76 ; SI-NEXT: s_waitcnt lgkmcnt(0)
77 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
78 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
79 ; SI-NEXT: s_mov_b32 s2, -1
80 ; SI-NEXT: s_waitcnt vmcnt(0)
81 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
82 ; SI-NEXT: s_waitcnt lgkmcnt(0)
83 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
86 ; VI-LABEL: v_cttz_zero_undef_i32:
88 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
89 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
90 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
91 ; VI-NEXT: s_waitcnt lgkmcnt(0)
92 ; VI-NEXT: v_mov_b32_e32 v1, s1
93 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
94 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
95 ; VI-NEXT: flat_load_dword v0, v[0:1]
96 ; VI-NEXT: s_waitcnt vmcnt(0)
97 ; VI-NEXT: v_ffbl_b32_e32 v2, v0
98 ; VI-NEXT: v_mov_b32_e32 v0, s2
99 ; VI-NEXT: v_mov_b32_e32 v1, s3
100 ; VI-NEXT: flat_store_dword v[0:1], v2
103 ; EG-LABEL: v_cttz_zero_undef_i32:
105 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
107 ; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[]
108 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
111 ; EG-NEXT: Fetch clause starting at 6:
112 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
113 ; EG-NEXT: ALU clause starting at 8:
114 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
115 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
116 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
117 ; EG-NEXT: ALU clause starting at 11:
118 ; EG-NEXT: FFBL_INT T0.X, T0.X,
119 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
120 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
122 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32:
123 ; GFX9-GISEL: ; %bb.0:
124 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
125 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
126 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
127 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
128 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
129 ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[4:5]
130 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
131 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
132 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
133 ; GFX9-GISEL-NEXT: s_endpgm
134 %tid = call i32 @llvm.amdgcn.workitem.id.x()
135 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
136 %val = load i32, i32 addrspace(1)* %in.gep, align 4
137 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
138 store i32 %cttz, i32 addrspace(1)* %out, align 4
; Per-workitem <2 x i32> case: the vector is loaded from %valptr at the
; workitem-id index (align 8), passed to llvm.cttz.v2i32 with `i1 true`, and
; the result vector is stored to %out.
; The checks expect the operation to be applied per element: two
; v_ffbl_b32_e32 on the amdgcn run lines and two FFBL_INT on the r600 run line.
142 define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
143 ; SI-LABEL: v_cttz_zero_undef_v2i32:
145 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
146 ; SI-NEXT: s_mov_b32 s3, 0xf000
147 ; SI-NEXT: s_mov_b32 s6, 0
148 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
149 ; SI-NEXT: v_mov_b32_e32 v1, 0
150 ; SI-NEXT: s_mov_b32 s7, s3
151 ; SI-NEXT: s_waitcnt lgkmcnt(0)
152 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
153 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
154 ; SI-NEXT: s_mov_b32 s2, -1
155 ; SI-NEXT: s_waitcnt vmcnt(0)
156 ; SI-NEXT: v_ffbl_b32_e32 v1, v1
157 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
158 ; SI-NEXT: s_waitcnt lgkmcnt(0)
159 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
162 ; VI-LABEL: v_cttz_zero_undef_v2i32:
164 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
165 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
166 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
167 ; VI-NEXT: s_waitcnt lgkmcnt(0)
168 ; VI-NEXT: v_mov_b32_e32 v2, s2
169 ; VI-NEXT: v_mov_b32_e32 v1, s1
170 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
171 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
172 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
173 ; VI-NEXT: v_mov_b32_e32 v3, s3
174 ; VI-NEXT: s_waitcnt vmcnt(0)
175 ; VI-NEXT: v_ffbl_b32_e32 v1, v1
176 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
177 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
180 ; EG-LABEL: v_cttz_zero_undef_v2i32:
182 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
184 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
185 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
188 ; EG-NEXT: Fetch clause starting at 6:
189 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
190 ; EG-NEXT: ALU clause starting at 8:
191 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
192 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
193 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
194 ; EG-NEXT: ALU clause starting at 11:
195 ; EG-NEXT: FFBL_INT * T0.Y, T0.Y,
196 ; EG-NEXT: FFBL_INT T0.X, T0.X,
197 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
198 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
200 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32:
201 ; GFX9-GISEL: ; %bb.0:
202 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
203 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
204 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
205 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
206 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
207 ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[4:5]
208 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
209 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
210 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
211 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
212 ; GFX9-GISEL-NEXT: s_endpgm
213 %tid = call i32 @llvm.amdgcn.workitem.id.x()
214 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
215 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
216 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
217 store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
; Per-workitem <4 x i32> case: the vector is loaded from %valptr at the
; workitem-id index (align 16), passed to llvm.cttz.v4i32 with `i1 true`, and
; the result vector is stored to %out.
; The checks expect the operation to be applied per element: four
; v_ffbl_b32_e32 on the amdgcn run lines and four FFBL_INT on the r600 run
; line.
221 define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
222 ; SI-LABEL: v_cttz_zero_undef_v4i32:
224 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
225 ; SI-NEXT: s_mov_b32 s3, 0xf000
226 ; SI-NEXT: s_mov_b32 s6, 0
227 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
228 ; SI-NEXT: v_mov_b32_e32 v1, 0
229 ; SI-NEXT: s_mov_b32 s7, s3
230 ; SI-NEXT: s_waitcnt lgkmcnt(0)
231 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
232 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
233 ; SI-NEXT: s_mov_b32 s2, -1
234 ; SI-NEXT: s_waitcnt vmcnt(0)
235 ; SI-NEXT: v_ffbl_b32_e32 v3, v3
236 ; SI-NEXT: v_ffbl_b32_e32 v2, v2
237 ; SI-NEXT: v_ffbl_b32_e32 v1, v1
238 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
239 ; SI-NEXT: s_waitcnt lgkmcnt(0)
240 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
243 ; VI-LABEL: v_cttz_zero_undef_v4i32:
245 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
246 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
247 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
248 ; VI-NEXT: s_waitcnt lgkmcnt(0)
249 ; VI-NEXT: v_mov_b32_e32 v5, s3
250 ; VI-NEXT: v_mov_b32_e32 v1, s1
251 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
252 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
253 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
254 ; VI-NEXT: v_mov_b32_e32 v4, s2
255 ; VI-NEXT: s_waitcnt vmcnt(0)
256 ; VI-NEXT: v_ffbl_b32_e32 v3, v3
257 ; VI-NEXT: v_ffbl_b32_e32 v2, v2
258 ; VI-NEXT: v_ffbl_b32_e32 v1, v1
259 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
260 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
263 ; EG-LABEL: v_cttz_zero_undef_v4i32:
265 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
267 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
268 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
271 ; EG-NEXT: Fetch clause starting at 6:
272 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
273 ; EG-NEXT: ALU clause starting at 8:
274 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
275 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
276 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
277 ; EG-NEXT: ALU clause starting at 11:
278 ; EG-NEXT: FFBL_INT * T0.W, T0.W,
279 ; EG-NEXT: FFBL_INT * T0.Z, T0.Z,
280 ; EG-NEXT: FFBL_INT * T0.Y, T0.Y,
281 ; EG-NEXT: FFBL_INT T0.X, T0.X,
282 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
283 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
285 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32:
286 ; GFX9-GISEL: ; %bb.0:
287 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
288 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
289 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
290 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0
291 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
292 ; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5]
293 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
294 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
295 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
296 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
297 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3
298 ; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
299 ; GFX9-GISEL-NEXT: s_endpgm
300 %tid = call i32 @llvm.amdgcn.workitem.id.x()
301 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
302 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
303 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
304 store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
; Scalar i8 case with a zero-guarding select: %val is a by-value kernel
; argument, passed to llvm.cttz.i8 with `i1 true`; %ret selects %cttz when
; %val != 0 and the constant 32 otherwise.
; NOTE(review): the store below writes %cttz, not %ret, so the icmp/select
; have no users. The autogenerated checks agree with that: no compare or
; conditional-select instruction is expected on any run line. Confirm whether
; storing %ret was intended (the v_* i8 variant of this test stores %ret);
; changing it requires regenerating the checks.
308 define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 %val) nounwind {
309 ; SI-LABEL: s_cttz_zero_undef_i8_with_select:
311 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
312 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
313 ; SI-NEXT: s_mov_b32 s3, 0xf000
314 ; SI-NEXT: s_waitcnt lgkmcnt(0)
315 ; SI-NEXT: s_ff1_i32_b32 s4, s2
316 ; SI-NEXT: s_mov_b32 s2, -1
317 ; SI-NEXT: v_mov_b32_e32 v0, s4
318 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
321 ; VI-LABEL: s_cttz_zero_undef_i8_with_select:
323 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
324 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
325 ; VI-NEXT: s_waitcnt lgkmcnt(0)
326 ; VI-NEXT: v_mov_b32_e32 v0, s2
327 ; VI-NEXT: s_ff1_i32_b32 s0, s0
328 ; VI-NEXT: v_mov_b32_e32 v1, s3
329 ; VI-NEXT: v_mov_b32_e32 v2, s0
330 ; VI-NEXT: flat_store_byte v[0:1], v2
333 ; EG-LABEL: s_cttz_zero_undef_i8_with_select:
335 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
337 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
338 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
341 ; EG-NEXT: Fetch clause starting at 6:
342 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
343 ; EG-NEXT: ALU clause starting at 8:
344 ; EG-NEXT: MOV * T0.X, 0.0,
345 ; EG-NEXT: ALU clause starting at 9:
346 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
347 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
348 ; EG-NEXT: FFBL_INT T0.W, PV.W,
349 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
350 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
351 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
352 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
353 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
354 ; EG-NEXT: LSHL T0.X, PV.W, PS,
355 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
356 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
357 ; EG-NEXT: MOV T0.Y, 0.0,
358 ; EG-NEXT: MOV * T0.Z, 0.0,
359 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
360 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
362 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select:
363 ; GFX9-GISEL: ; %bb.0:
364 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
365 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
366 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
367 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
368 ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4
369 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
370 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
371 ; GFX9-GISEL-NEXT: s_endpgm
372 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
373 %cttz_ret = icmp ne i8 %val, 0
374 %ret = select i1 %cttz_ret, i8 %cttz, i8 32
375 store i8 %cttz, i8 addrspace(1)* %out, align 4
; Scalar i16 case with a zero-guarding select: %val is a by-value kernel
; argument, passed to llvm.cttz.i16 with `i1 true`; %ret selects %cttz when
; %val != 0 and the constant 32 otherwise.
; NOTE(review): the store below writes %cttz, not %ret, so the icmp/select
; have no users. The autogenerated checks agree with that: no compare or
; conditional-select instruction is expected on any run line. Confirm whether
; storing %ret was intended (the v_* i16 variant of this test stores %ret);
; changing it requires regenerating the checks.
379 define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
380 ; SI-LABEL: s_cttz_zero_undef_i16_with_select:
382 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
383 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
384 ; SI-NEXT: s_mov_b32 s3, 0xf000
385 ; SI-NEXT: s_waitcnt lgkmcnt(0)
386 ; SI-NEXT: s_ff1_i32_b32 s4, s2
387 ; SI-NEXT: s_mov_b32 s2, -1
388 ; SI-NEXT: v_mov_b32_e32 v0, s4
389 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
392 ; VI-LABEL: s_cttz_zero_undef_i16_with_select:
394 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
395 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
396 ; VI-NEXT: s_waitcnt lgkmcnt(0)
397 ; VI-NEXT: v_mov_b32_e32 v0, s2
398 ; VI-NEXT: s_ff1_i32_b32 s0, s0
399 ; VI-NEXT: v_mov_b32_e32 v1, s3
400 ; VI-NEXT: v_mov_b32_e32 v2, s0
401 ; VI-NEXT: flat_store_short v[0:1], v2
404 ; EG-LABEL: s_cttz_zero_undef_i16_with_select:
406 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
408 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
409 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
412 ; EG-NEXT: Fetch clause starting at 6:
413 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
414 ; EG-NEXT: ALU clause starting at 8:
415 ; EG-NEXT: MOV * T0.X, 0.0,
416 ; EG-NEXT: ALU clause starting at 9:
417 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
418 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
419 ; EG-NEXT: FFBL_INT T0.W, PV.W,
420 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
421 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
422 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
423 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
424 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
425 ; EG-NEXT: LSHL T0.X, PV.W, PS,
426 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
427 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
428 ; EG-NEXT: MOV T0.Y, 0.0,
429 ; EG-NEXT: MOV * T0.Z, 0.0,
430 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
431 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
433 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select:
434 ; GFX9-GISEL: ; %bb.0:
435 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
436 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
437 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
438 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
439 ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4
440 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
441 ; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3]
442 ; GFX9-GISEL-NEXT: s_endpgm
443 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
444 %cttz_ret = icmp ne i16 %val, 0
445 %ret = select i1 %cttz_ret, i16 %cttz, i16 32
446 store i16 %cttz, i16 addrspace(1)* %out, align 4
; Scalar i32 case with a zero-guarding select: %val is a by-value kernel
; argument, passed to llvm.cttz.i32 with `i1 true`; %ret selects %cttz when
; %val != 0 and the constant 32 otherwise.
; NOTE(review): the store below writes %cttz, not %ret, so the icmp/select
; have no users. The autogenerated checks agree with that: each run line
; expects only the bare s_ff1_i32_b32 / FFBL_INT, the same shape as the plain
; scalar i32 test. Confirm whether storing %ret was intended (the v_* i32
; variant of this test stores %ret); changing it requires regenerating the
; checks.
450 define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
451 ; SI-LABEL: s_cttz_zero_undef_i32_with_select:
453 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
454 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
455 ; SI-NEXT: s_mov_b32 s3, 0xf000
456 ; SI-NEXT: s_waitcnt lgkmcnt(0)
457 ; SI-NEXT: s_ff1_i32_b32 s4, s2
458 ; SI-NEXT: s_mov_b32 s2, -1
459 ; SI-NEXT: v_mov_b32_e32 v0, s4
460 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
463 ; VI-LABEL: s_cttz_zero_undef_i32_with_select:
465 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
466 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
467 ; VI-NEXT: s_waitcnt lgkmcnt(0)
468 ; VI-NEXT: v_mov_b32_e32 v0, s2
469 ; VI-NEXT: s_ff1_i32_b32 s0, s0
470 ; VI-NEXT: v_mov_b32_e32 v1, s3
471 ; VI-NEXT: v_mov_b32_e32 v2, s0
472 ; VI-NEXT: flat_store_dword v[0:1], v2
475 ; EG-LABEL: s_cttz_zero_undef_i32_with_select:
477 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
478 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
481 ; EG-NEXT: ALU clause starting at 4:
482 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
483 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
484 ; EG-NEXT: FFBL_INT * T1.X, KC0[2].Z,
486 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select:
487 ; GFX9-GISEL: ; %bb.0:
488 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
489 ; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
490 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
491 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
492 ; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4
493 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
494 ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
495 ; GFX9-GISEL-NEXT: s_endpgm
496 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
497 %cttz_ret = icmp ne i32 %val, 0
498 %ret = select i1 %cttz_ret, i32 %cttz, i32 32
499 store i32 %cttz, i32 addrspace(1)* %out, align 4
; Scalar i64 case with a zero-guarding select: %val is a by-value kernel
; argument, passed to llvm.cttz.i64 with `i1 true`; %ret selects %cttz when
; %val != 0 and the constant 32 otherwise.
; Expected lowering per run line:
;   - SI and VI prefixes: s_ff1_i32_b32 on each 32-bit half, 32 added to the
;     high-half result, then s_min_u32 of the two.
;   - EG prefix: FFBL_INT on each half, 32 added to one result, and a
;     CNDE_INT keyed on KC0[2].W choosing between them.
;   - GFX9-GISEL prefix: a single s_ff1_i32_b64 followed by s_bfe_u64.
; NOTE(review): the store below writes %cttz, not %ret, so the icmp/select on
; the full i64 value have no users. Confirm whether storing %ret was intended;
; changing it requires regenerating the checks.
503 define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
504 ; SI-LABEL: s_cttz_zero_undef_i64_with_select:
506 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
507 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
508 ; SI-NEXT: s_mov_b32 s3, 0xf000
509 ; SI-NEXT: s_mov_b32 s2, -1
510 ; SI-NEXT: s_waitcnt lgkmcnt(0)
511 ; SI-NEXT: s_ff1_i32_b32 s5, s5
512 ; SI-NEXT: s_ff1_i32_b32 s4, s4
513 ; SI-NEXT: s_add_i32 s5, s5, 32
514 ; SI-NEXT: s_min_u32 s4, s4, s5
515 ; SI-NEXT: v_mov_b32_e32 v1, 0
516 ; SI-NEXT: v_mov_b32_e32 v0, s4
517 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
520 ; VI-LABEL: s_cttz_zero_undef_i64_with_select:
522 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
523 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
524 ; VI-NEXT: v_mov_b32_e32 v1, 0
525 ; VI-NEXT: s_waitcnt lgkmcnt(0)
526 ; VI-NEXT: v_mov_b32_e32 v2, s2
527 ; VI-NEXT: s_ff1_i32_b32 s1, s1
528 ; VI-NEXT: s_ff1_i32_b32 s0, s0
529 ; VI-NEXT: s_add_i32 s1, s1, 32
530 ; VI-NEXT: s_min_u32 s0, s0, s1
531 ; VI-NEXT: v_mov_b32_e32 v0, s0
532 ; VI-NEXT: v_mov_b32_e32 v3, s3
533 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
536 ; EG-LABEL: s_cttz_zero_undef_i64_with_select:
538 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
539 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
542 ; EG-NEXT: ALU clause starting at 4:
543 ; EG-NEXT: FFBL_INT * T0.W, KC0[3].X,
544 ; EG-NEXT: FFBL_INT T1.W, KC0[2].W,
545 ; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
546 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
547 ; EG-NEXT: CNDE_INT T0.X, KC0[2].W, PS, PV.W,
548 ; EG-NEXT: MOV T0.Y, 0.0,
549 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
550 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
552 ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select:
553 ; GFX9-GISEL: ; %bb.0:
554 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
555 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
556 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
557 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
558 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s0, s[4:5]
559 ; GFX9-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000
560 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
561 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
562 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
563 ; GFX9-GISEL-NEXT: s_endpgm
564 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
565 %cttz_ret = icmp ne i64 %val, 0
566 %ret = select i1 %cttz_ret, i64 %cttz, i64 32
567 store i64 %cttz, i64 addrspace(1)* %out, align 4
; Loaded i8 case with a zero-guarding select: the value is loaded from
; %arrayidx (align 1), passed to llvm.cttz.i8 with `i1 true`, and %ret selects
; %cttz when %val != 0 and the constant 32 otherwise. Here %ret is the value
; stored to %out, so the select is live.
; Expected lowering per run line:
;   - amdgcn run lines: a ubyte load, v_ffbl_b32_e32, a not-equal-to-zero
;     compare, and v_cndmask_b32_e32 with the literal 32. The GFX9-GISEL
;     prefix additionally expects the ffbl result to be masked with 0xff.
;   - r600 run line: FFBL_INT followed by CNDE_INT with the literal 32.
571 define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
572 ; SI-LABEL: v_cttz_zero_undef_i8_with_select:
574 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
575 ; SI-NEXT: s_mov_b32 s3, 0xf000
576 ; SI-NEXT: s_mov_b32 s2, -1
577 ; SI-NEXT: s_mov_b32 s6, s2
578 ; SI-NEXT: s_mov_b32 s7, s3
579 ; SI-NEXT: s_waitcnt lgkmcnt(0)
580 ; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
581 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
582 ; SI-NEXT: s_waitcnt vmcnt(0)
583 ; SI-NEXT: v_ffbl_b32_e32 v1, v0
584 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
585 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
586 ; SI-NEXT: s_waitcnt lgkmcnt(0)
587 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
590 ; VI-LABEL: v_cttz_zero_undef_i8_with_select:
592 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
593 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
594 ; VI-NEXT: s_waitcnt lgkmcnt(0)
595 ; VI-NEXT: v_mov_b32_e32 v0, s0
596 ; VI-NEXT: v_mov_b32_e32 v1, s1
597 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
598 ; VI-NEXT: s_waitcnt vmcnt(0)
599 ; VI-NEXT: v_ffbl_b32_e32 v1, v0
600 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
601 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
602 ; VI-NEXT: v_mov_b32_e32 v0, s2
603 ; VI-NEXT: v_mov_b32_e32 v1, s3
604 ; VI-NEXT: flat_store_byte v[0:1], v2
607 ; EG-LABEL: v_cttz_zero_undef_i8_with_select:
609 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
611 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
612 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
615 ; EG-NEXT: Fetch clause starting at 6:
616 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
617 ; EG-NEXT: ALU clause starting at 8:
618 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
619 ; EG-NEXT: ALU clause starting at 9:
620 ; EG-NEXT: FFBL_INT T0.W, T0.X,
621 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
622 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
623 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
624 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
625 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
626 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
627 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
628 ; EG-NEXT: LSHL T0.X, PV.W, PS,
629 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
630 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
631 ; EG-NEXT: MOV T0.Y, 0.0,
632 ; EG-NEXT: MOV * T0.Z, 0.0,
633 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
634 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
636 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select:
637 ; GFX9-GISEL: ; %bb.0:
638 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
639 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
640 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
641 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
642 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
643 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
644 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
645 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
646 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
647 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
648 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[2:3]
649 ; GFX9-GISEL-NEXT: s_endpgm
650 %val = load i8, i8 addrspace(1)* %arrayidx, align 1
651 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
652 %cttz_ret = icmp ne i8 %val, 0
653 %ret = select i1 %cttz_ret, i8 %cttz, i8 32
654 store i8 %ret, i8 addrspace(1)* %out, align 4
; Loaded i16 case with a zero-guarding select: the value is loaded from
; %arrayidx with align 1 (under-aligned for i16), passed to llvm.cttz.i16 with
; `i1 true`, and %ret selects %cttz when %val != 0 and the constant 32
; otherwise. %ret is the value stored to %out, so the select is live.
; Expected lowering per run line:
;   - amdgcn run lines: the i16 load is expected as two ubyte loads that are
;     recombined with a shift by 8 and an OR (v_lshl_or_b32 on the GFX9-GISEL
;     prefix), followed by v_ffbl_b32_e32, a not-equal-to-zero compare, and
;     v_cndmask_b32_e32 with the literal 32. The GFX9-GISEL prefix
;     additionally expects the ffbl result to be masked with 0xffff.
;   - r600 run line: a single VTX_READ_16, then FFBL_INT and CNDE_INT with the
;     literal 32.
658 define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
659 ; SI-LABEL: v_cttz_zero_undef_i16_with_select:
661 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
662 ; SI-NEXT: s_mov_b32 s3, 0xf000
663 ; SI-NEXT: s_mov_b32 s2, -1
664 ; SI-NEXT: s_mov_b32 s6, s2
665 ; SI-NEXT: s_mov_b32 s7, s3
666 ; SI-NEXT: s_waitcnt lgkmcnt(0)
667 ; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1
668 ; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
669 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
670 ; SI-NEXT: s_waitcnt vmcnt(1)
671 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
672 ; SI-NEXT: s_waitcnt vmcnt(0)
673 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
674 ; SI-NEXT: v_ffbl_b32_e32 v1, v0
675 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
676 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
677 ; SI-NEXT: s_waitcnt lgkmcnt(0)
678 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
681 ; VI-LABEL: v_cttz_zero_undef_i16_with_select:
683 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
684 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
685 ; VI-NEXT: s_waitcnt lgkmcnt(0)
686 ; VI-NEXT: s_add_u32 s4, s0, 1
687 ; VI-NEXT: s_addc_u32 s5, s1, 0
688 ; VI-NEXT: v_mov_b32_e32 v2, s4
689 ; VI-NEXT: v_mov_b32_e32 v0, s0
690 ; VI-NEXT: v_mov_b32_e32 v3, s5
691 ; VI-NEXT: v_mov_b32_e32 v1, s1
692 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
693 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
694 ; VI-NEXT: s_waitcnt vmcnt(1)
695 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2
696 ; VI-NEXT: s_waitcnt vmcnt(0)
697 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
698 ; VI-NEXT: v_ffbl_b32_e32 v1, v0
699 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
700 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
701 ; VI-NEXT: v_mov_b32_e32 v0, s2
702 ; VI-NEXT: v_mov_b32_e32 v1, s3
703 ; VI-NEXT: flat_store_short v[0:1], v2
706 ; EG-LABEL: v_cttz_zero_undef_i16_with_select:
708 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
710 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
711 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
714 ; EG-NEXT: Fetch clause starting at 6:
715 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
716 ; EG-NEXT: ALU clause starting at 8:
717 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
718 ; EG-NEXT: ALU clause starting at 9:
719 ; EG-NEXT: FFBL_INT T0.W, T0.X,
720 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
721 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
722 ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
723 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
724 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
725 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
726 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
727 ; EG-NEXT: LSHL T0.X, PV.W, PS,
728 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
729 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
730 ; EG-NEXT: MOV T0.Y, 0.0,
731 ; EG-NEXT: MOV * T0.Z, 0.0,
732 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
733 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
735 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select:
736 ; GFX9-GISEL: ; %bb.0:
737 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
738 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
739 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
740 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
741 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
742 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1
743 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
744 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
745 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
746 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
747 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
748 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
749 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[2:3]
750 ; GFX9-GISEL-NEXT: s_endpgm
751 %val = load i16, i16 addrspace(1)* %arrayidx, align 1
752 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
753 %cttz_ret = icmp ne i16 %val, 0
754 %ret = select i1 %cttz_ret, i16 %cttz, i16 32
755 store i16 %ret, i16 addrspace(1)* %out, align 4
; Loaded i32 case with a zero-guarding select: the value is loaded from
; %arrayidx with align 1 (under-aligned for i32), passed to llvm.cttz.i32 with
; `i1 true`, and %ret selects %cttz when %val != 0 and the constant 32
; otherwise. %ret is the value stored to %out, so the select is live.
; Expected lowering per run line:
;   - amdgcn run lines: the i32 load is expected as four ubyte loads that are
;     recombined with shifts and ORs before v_ffbl_b32_e32.
;   - SI and VI prefixes: the compare+select is expected to appear as a single
;     v_min_u32_e32 against 32 applied to the ffbl result.
;   - GFX9-GISEL prefix: an explicit v_cmp_ne_u32_e32 plus v_cndmask_b32_e32
;     with the literal 32 is expected instead.
;   - r600 run line: two VTX_READ_16 fetches combined with a shift by 16 and
;     OR_INT, then FFBL_INT and CNDE_INT with the literal 32.
759 define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
760 ; SI-LABEL: v_cttz_zero_undef_i32_with_select:
762 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
763 ; SI-NEXT: s_mov_b32 s3, 0xf000
764 ; SI-NEXT: s_mov_b32 s2, -1
765 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
766 ; SI-NEXT: s_mov_b32 s6, s2
767 ; SI-NEXT: s_mov_b32 s7, s3
768 ; SI-NEXT: s_waitcnt lgkmcnt(0)
769 ; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1
770 ; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3
771 ; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0
772 ; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2
773 ; SI-NEXT: s_waitcnt vmcnt(3)
774 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
775 ; SI-NEXT: s_waitcnt vmcnt(2)
776 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
777 ; SI-NEXT: s_waitcnt vmcnt(1)
778 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
779 ; SI-NEXT: s_waitcnt vmcnt(0)
780 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
781 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
782 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
783 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
784 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
785 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
788 ; VI-LABEL: v_cttz_zero_undef_i32_with_select:
790 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
791 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
792 ; VI-NEXT: s_waitcnt lgkmcnt(0)
793 ; VI-NEXT: s_add_u32 s4, s0, 3
794 ; VI-NEXT: s_addc_u32 s5, s1, 0
795 ; VI-NEXT: v_mov_b32_e32 v2, s4
796 ; VI-NEXT: v_mov_b32_e32 v3, s5
797 ; VI-NEXT: s_add_u32 s4, s0, 2
798 ; VI-NEXT: v_mov_b32_e32 v0, s0
799 ; VI-NEXT: s_addc_u32 s5, s1, 0
800 ; VI-NEXT: v_mov_b32_e32 v1, s1
801 ; VI-NEXT: s_add_u32 s0, s0, 1
802 ; VI-NEXT: s_addc_u32 s1, s1, 0
803 ; VI-NEXT: v_mov_b32_e32 v4, s4
804 ; VI-NEXT: v_mov_b32_e32 v7, s1
805 ; VI-NEXT: v_mov_b32_e32 v5, s5
806 ; VI-NEXT: v_mov_b32_e32 v6, s0
807 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
808 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
809 ; VI-NEXT: flat_load_ubyte v4, v[6:7]
810 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
811 ; VI-NEXT: s_waitcnt vmcnt(3)
812 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
813 ; VI-NEXT: s_waitcnt vmcnt(2)
814 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
815 ; VI-NEXT: s_waitcnt vmcnt(1)
816 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
817 ; VI-NEXT: s_waitcnt vmcnt(0)
818 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
819 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
820 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
821 ; VI-NEXT: v_min_u32_e32 v2, 32, v0
822 ; VI-NEXT: v_mov_b32_e32 v0, s2
823 ; VI-NEXT: v_mov_b32_e32 v1, s3
824 ; VI-NEXT: flat_store_dword v[0:1], v2
827 ; EG-LABEL: v_cttz_zero_undef_i32_with_select:
829 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
831 ; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
832 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
835 ; EG-NEXT: Fetch clause starting at 6:
836 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
837 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
838 ; EG-NEXT: ALU clause starting at 10:
839 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
840 ; EG-NEXT: ALU clause starting at 11:
841 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
842 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
843 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
844 ; EG-NEXT: FFBL_INT * T1.W, PV.W,
845 ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
846 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
847 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
849 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select:
850 ; GFX9-GISEL: ; %bb.0:
851 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
852 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
853 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
854 ; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff
855 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8
856 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
857 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
858 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1
859 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2
860 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3
861 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
862 ; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
863 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
864 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3
865 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
866 ; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4
867 ; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2
868 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3
869 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4
870 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
871 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
872 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
873 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
874 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[2:3]
875 ; GFX9-GISEL-NEXT: s_endpgm
876 %val = load i32, i32 addrspace(1)* %arrayidx, align 1
877 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
878 %cttz_ret = icmp ne i32 %val, 0
879 %ret = select i1 %cttz_ret, i32 %cttz, i32 32
880 store i32 %ret, i32 addrspace(1)* %out, align 4
; Test: cttz (zero-is-undefined form) of an i64 loaded with align 1, guarded by
; a select on "%val != 0" whose fallback constant is 32; the i64 result is
; stored to %out.
; The expectations below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; NOTE(review): the fallback is 32 although the operand is 64 bits wide, and
; every run line keeps an explicit select here (v_cmp_ne_u64 + v_cndmask_b32
; with 32 on the amdgcn runs, CNDE_INT with literal 32 on r600). Confirm that
; 32 rather than 64 is the intended constant.
884 define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 addrspace(1)* nocapture readonly %arrayidx) nounwind {
885 ; SI-LABEL: v_cttz_zero_undef_i64_with_select:
887 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
888 ; SI-NEXT: s_mov_b32 s3, 0xf000
889 ; SI-NEXT: s_mov_b32 s2, -1
890 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
891 ; SI-NEXT: s_mov_b32 s6, s2
892 ; SI-NEXT: s_mov_b32 s7, s3
893 ; SI-NEXT: s_waitcnt lgkmcnt(0)
894 ; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
895 ; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:1
896 ; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2
897 ; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:3
898 ; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:4
899 ; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:5
900 ; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:7
901 ; SI-NEXT: buffer_load_ubyte v8, off, s[4:7], 0 offset:6
902 ; SI-NEXT: v_mov_b32_e32 v1, 0
903 ; SI-NEXT: s_waitcnt vmcnt(2)
904 ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
905 ; SI-NEXT: s_waitcnt vmcnt(1)
906 ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
907 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
908 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
909 ; SI-NEXT: v_or_b32_e32 v5, v6, v5
910 ; SI-NEXT: s_waitcnt vmcnt(0)
911 ; SI-NEXT: v_or_b32_e32 v6, v7, v8
912 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
913 ; SI-NEXT: v_or_b32_e32 v2, v4, v3
914 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
915 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
916 ; SI-NEXT: v_or_b32_e32 v3, v3, v5
917 ; SI-NEXT: v_or_b32_e32 v2, v2, v0
918 ; SI-NEXT: v_ffbl_b32_e32 v0, v3
919 ; SI-NEXT: v_ffbl_b32_e32 v4, v2
920 ; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
921 ; SI-NEXT: v_min_u32_e32 v0, v4, v0
922 ; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
923 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
924 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
927 ; VI-LABEL: v_cttz_zero_undef_i64_with_select:
929 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
930 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
931 ; VI-NEXT: s_waitcnt lgkmcnt(0)
932 ; VI-NEXT: s_add_u32 s4, s0, 5
933 ; VI-NEXT: s_addc_u32 s5, s1, 0
934 ; VI-NEXT: v_mov_b32_e32 v0, s4
935 ; VI-NEXT: v_mov_b32_e32 v1, s5
936 ; VI-NEXT: s_add_u32 s4, s0, 4
937 ; VI-NEXT: s_addc_u32 s5, s1, 0
938 ; VI-NEXT: v_mov_b32_e32 v2, s4
939 ; VI-NEXT: v_mov_b32_e32 v3, s5
940 ; VI-NEXT: s_add_u32 s4, s0, 7
941 ; VI-NEXT: s_addc_u32 s5, s1, 0
942 ; VI-NEXT: v_mov_b32_e32 v4, s4
943 ; VI-NEXT: v_mov_b32_e32 v5, s5
944 ; VI-NEXT: s_add_u32 s4, s0, 6
945 ; VI-NEXT: s_addc_u32 s5, s1, 0
946 ; VI-NEXT: v_mov_b32_e32 v7, s5
947 ; VI-NEXT: v_mov_b32_e32 v6, s4
948 ; VI-NEXT: s_add_u32 s4, s0, 3
949 ; VI-NEXT: s_addc_u32 s5, s1, 0
950 ; VI-NEXT: v_mov_b32_e32 v9, s5
951 ; VI-NEXT: v_mov_b32_e32 v8, s4
952 ; VI-NEXT: s_add_u32 s4, s0, 2
953 ; VI-NEXT: s_addc_u32 s5, s1, 0
954 ; VI-NEXT: v_mov_b32_e32 v11, s5
955 ; VI-NEXT: v_mov_b32_e32 v10, s4
956 ; VI-NEXT: s_add_u32 s4, s0, 1
957 ; VI-NEXT: s_addc_u32 s5, s1, 0
958 ; VI-NEXT: v_mov_b32_e32 v13, s5
959 ; VI-NEXT: v_mov_b32_e32 v15, s1
960 ; VI-NEXT: v_mov_b32_e32 v12, s4
961 ; VI-NEXT: v_mov_b32_e32 v14, s0
962 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
963 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
964 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
965 ; VI-NEXT: flat_load_ubyte v4, v[6:7]
966 ; VI-NEXT: flat_load_ubyte v5, v[8:9]
967 ; VI-NEXT: flat_load_ubyte v6, v[10:11]
968 ; VI-NEXT: flat_load_ubyte v7, v[12:13]
969 ; VI-NEXT: flat_load_ubyte v8, v[14:15]
970 ; VI-NEXT: v_mov_b32_e32 v1, 0
971 ; VI-NEXT: s_waitcnt vmcnt(7)
972 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
973 ; VI-NEXT: s_waitcnt vmcnt(6)
974 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
975 ; VI-NEXT: s_waitcnt vmcnt(5)
976 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
977 ; VI-NEXT: s_waitcnt vmcnt(4)
978 ; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
979 ; VI-NEXT: v_or_b32_e32 v3, v2, v0
980 ; VI-NEXT: v_ffbl_b32_e32 v2, v3
981 ; VI-NEXT: s_waitcnt vmcnt(3)
982 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
983 ; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v2
984 ; VI-NEXT: s_waitcnt vmcnt(1)
985 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v7
986 ; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
987 ; VI-NEXT: s_waitcnt vmcnt(0)
988 ; VI-NEXT: v_or_b32_e32 v2, v2, v8
989 ; VI-NEXT: v_or_b32_e32 v2, v0, v2
990 ; VI-NEXT: v_ffbl_b32_e32 v0, v2
991 ; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
992 ; VI-NEXT: v_min_u32_e32 v0, v0, v4
993 ; VI-NEXT: v_mov_b32_e32 v2, s2
994 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
995 ; VI-NEXT: v_mov_b32_e32 v3, s3
996 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
999 ; EG-LABEL: v_cttz_zero_undef_i64_with_select:
1001 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
1003 ; EG-NEXT: ALU 16, @15, KC0[CB0:0-32], KC1[]
1004 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1007 ; EG-NEXT: Fetch clause starting at 6:
1008 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 6, #1
1009 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
1010 ; EG-NEXT: VTX_READ_16 T3.X, T0.X, 4, #1
1011 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
1012 ; EG-NEXT: ALU clause starting at 14:
1013 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1014 ; EG-NEXT: ALU clause starting at 15:
1015 ; EG-NEXT: LSHL T0.W, T1.X, literal.x,
1016 ; EG-NEXT: LSHL * T1.W, T0.X, literal.x,
1017 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1018 ; EG-NEXT: OR_INT * T0.W, PV.W, T3.X,
1019 ; EG-NEXT: FFBL_INT T0.W, PV.W,
1020 ; EG-NEXT: OR_INT * T1.W, T1.W, T2.X,
1021 ; EG-NEXT: OR_INT * T0.Y, T2.X, T3.X,
1022 ; EG-NEXT: OR_INT T0.Z, T0.X, T1.X,
1023 ; EG-NEXT: FFBL_INT T2.W, T1.W,
1024 ; EG-NEXT: ADD_INT * T0.W, T0.W, literal.x,
1025 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1026 ; EG-NEXT: CNDE_INT T0.W, T1.W, PS, PV.W,
1027 ; EG-NEXT: OR_INT * T1.W, T0.Y, PV.Z,
1028 ; EG-NEXT: CNDE_INT T0.X, PS, literal.x, PV.W,
1029 ; EG-NEXT: MOV T0.Y, 0.0,
1030 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1031 ; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
1033 ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select:
1034 ; GFX9-GISEL: ; %bb.0:
1035 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1036 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
1037 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1038 ; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff
1039 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1040 ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[4:5]
1041 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[4:5] offset:1
1042 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[4:5] offset:2
1043 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[4:5] offset:3
1044 ; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[4:5] offset:4
1045 ; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[4:5] offset:5
1046 ; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[4:5] offset:6
1047 ; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[4:5] offset:7
1048 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6)
1049 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, s0, v2
1050 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2
1051 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4)
1052 ; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4
1053 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v4, 8, v4
1054 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
1055 ; GFX9-GISEL-NEXT: v_and_b32_e32 v6, s0, v6
1056 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v6, 8, v6
1057 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1058 ; GFX9-GISEL-NEXT: v_and_b32_e32 v8, s0, v8
1059 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v8, 8, v8
1060 ; GFX9-GISEL-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1061 ; GFX9-GISEL-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1062 ; GFX9-GISEL-NEXT: v_or_b32_sdwa v3, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1063 ; GFX9-GISEL-NEXT: v_or_b32_sdwa v4, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1064 ; GFX9-GISEL-NEXT: v_bfe_u32 v3, v3, 0, 16
1065 ; GFX9-GISEL-NEXT: v_bfe_u32 v4, v4, 0, 16
1066 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3
1067 ; GFX9-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
1068 ; GFX9-GISEL-NEXT: v_bfe_u32 v2, v2, 0, 16
1069 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v2, v2, 16, v0
1070 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v4, v3
1071 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v2
1072 ; GFX9-GISEL-NEXT: v_add_u32_e32 v4, 32, v4
1073 ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
1074 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4
1075 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
1076 ; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3]
1077 ; GFX9-GISEL-NEXT: s_endpgm
; The align 1 load corresponds to the eight single-byte loads in the amdgcn
; expectations above (buffer_/flat_/global_load_ubyte).
1078 %val = load i64, i64 addrspace(1)* %arrayidx, align 1
; Second operand "i1 true" selects the form whose result is not defined for a
; zero input, which is why the result is guarded by the select below.
1079 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
1080 %cttz_ret = icmp ne i64 %val, 0
; Non-zero input: use the cttz result; zero input: use the constant 32.
1081 %ret = select i1 %cttz_ret, i64 %cttz, i64 32
1082 store i64 %ret, i64 addrspace(1)* %out, align 4
; Test: cttz (zero-is-defined form, "i1 false") of an i32 loaded with align 1,
; followed by "select (%val == 0), -1, cttz"; the result is stored to %out.
; In the SI and VI expectations the v_ffbl_b32 result is stored directly with
; no compare/select, while the r600 and GFX9 GlobalISel expectations still
; contain an explicit conditional select with -1.
; NOTE(review): presumably the amdgcn SelectionDAG path relies on v_ffbl_b32
; already producing -1 for a zero input - confirm against the ISA manual.
; The expectations below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1086 define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
1087 ; SI-LABEL: v_cttz_i32_sel_eq_neg1:
1089 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
1090 ; SI-NEXT: s_mov_b32 s3, 0xf000
1091 ; SI-NEXT: s_mov_b32 s2, -1
1092 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1093 ; SI-NEXT: s_mov_b32 s6, s2
1094 ; SI-NEXT: s_mov_b32 s7, s3
1095 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1096 ; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1
1097 ; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3
1098 ; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0
1099 ; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2
1100 ; SI-NEXT: s_waitcnt vmcnt(3)
1101 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1102 ; SI-NEXT: s_waitcnt vmcnt(2)
1103 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1104 ; SI-NEXT: s_waitcnt vmcnt(1)
1105 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
1106 ; SI-NEXT: s_waitcnt vmcnt(0)
1107 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
1108 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1109 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1110 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1111 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1114 ; VI-LABEL: v_cttz_i32_sel_eq_neg1:
1116 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1117 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1118 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1119 ; VI-NEXT: s_add_u32 s4, s0, 3
1120 ; VI-NEXT: s_addc_u32 s5, s1, 0
1121 ; VI-NEXT: v_mov_b32_e32 v2, s4
1122 ; VI-NEXT: v_mov_b32_e32 v3, s5
1123 ; VI-NEXT: s_add_u32 s4, s0, 2
1124 ; VI-NEXT: v_mov_b32_e32 v0, s0
1125 ; VI-NEXT: s_addc_u32 s5, s1, 0
1126 ; VI-NEXT: v_mov_b32_e32 v1, s1
1127 ; VI-NEXT: s_add_u32 s0, s0, 1
1128 ; VI-NEXT: s_addc_u32 s1, s1, 0
1129 ; VI-NEXT: v_mov_b32_e32 v4, s4
1130 ; VI-NEXT: v_mov_b32_e32 v7, s1
1131 ; VI-NEXT: v_mov_b32_e32 v5, s5
1132 ; VI-NEXT: v_mov_b32_e32 v6, s0
1133 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1134 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
1135 ; VI-NEXT: flat_load_ubyte v4, v[6:7]
1136 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1137 ; VI-NEXT: s_waitcnt vmcnt(3)
1138 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1139 ; VI-NEXT: s_waitcnt vmcnt(2)
1140 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1141 ; VI-NEXT: s_waitcnt vmcnt(1)
1142 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
1143 ; VI-NEXT: s_waitcnt vmcnt(0)
1144 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
1145 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1146 ; VI-NEXT: v_ffbl_b32_e32 v2, v0
1147 ; VI-NEXT: v_mov_b32_e32 v0, s2
1148 ; VI-NEXT: v_mov_b32_e32 v1, s3
1149 ; VI-NEXT: flat_store_dword v[0:1], v2
1152 ; EG-LABEL: v_cttz_i32_sel_eq_neg1:
1154 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1156 ; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
1157 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1160 ; EG-NEXT: Fetch clause starting at 6:
1161 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
1162 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1163 ; EG-NEXT: ALU clause starting at 10:
1164 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1165 ; EG-NEXT: ALU clause starting at 11:
1166 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1167 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1168 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
1169 ; EG-NEXT: FFBL_INT * T1.W, PV.W,
1170 ; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W,
1171 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1172 ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
1173 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1174 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1176 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
1177 ; GFX9-GISEL: ; %bb.0:
1178 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1179 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
1180 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1181 ; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff
1182 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8
1183 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1184 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
1185 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1
1186 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2
1187 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3
1188 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
1189 ; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1190 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
1191 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3
1192 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1193 ; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4
1194 ; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2
1195 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3
1196 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4
1197 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
1198 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
1199 ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
1200 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
1201 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
1202 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[2:3]
1203 ; GFX9-GISEL-NEXT: s_endpgm
; The align 1 load corresponds to the four single-byte loads in the amdgcn
; expectations above.
1204 %val = load i32, i32 addrspace(1)* %arrayidx, align 1
; The value is named %cttz to match the intrinsic being called; value names do
; not appear in the generated assembly, so the expectations are unaffected.
1205 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1206 %cmp = icmp eq i32 %val, 0
; Zero input: -1; otherwise the trailing-zero count.
1207 %sel = select i1 %cmp, i32 -1, i32 %cttz
1208 store i32 %sel, i32 addrspace(1)* %out
; Test: cttz (zero-is-defined form, "i1 false") of an i32 loaded with align 1,
; followed by "select (%val != 0), cttz, -1" - the same selection as an
; eq-with-zero compare, written with the inverted predicate and swapped select
; operands; the result is stored to %out.
; In the SI and VI expectations the v_ffbl_b32 result is stored directly with
; no compare/select, while the r600 and GFX9 GlobalISel expectations still
; contain an explicit conditional select with -1.
; NOTE(review): presumably the amdgcn SelectionDAG path relies on v_ffbl_b32
; already producing -1 for a zero input - confirm against the ISA manual.
; The expectations below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1212 define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
1213 ; SI-LABEL: v_cttz_i32_sel_ne_neg1:
1215 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
1216 ; SI-NEXT: s_mov_b32 s3, 0xf000
1217 ; SI-NEXT: s_mov_b32 s2, -1
1218 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1219 ; SI-NEXT: s_mov_b32 s6, s2
1220 ; SI-NEXT: s_mov_b32 s7, s3
1221 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1222 ; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1
1223 ; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3
1224 ; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0
1225 ; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2
1226 ; SI-NEXT: s_waitcnt vmcnt(3)
1227 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1228 ; SI-NEXT: s_waitcnt vmcnt(2)
1229 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1230 ; SI-NEXT: s_waitcnt vmcnt(1)
1231 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
1232 ; SI-NEXT: s_waitcnt vmcnt(0)
1233 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
1234 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1235 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1236 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1237 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1240 ; VI-LABEL: v_cttz_i32_sel_ne_neg1:
1242 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1243 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1244 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1245 ; VI-NEXT: s_add_u32 s4, s0, 3
1246 ; VI-NEXT: s_addc_u32 s5, s1, 0
1247 ; VI-NEXT: v_mov_b32_e32 v2, s4
1248 ; VI-NEXT: v_mov_b32_e32 v3, s5
1249 ; VI-NEXT: s_add_u32 s4, s0, 2
1250 ; VI-NEXT: v_mov_b32_e32 v0, s0
1251 ; VI-NEXT: s_addc_u32 s5, s1, 0
1252 ; VI-NEXT: v_mov_b32_e32 v1, s1
1253 ; VI-NEXT: s_add_u32 s0, s0, 1
1254 ; VI-NEXT: s_addc_u32 s1, s1, 0
1255 ; VI-NEXT: v_mov_b32_e32 v4, s4
1256 ; VI-NEXT: v_mov_b32_e32 v7, s1
1257 ; VI-NEXT: v_mov_b32_e32 v5, s5
1258 ; VI-NEXT: v_mov_b32_e32 v6, s0
1259 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1260 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
1261 ; VI-NEXT: flat_load_ubyte v4, v[6:7]
1262 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1263 ; VI-NEXT: s_waitcnt vmcnt(3)
1264 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1265 ; VI-NEXT: s_waitcnt vmcnt(2)
1266 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1267 ; VI-NEXT: s_waitcnt vmcnt(1)
1268 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
1269 ; VI-NEXT: s_waitcnt vmcnt(0)
1270 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
1271 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1272 ; VI-NEXT: v_ffbl_b32_e32 v2, v0
1273 ; VI-NEXT: v_mov_b32_e32 v0, s2
1274 ; VI-NEXT: v_mov_b32_e32 v1, s3
1275 ; VI-NEXT: flat_store_dword v[0:1], v2
1278 ; EG-LABEL: v_cttz_i32_sel_ne_neg1:
1280 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1282 ; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
1283 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1286 ; EG-NEXT: Fetch clause starting at 6:
1287 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
1288 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1289 ; EG-NEXT: ALU clause starting at 10:
1290 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1291 ; EG-NEXT: ALU clause starting at 11:
1292 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1293 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1294 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
1295 ; EG-NEXT: FFBL_INT * T1.W, PV.W,
1296 ; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W,
1297 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1298 ; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
1299 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1300 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1302 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
1303 ; GFX9-GISEL: ; %bb.0:
1304 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1305 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
1306 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1307 ; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff
1308 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8
1309 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1310 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
1311 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1
1312 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2
1313 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3
1314 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
1315 ; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1316 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
1317 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3
1318 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1319 ; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4
1320 ; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2
1321 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3
1322 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4
1323 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
1324 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
1325 ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
1326 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
1327 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
1328 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[2:3]
1329 ; GFX9-GISEL-NEXT: s_endpgm
; The align 1 load corresponds to the four single-byte loads in the amdgcn
; expectations above.
1330 %val = load i32, i32 addrspace(1)* %arrayidx, align 1
; The value is named %cttz to match the intrinsic being called; value names do
; not appear in the generated assembly, so the expectations are unaffected.
1331 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1332 %cmp = icmp ne i32 %val, 0
; Non-zero input: the trailing-zero count; zero input: -1.
1333 %sel = select i1 %cmp, i32 %cttz, i32 -1
1334 store i32 %sel, i32 addrspace(1)* %out
; Test: cttz (zero-is-defined form, "i1 false") of an i32 loaded with align 1,
; where the select condition compares the cttz RESULT against the bit width
; (32) instead of comparing the input against zero:
; "select (cttz != 32), cttz, -1"; the result is stored to %out.
; Unlike the variants that test the input against zero, every run line here
; keeps an explicit clamp/compare/select after the find-first-bit instruction
; (v_min_u32 + v_cmp_ne_u32 32 + v_cndmask_b32 -1 on amdgcn; CNDE_INT +
; SETNE_INT + CNDE_INT on r600).
; The expectations below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1338 define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
1339 ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1341 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
1342 ; SI-NEXT: s_mov_b32 s3, 0xf000
1343 ; SI-NEXT: s_mov_b32 s2, -1
1344 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1345 ; SI-NEXT: s_mov_b32 s6, s2
1346 ; SI-NEXT: s_mov_b32 s7, s3
1347 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1348 ; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1
1349 ; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3
1350 ; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0
1351 ; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2
1352 ; SI-NEXT: s_waitcnt vmcnt(3)
1353 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1354 ; SI-NEXT: s_waitcnt vmcnt(2)
1355 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1356 ; SI-NEXT: s_waitcnt vmcnt(1)
1357 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
1358 ; SI-NEXT: s_waitcnt vmcnt(0)
1359 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
1360 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1361 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1362 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1363 ; SI-NEXT: v_min_u32_e32 v0, 32, v0
1364 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1365 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1366 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1369 ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1371 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1372 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1373 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1374 ; VI-NEXT: s_add_u32 s4, s0, 3
1375 ; VI-NEXT: s_addc_u32 s5, s1, 0
1376 ; VI-NEXT: v_mov_b32_e32 v2, s4
1377 ; VI-NEXT: v_mov_b32_e32 v3, s5
1378 ; VI-NEXT: s_add_u32 s4, s0, 2
1379 ; VI-NEXT: v_mov_b32_e32 v0, s0
1380 ; VI-NEXT: s_addc_u32 s5, s1, 0
1381 ; VI-NEXT: v_mov_b32_e32 v1, s1
1382 ; VI-NEXT: s_add_u32 s0, s0, 1
1383 ; VI-NEXT: s_addc_u32 s1, s1, 0
1384 ; VI-NEXT: v_mov_b32_e32 v4, s4
1385 ; VI-NEXT: v_mov_b32_e32 v7, s1
1386 ; VI-NEXT: v_mov_b32_e32 v5, s5
1387 ; VI-NEXT: v_mov_b32_e32 v6, s0
1388 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1389 ; VI-NEXT: flat_load_ubyte v3, v[4:5]
1390 ; VI-NEXT: flat_load_ubyte v4, v[6:7]
1391 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1392 ; VI-NEXT: s_waitcnt vmcnt(3)
1393 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1394 ; VI-NEXT: s_waitcnt vmcnt(2)
1395 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1396 ; VI-NEXT: s_waitcnt vmcnt(1)
1397 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
1398 ; VI-NEXT: s_waitcnt vmcnt(0)
1399 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
1400 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1401 ; VI-NEXT: v_ffbl_b32_e32 v0, v0
1402 ; VI-NEXT: v_min_u32_e32 v0, 32, v0
1403 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
1404 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
1405 ; VI-NEXT: v_mov_b32_e32 v0, s2
1406 ; VI-NEXT: v_mov_b32_e32 v1, s3
1407 ; VI-NEXT: flat_store_dword v[0:1], v2
1410 ; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
1412 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1414 ; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
1415 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1418 ; EG-NEXT: Fetch clause starting at 6:
1419 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
1420 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1421 ; EG-NEXT: ALU clause starting at 10:
1422 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1423 ; EG-NEXT: ALU clause starting at 11:
1424 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1425 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1426 ; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
1427 ; EG-NEXT: FFBL_INT * T1.W, PV.W,
1428 ; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W,
1429 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1430 ; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x,
1431 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1432 ; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W,
1433 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1434 ; EG-NEXT: -1(nan), 2(2.802597e-45)
1436 ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
1437 ; GFX9-GISEL: ; %bb.0:
1438 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1439 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
1440 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1441 ; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff
1442 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8
1443 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1444 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
1445 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1
1446 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2
1447 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3
1448 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
1449 ; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1450 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
1451 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3
1452 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1453 ; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4
1454 ; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2
1455 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3
1456 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4
1457 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3
1458 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
1459 ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
1460 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 32, v1
1461 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc
1462 ; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[2:3]
1463 ; GFX9-GISEL-NEXT: s_endpgm
; The align 1 load corresponds to the four single-byte loads in the amdgcn
; expectations above.
1464 %val = load i32, i32 addrspace(1)* %arrayidx, align 1
; The value is named %cttz to match the intrinsic being called; value names do
; not appear in the generated assembly, so the expectations are unaffected.
1465 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
; Compare the count itself against the i32 bit width rather than testing %val.
1466 %cmp = icmp ne i32 %cttz, 32
1467 %sel = select i1 %cmp, i32 %cttz, i32 -1
1468 store i32 %sel, i32 addrspace(1)* %out
; Test: cttz (zero-is-defined form, "i1 false") of an i8 loaded from %arrayidx,
; followed by "select (%val == 0), -1, cttz"; the i8 result is stored to %out.
; In the SI expectations the v_ffbl_b32 result is stored directly with
; buffer_store_byte and no compare/select. The VI and GFX9 GlobalISel
; expectations OR 0x100 into the loaded byte before v_ffbl_b32 and then select
; 0xff for a zero input. The r600 expectations use FFBL_INT followed by a
; masked (MSKOR) byte store.
; The expectations below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1472 define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
1473 ; SI-LABEL: v_cttz_i8_sel_eq_neg1:
1475 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
1476 ; SI-NEXT: s_mov_b32 s3, 0xf000
1477 ; SI-NEXT: s_mov_b32 s2, -1
1478 ; SI-NEXT: s_mov_b32 s6, s2
1479 ; SI-NEXT: s_mov_b32 s7, s3
1480 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1481 ; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
1482 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1483 ; SI-NEXT: s_waitcnt vmcnt(0)
1484 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1485 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1486 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
1489 ; VI-LABEL: v_cttz_i8_sel_eq_neg1:
1491 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1492 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1493 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1494 ; VI-NEXT: v_mov_b32_e32 v0, s0
1495 ; VI-NEXT: v_mov_b32_e32 v1, s1
1496 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1497 ; VI-NEXT: v_mov_b32_e32 v1, 0xff
1498 ; VI-NEXT: s_waitcnt vmcnt(0)
1499 ; VI-NEXT: v_or_b32_e32 v2, 0x100, v0
1500 ; VI-NEXT: v_ffbl_b32_e32 v2, v2
1501 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
1502 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
1503 ; VI-NEXT: v_mov_b32_e32 v0, s2
1504 ; VI-NEXT: v_mov_b32_e32 v1, s3
1505 ; VI-NEXT: flat_store_byte v[0:1], v2
1508 ; EG-LABEL: v_cttz_i8_sel_eq_neg1:
1510 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1512 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1513 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1516 ; EG-NEXT: Fetch clause starting at 6:
1517 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1518 ; EG-NEXT: ALU clause starting at 8:
1519 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1520 ; EG-NEXT: ALU clause starting at 9:
1521 ; EG-NEXT: FFBL_INT T0.W, T0.X,
1522 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1523 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1524 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1525 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1526 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
1527 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1528 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1529 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1530 ; EG-NEXT: MOV T0.Y, 0.0,
1531 ; EG-NEXT: MOV * T0.Z, 0.0,
1532 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1533 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1535 ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
1536 ; GFX9-GISEL: ; %bb.0:
1537 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1538 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
1539 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1540 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff
1541 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1542 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
1543 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1544 ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1
1545 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3
1546 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
1547 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
1548 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
1549 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[2:3]
1550 ; GFX9-GISEL-NEXT: s_endpgm
1551 %val = load i8, i8 addrspace(1)* %arrayidx, align 1
; The value is named %cttz to match the intrinsic being called; value names do
; not appear in the generated assembly, so the expectations are unaffected.
1552 %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
1553 %cmp = icmp eq i8 %val, 0
; Zero input: -1 (0xff as an i8); otherwise the trailing-zero count.
1554 %sel = select i1 %cmp, i8 -1, i8 %cttz
1555 store i8 %sel, i8 addrspace(1)* %out
; v_cttz_i16_sel_eq_neg1: count trailing zeros of an i16 that is loaded from
; %arrayidx with align 1, then select -1 instead of the cttz result when the
; loaded value is zero, and store the selected i16 to %out.
;
; The per-target check lines below are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; What the expected output shows, per run line:
;  - SI, VI and GFX9-GISEL checks: the align-1 i16 load appears as two
;    unsigned byte loads that are recombined with a shift by 8 and an or.
;  - SI checks: the v_ffbl_b32 result is stored directly; there is no
;    compare/select between the ffbl and the store.
;  - VI and GFX9-GISEL checks: bit 16 is or'ed in (0x10000) before
;    v_ffbl_b32, and an explicit compare + v_cndmask picks 0xffff for a
;    zero input.
;  - EG checks: a single VTX_READ_16 feeds FFBL_INT, and the 16-bit result
;    is written with MEM_RAT MSKOR.
1559 define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
1560 ; SI-LABEL: v_cttz_i16_sel_eq_neg1:
1562 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
1563 ; SI-NEXT: s_mov_b32 s3, 0xf000
1564 ; SI-NEXT: s_mov_b32 s2, -1
1565 ; SI-NEXT: s_mov_b32 s6, s2
1566 ; SI-NEXT: s_mov_b32 s7, s3
1567 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1568 ; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1
1569 ; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0
1570 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1571 ; SI-NEXT: s_waitcnt vmcnt(1)
1572 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1573 ; SI-NEXT: s_waitcnt vmcnt(0)
1574 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
1575 ; SI-NEXT: v_ffbl_b32_e32 v0, v0
1576 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1577 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1580 ; VI-LABEL: v_cttz_i16_sel_eq_neg1:
1582 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1583 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
1584 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1585 ; VI-NEXT: s_add_u32 s4, s0, 1
1586 ; VI-NEXT: s_addc_u32 s5, s1, 0
1587 ; VI-NEXT: v_mov_b32_e32 v2, s4
1588 ; VI-NEXT: v_mov_b32_e32 v0, s0
1589 ; VI-NEXT: v_mov_b32_e32 v3, s5
1590 ; VI-NEXT: v_mov_b32_e32 v1, s1
1591 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
1592 ; VI-NEXT: flat_load_ubyte v0, v[0:1]
1593 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff
1594 ; VI-NEXT: s_waitcnt vmcnt(1)
1595 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
1596 ; VI-NEXT: s_waitcnt vmcnt(0)
1597 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
1598 ; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
1599 ; VI-NEXT: v_ffbl_b32_e32 v2, v2
1600 ; VI-NEXT: v_min_u32_e32 v2, 32, v2
1601 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
1602 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
1603 ; VI-NEXT: v_mov_b32_e32 v0, s2
1604 ; VI-NEXT: v_mov_b32_e32 v1, s3
1605 ; VI-NEXT: flat_store_short v[0:1], v2
1608 ; EG-LABEL: v_cttz_i16_sel_eq_neg1:
1610 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1612 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1613 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1616 ; EG-NEXT: Fetch clause starting at 6:
1617 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1618 ; EG-NEXT: ALU clause starting at 8:
1619 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1620 ; EG-NEXT: ALU clause starting at 9:
1621 ; EG-NEXT: FFBL_INT T0.W, T0.X,
1622 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
1623 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1624 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
1625 ; EG-NEXT: LSHL * T1.W, PS, literal.y,
1626 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1627 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1628 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1629 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1630 ; EG-NEXT: MOV T0.Y, 0.0,
1631 ; EG-NEXT: MOV * T0.Z, 0.0,
1632 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1633 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1635 ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
1636 ; GFX9-GISEL: ; %bb.0:
1637 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1638 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
1639 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
1640 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff
1641 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1642 ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5]
1643 ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1
1644 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1645 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
1646 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
1647 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
1648 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
1649 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
1650 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
1651 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[2:3]
1652 ; GFX9-GISEL-NEXT: s_endpgm
; IR under test. The load uses align 1, so the i16 is not assumed to be
; 2-byte aligned. The second cttz operand is i1 false (the is-zero-undef
; flag is off), and the select overrides the result with -1 for a zero input.
; NOTE(review): the call result is named %ctlz although the intrinsic called
; is llvm.cttz.i16; the name is only a label and does not affect the checks.
1653 %val = load i16, i16 addrspace(1)* %arrayidx, align 1
1654 %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
1655 %cmp = icmp eq i16 %val, 0
1656 %sel = select i1 %cmp, i16 -1, i16 %ctlz
1657 store i16 %sel, i16 addrspace(1)* %out