1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
5 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
7 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
8 declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
9 declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
10 declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
11 declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
13 declare i65 @llvm.ctpop.i65(i65) nounwind readnone
14 declare i128 @llvm.ctpop.i128(i128) nounwind readnone
; Uniform case: popcount of an i64 SGPR kernel argument. Both targets are
; expected to select a single s_bcnt1_i32_b64 on the register pair and store
; the truncated 32-bit count. The [8 x i32] filler argument pads %val to a
; non-zero kernarg offset (0x13 dwords on SI, byte 0x4c on VI).
16 define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
17 ; SI-LABEL: s_ctpop_i64:
19 ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13
20 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
21 ; SI-NEXT: s_mov_b32 s3, 0xf000
22 ; SI-NEXT: s_mov_b32 s2, -1
23 ; SI-NEXT: s_waitcnt lgkmcnt(0)
24 ; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
25 ; SI-NEXT: v_mov_b32_e32 v0, s4
26 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
29 ; VI-LABEL: s_ctpop_i64:
31 ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c
32 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
33 ; VI-NEXT: s_mov_b32 s3, 0xf000
34 ; VI-NEXT: s_mov_b32 s2, -1
35 ; VI-NEXT: s_waitcnt lgkmcnt(0)
36 ; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
37 ; VI-NEXT: v_mov_b32_e32 v0, s4
38 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
40 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
41 %truncctpop = trunc i64 %ctpop to i32
42 store i32 %truncctpop, ptr addrspace(1) %out, align 4
; Divergent case: each lane loads its own i64 (indexed by workitem id) and the
; count is built with two cascaded v_bcnt_u32_b32 — low dword counted into 0,
; then the high dword's count accumulated on top. Only the truncated i32 is
; stored. SI uses buffer addr64 addressing; VI computes a flat address.
46 define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
47 ; SI-LABEL: v_ctpop_i64:
49 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
50 ; SI-NEXT: s_mov_b32 s7, 0xf000
51 ; SI-NEXT: s_mov_b32 s10, 0
52 ; SI-NEXT: s_mov_b32 s11, s7
53 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
54 ; SI-NEXT: s_waitcnt lgkmcnt(0)
55 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
56 ; SI-NEXT: v_mov_b32_e32 v1, 0
57 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
58 ; SI-NEXT: s_mov_b32 s6, -1
59 ; SI-NEXT: s_mov_b32 s4, s0
60 ; SI-NEXT: s_mov_b32 s5, s1
61 ; SI-NEXT: s_waitcnt vmcnt(0)
62 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
63 ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v1, v0
64 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
67 ; VI-LABEL: v_ctpop_i64:
69 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
70 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
71 ; VI-NEXT: s_waitcnt lgkmcnt(0)
72 ; VI-NEXT: v_mov_b32_e32 v1, s3
73 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
74 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
75 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
76 ; VI-NEXT: s_mov_b32 s3, 0xf000
77 ; VI-NEXT: s_mov_b32 s2, -1
78 ; VI-NEXT: s_waitcnt vmcnt(0)
79 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
80 ; VI-NEXT: v_bcnt_u32_b32 v0, v1, v0
81 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
83 %tid = call i32 @llvm.amdgcn.workitem.id.x()
84 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
85 %val = load i64, ptr addrspace(1) %in.gep, align 8
86 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
87 %truncctpop = trunc i64 %ctpop to i32
88 store i32 %truncctpop, ptr addrspace(1) %out, align 4
; Like v_ctpop_i64, but the full i64 popcount result is consumed by a 64-bit OR
; with a uniform operand before being stored. The checks show the count OR'd
; into the scalar low word (v_or_b32 with s12/s0) while the high word comes
; straight from the scalar operand — presumably because the count itself fits
; in 32 bits; the stored value is the whole i64.
92 define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind {
93 ; SI-LABEL: v_ctpop_i64_user:
95 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
96 ; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd
97 ; SI-NEXT: s_mov_b32 s3, 0xf000
98 ; SI-NEXT: s_mov_b32 s10, 0
99 ; SI-NEXT: s_mov_b32 s11, s3
100 ; SI-NEXT: s_waitcnt lgkmcnt(0)
101 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
102 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
103 ; SI-NEXT: v_mov_b32_e32 v1, 0
104 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
105 ; SI-NEXT: s_mov_b32 s2, -1
106 ; SI-NEXT: s_mov_b32 s0, s4
107 ; SI-NEXT: s_mov_b32 s1, s5
108 ; SI-NEXT: s_waitcnt vmcnt(0)
109 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
110 ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v1, v0
111 ; SI-NEXT: v_mov_b32_e32 v1, s13
112 ; SI-NEXT: v_or_b32_e32 v0, s12, v0
113 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
116 ; VI-LABEL: v_ctpop_i64_user:
118 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
119 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
120 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
121 ; VI-NEXT: s_waitcnt lgkmcnt(0)
122 ; VI-NEXT: v_mov_b32_e32 v1, s7
123 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
124 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
125 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
126 ; VI-NEXT: s_mov_b32 s7, 0xf000
127 ; VI-NEXT: s_mov_b32 s6, -1
128 ; VI-NEXT: s_waitcnt vmcnt(0)
129 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
130 ; VI-NEXT: v_bcnt_u32_b32 v0, v1, v0
131 ; VI-NEXT: v_mov_b32_e32 v1, s1
132 ; VI-NEXT: v_or_b32_e32 v0, s0, v0
133 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
135 %tid = call i32 @llvm.amdgcn.workitem.id.x()
136 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
137 %val = load i64, ptr addrspace(1) %in.gep, align 8
138 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
139 %or = or i64 %ctpop, %s.val
140 store i64 %or, ptr addrspace(1) %out
; Uniform <2 x i64> popcount: each element gets its own s_bcnt1_i32_b64 and
; the two truncated i32 counts are stored together with a dwordx2 store.
144 define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind {
145 ; SI-LABEL: s_ctpop_v2i64:
147 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
148 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
149 ; SI-NEXT: s_mov_b32 s3, 0xf000
150 ; SI-NEXT: s_mov_b32 s2, -1
151 ; SI-NEXT: s_waitcnt lgkmcnt(0)
152 ; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
153 ; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
154 ; SI-NEXT: v_mov_b32_e32 v0, s4
155 ; SI-NEXT: v_mov_b32_e32 v1, s5
156 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
159 ; VI-LABEL: s_ctpop_v2i64:
161 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
162 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
163 ; VI-NEXT: s_mov_b32 s3, 0xf000
164 ; VI-NEXT: s_mov_b32 s2, -1
165 ; VI-NEXT: s_waitcnt lgkmcnt(0)
166 ; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
167 ; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
168 ; VI-NEXT: v_mov_b32_e32 v0, s4
169 ; VI-NEXT: v_mov_b32_e32 v1, s5
170 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
172 %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
173 %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
174 store <2 x i32> %truncctpop, ptr addrspace(1) %out, align 8
; Uniform <4 x i64> popcount: four independent s_bcnt1_i32_b64, truncated
; results packed into a single dwordx4 store.
178 define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind {
179 ; SI-LABEL: s_ctpop_v4i64:
181 ; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11
182 ; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9
183 ; SI-NEXT: s_mov_b32 s15, 0xf000
184 ; SI-NEXT: s_mov_b32 s14, -1
185 ; SI-NEXT: s_waitcnt lgkmcnt(0)
186 ; SI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
187 ; SI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
188 ; SI-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
189 ; SI-NEXT: s_bcnt1_i32_b64 s3, s[10:11]
190 ; SI-NEXT: v_mov_b32_e32 v0, s0
191 ; SI-NEXT: v_mov_b32_e32 v1, s1
192 ; SI-NEXT: v_mov_b32_e32 v2, s2
193 ; SI-NEXT: v_mov_b32_e32 v3, s3
194 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
197 ; VI-LABEL: s_ctpop_v4i64:
199 ; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
200 ; VI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24
201 ; VI-NEXT: s_mov_b32 s15, 0xf000
202 ; VI-NEXT: s_mov_b32 s14, -1
203 ; VI-NEXT: s_waitcnt lgkmcnt(0)
204 ; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
205 ; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
206 ; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
207 ; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11]
208 ; VI-NEXT: v_mov_b32_e32 v0, s0
209 ; VI-NEXT: v_mov_b32_e32 v1, s1
210 ; VI-NEXT: v_mov_b32_e32 v2, s2
211 ; VI-NEXT: v_mov_b32_e32 v3, s3
212 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
214 %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
215 %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
216 store <4 x i32> %truncctpop, ptr addrspace(1) %out, align 16
; Divergent <2 x i64> popcount: one dwordx4 load per lane, then each element
; counted with the cascaded v_bcnt pair (low dword into 0, high dword
; accumulated), storing the two truncated counts as dwordx2.
220 define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
221 ; SI-LABEL: v_ctpop_v2i64:
223 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
224 ; SI-NEXT: s_mov_b32 s7, 0xf000
225 ; SI-NEXT: s_mov_b32 s10, 0
226 ; SI-NEXT: s_mov_b32 s11, s7
227 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
228 ; SI-NEXT: s_waitcnt lgkmcnt(0)
229 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
230 ; SI-NEXT: v_mov_b32_e32 v1, 0
231 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
232 ; SI-NEXT: s_mov_b32 s6, -1
233 ; SI-NEXT: s_mov_b32 s4, s0
234 ; SI-NEXT: s_mov_b32 s5, s1
235 ; SI-NEXT: s_waitcnt vmcnt(0)
236 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
237 ; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0
238 ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v1, v0
239 ; SI-NEXT: v_bcnt_u32_b32_e32 v1, v3, v2
240 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
243 ; VI-LABEL: v_ctpop_v2i64:
245 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
246 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
247 ; VI-NEXT: s_waitcnt lgkmcnt(0)
248 ; VI-NEXT: v_mov_b32_e32 v1, s3
249 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
250 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
251 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
252 ; VI-NEXT: s_mov_b32 s3, 0xf000
253 ; VI-NEXT: s_mov_b32 s2, -1
254 ; VI-NEXT: s_waitcnt vmcnt(0)
255 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
256 ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0
257 ; VI-NEXT: v_bcnt_u32_b32 v0, v1, v0
258 ; VI-NEXT: v_bcnt_u32_b32 v1, v3, v2
259 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
261 %tid = call i32 @llvm.amdgcn.workitem.id.x()
262 %in.gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
263 %val = load <2 x i64>, ptr addrspace(1) %in.gep, align 16
264 %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
265 %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
266 store <2 x i32> %truncctpop, ptr addrspace(1) %out, align 8
; Divergent <4 x i64> popcount: the 32-byte value needs two dwordx4 loads.
; Note the interleaved waits — s_waitcnt vmcnt(1) lets counting of the first
; load's data start before the second load has returned.
270 define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
271 ; SI-LABEL: v_ctpop_v4i64:
273 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
274 ; SI-NEXT: s_mov_b32 s7, 0xf000
275 ; SI-NEXT: s_mov_b32 s10, 0
276 ; SI-NEXT: s_mov_b32 s11, s7
277 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0
278 ; SI-NEXT: s_waitcnt lgkmcnt(0)
279 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
280 ; SI-NEXT: v_mov_b32_e32 v5, 0
281 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64
282 ; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[8:11], 0 addr64 offset:16
283 ; SI-NEXT: s_mov_b32 s6, -1
284 ; SI-NEXT: s_mov_b32 s4, s0
285 ; SI-NEXT: s_mov_b32 s5, s1
286 ; SI-NEXT: s_waitcnt vmcnt(1)
287 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
288 ; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0
289 ; SI-NEXT: s_waitcnt vmcnt(0)
290 ; SI-NEXT: v_bcnt_u32_b32_e64 v4, v4, 0
291 ; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0
292 ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v1, v0
293 ; SI-NEXT: v_bcnt_u32_b32_e32 v1, v3, v2
294 ; SI-NEXT: v_bcnt_u32_b32_e32 v2, v5, v4
295 ; SI-NEXT: v_bcnt_u32_b32_e32 v3, v7, v6
296 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
299 ; VI-LABEL: v_ctpop_v4i64:
301 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
302 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0
303 ; VI-NEXT: s_waitcnt lgkmcnt(0)
304 ; VI-NEXT: v_mov_b32_e32 v1, s3
305 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0
306 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
307 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
308 ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v4
309 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
310 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
311 ; VI-NEXT: s_mov_b32 s3, 0xf000
312 ; VI-NEXT: s_mov_b32 s2, -1
313 ; VI-NEXT: s_waitcnt vmcnt(1)
314 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
315 ; VI-NEXT: v_bcnt_u32_b32 v8, v2, 0
316 ; VI-NEXT: v_bcnt_u32_b32 v2, v1, v0
317 ; VI-NEXT: v_bcnt_u32_b32 v3, v3, v8
318 ; VI-NEXT: s_waitcnt vmcnt(0)
319 ; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0
320 ; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0
321 ; VI-NEXT: v_bcnt_u32_b32 v4, v5, v4
322 ; VI-NEXT: v_bcnt_u32_b32 v5, v7, v6
323 ; VI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0
325 %tid = call i32 @llvm.amdgcn.workitem.id.x()
326 %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
327 %val = load <4 x i64>, ptr addrspace(1) %in.gep, align 32
328 %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
329 %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
330 store <4 x i32> %truncctpop, ptr addrspace(1) %out, align 16
; Popcount computed on only one side of a branch: the 'if' path does
; s_bcnt1_i32_b64 on the uniform argument, the 'else' path loads an i64 from
; memory, and a phi merges them in 'endif'. Checks the count stays on the
; scalar unit inside structured control flow on both targets.
334 define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) {
335 ; SI-LABEL: ctpop_i64_in_br:
336 ; SI: ; %bb.0: ; %entry
337 ; SI-NEXT: s_load_dword s0, s[2:3], 0xf
338 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
339 ; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd
340 ; SI-NEXT: s_waitcnt lgkmcnt(0)
341 ; SI-NEXT: s_cmp_lg_u32 s0, 0
342 ; SI-NEXT: s_cbranch_scc0 .LBB7_4
343 ; SI-NEXT: ; %bb.1: ; %else
344 ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2
345 ; SI-NEXT: s_mov_b64 s[6:7], 0
346 ; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7]
347 ; SI-NEXT: s_waitcnt lgkmcnt(0)
348 ; SI-NEXT: s_mov_b64 vcc, vcc
349 ; SI-NEXT: s_cbranch_vccnz .LBB7_3
350 ; SI-NEXT: .LBB7_2: ; %if
351 ; SI-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
352 ; SI-NEXT: s_mov_b32 s1, 0
353 ; SI-NEXT: .LBB7_3: ; %endif
354 ; SI-NEXT: v_mov_b32_e32 v0, s0
355 ; SI-NEXT: s_mov_b32 s7, 0xf000
356 ; SI-NEXT: s_mov_b32 s6, -1
357 ; SI-NEXT: v_mov_b32_e32 v1, s1
358 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
361 ; SI-NEXT: ; implicit-def: $sgpr0_sgpr1
362 ; SI-NEXT: s_branch .LBB7_2
364 ; VI-LABEL: ctpop_i64_in_br:
365 ; VI: ; %bb.0: ; %entry
366 ; VI-NEXT: s_load_dword s0, s[2:3], 0x3c
367 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
368 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34
369 ; VI-NEXT: s_waitcnt lgkmcnt(0)
370 ; VI-NEXT: s_cmp_lg_u32 s0, 0
371 ; VI-NEXT: s_cbranch_scc0 .LBB7_4
372 ; VI-NEXT: ; %bb.1: ; %else
373 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8
374 ; VI-NEXT: s_cbranch_execnz .LBB7_3
375 ; VI-NEXT: .LBB7_2: ; %if
376 ; VI-NEXT: s_waitcnt lgkmcnt(0)
377 ; VI-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
378 ; VI-NEXT: s_mov_b32 s1, 0
379 ; VI-NEXT: .LBB7_3: ; %endif
380 ; VI-NEXT: s_waitcnt lgkmcnt(0)
381 ; VI-NEXT: v_mov_b32_e32 v0, s0
382 ; VI-NEXT: s_mov_b32 s7, 0xf000
383 ; VI-NEXT: s_mov_b32 s6, -1
384 ; VI-NEXT: v_mov_b32_e32 v1, s1
385 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
388 ; VI-NEXT: ; implicit-def: $sgpr0_sgpr1
389 ; VI-NEXT: s_branch .LBB7_2
391 %tmp0 = icmp eq i32 %cond, 0
392 br i1 %tmp0, label %if, label %else
395 %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg)
399 %tmp3 = getelementptr i64, ptr addrspace(1) %in, i32 1
400 %tmp4 = load i64, ptr addrspace(1) %tmp3
404 %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else]
405 store i64 %tmp5, ptr addrspace(1) %out
; Uniform i128 popcount: the value is split into two 64-bit halves, each
; counted with s_bcnt1_i32_b64, and the two counts summed with s_add_i32
; before the truncated i32 result is stored.
409 define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind {
410 ; SI-LABEL: s_ctpop_i128:
412 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb
413 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
414 ; SI-NEXT: s_mov_b32 s3, 0xf000
415 ; SI-NEXT: s_mov_b32 s2, -1
416 ; SI-NEXT: s_waitcnt lgkmcnt(0)
417 ; SI-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
418 ; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
419 ; SI-NEXT: s_add_i32 s4, s4, s6
420 ; SI-NEXT: v_mov_b32_e32 v0, s4
421 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
424 ; VI-LABEL: s_ctpop_i128:
426 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
427 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
428 ; VI-NEXT: s_mov_b32 s3, 0xf000
429 ; VI-NEXT: s_mov_b32 s2, -1
430 ; VI-NEXT: s_waitcnt lgkmcnt(0)
431 ; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
432 ; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
433 ; VI-NEXT: s_add_i32 s4, s4, s6
434 ; VI-NEXT: v_mov_b32_e32 v0, s4
435 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
437 %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
438 %truncctpop = trunc i128 %ctpop to i32
439 store i32 %truncctpop, ptr addrspace(1) %out, align 4
; Uniform popcount of a non-power-of-two width (i65). The low 64 bits get an
; s_bcnt1_i32_b64; the extra upper fragment is masked (checks show
; s_and_b32 ... 0xff — presumably masking off undefined argument padding
; bits) and counted with s_bcnt1_i32_b32, then the two counts are added.
443 define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind {
444 ; SI-LABEL: s_ctpop_i65:
446 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
447 ; SI-NEXT: s_load_dword s8, s[2:3], 0xd
448 ; SI-NEXT: s_mov_b32 s3, 0xf000
449 ; SI-NEXT: s_mov_b32 s2, -1
450 ; SI-NEXT: s_waitcnt lgkmcnt(0)
451 ; SI-NEXT: s_mov_b32 s0, s4
452 ; SI-NEXT: s_and_b32 s4, s8, 0xff
453 ; SI-NEXT: s_mov_b32 s1, s5
454 ; SI-NEXT: s_bcnt1_i32_b32 s4, s4
455 ; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
456 ; SI-NEXT: s_add_i32 s4, s5, s4
457 ; SI-NEXT: v_mov_b32_e32 v0, s4
458 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
461 ; VI-LABEL: s_ctpop_i65:
463 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
464 ; VI-NEXT: s_load_dword s8, s[2:3], 0x34
465 ; VI-NEXT: s_mov_b32 s3, 0xf000
466 ; VI-NEXT: s_mov_b32 s2, -1
467 ; VI-NEXT: s_waitcnt lgkmcnt(0)
468 ; VI-NEXT: s_mov_b32 s0, s4
469 ; VI-NEXT: s_and_b32 s4, s8, 0xff
470 ; VI-NEXT: s_mov_b32 s1, s5
471 ; VI-NEXT: s_bcnt1_i32_b32 s4, s4
472 ; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
473 ; VI-NEXT: s_add_i32 s4, s5, s4
474 ; VI-NEXT: v_mov_b32_e32 v0, s4
475 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
477 %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
478 %truncctpop = trunc i65 %ctpop to i32
479 store i32 %truncctpop, ptr addrspace(1) %out, align 4
483 ; FIXME: Should not have extra add
484 define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
485 ; SI-LABEL: v_ctpop_i128:
487 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
488 ; SI-NEXT: s_mov_b32 s7, 0xf000
489 ; SI-NEXT: s_mov_b32 s10, 0
490 ; SI-NEXT: s_mov_b32 s11, s7
491 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
492 ; SI-NEXT: s_waitcnt lgkmcnt(0)
493 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
494 ; SI-NEXT: v_mov_b32_e32 v1, 0
495 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
496 ; SI-NEXT: s_mov_b32 s6, -1
497 ; SI-NEXT: s_mov_b32 s4, s0
498 ; SI-NEXT: s_mov_b32 s5, s1
499 ; SI-NEXT: s_waitcnt vmcnt(0)
500 ; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0
501 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
502 ; SI-NEXT: v_bcnt_u32_b32_e32 v2, v3, v2
503 ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v1, v0
504 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
505 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
508 ; VI-LABEL: v_ctpop_i128:
510 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
511 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
512 ; VI-NEXT: s_waitcnt lgkmcnt(0)
513 ; VI-NEXT: v_mov_b32_e32 v1, s3
514 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
515 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
516 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
517 ; VI-NEXT: s_mov_b32 s3, 0xf000
518 ; VI-NEXT: s_mov_b32 s2, -1
519 ; VI-NEXT: s_waitcnt vmcnt(0)
520 ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0
521 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
522 ; VI-NEXT: v_bcnt_u32_b32 v2, v3, v2
523 ; VI-NEXT: v_bcnt_u32_b32 v0, v1, v0
524 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
525 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
527 %tid = call i32 @llvm.amdgcn.workitem.id.x()
528 %in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %tid
529 %val = load i128, ptr addrspace(1) %in.gep, align 8
530 %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
531 %truncctpop = trunc i128 %ctpop to i32
532 store i32 %truncctpop, ptr addrspace(1) %out, align 4