1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; Intrinsics exercised by the kernels in this file. Each takes the i64 operand
; plus an i1 flag; the kernels below call them with that flag set to true
; (the *_poison variants) and to false.
5 declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
6 declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
; Count-leading-zeros of an i64 loaded with byte alignment (align 1), calling
; llvm.ctlz.i64 with its i1 operand set to true, then storing the result to
; %out. The expected code for both gfx900 and gfx1010 reads the value with
; eight global_load_ubyte loads, merges the bytes into two dwords, applies
; v_ffbh_u32 to each, adds 32 (clamped) to the count of the dword built from
; byte offsets 0-3, and takes the v_min_u32 of the two counts. Unlike
; @ctlz_i64, no final v_min_u32 against 64 is expected here.
8 define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
9 ; GFX9-LABEL: ctlz_i64_poison:
11 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
12 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
13 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
15 ; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
16 ; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
17 ; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
18 ; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
19 ; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
20 ; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7]
21 ; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
22 ; GFX9-NEXT: s_waitcnt vmcnt(7)
23 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
24 ; GFX9-NEXT: s_waitcnt vmcnt(5)
25 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
26 ; GFX9-NEXT: s_waitcnt vmcnt(4)
27 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
28 ; GFX9-NEXT: s_waitcnt vmcnt(3)
29 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
30 ; GFX9-NEXT: s_waitcnt vmcnt(2)
31 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
32 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
33 ; GFX9-NEXT: s_waitcnt vmcnt(1)
34 ; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
35 ; GFX9-NEXT: s_waitcnt vmcnt(0)
36 ; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
37 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
38 ; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
39 ; GFX9-NEXT: v_ffbh_u32_e32 v2, v2
40 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v0
41 ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp
42 ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0
43 ; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
46 ; GFX10-LABEL: ctlz_i64_poison:
48 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
49 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
50 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
51 ; GFX10-NEXT: s_clause 0x7
52 ; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
53 ; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
54 ; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
55 ; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
56 ; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
57 ; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7]
58 ; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2
59 ; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4
60 ; GFX10-NEXT: s_waitcnt vmcnt(7)
61 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
62 ; GFX10-NEXT: s_waitcnt vmcnt(5)
63 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
64 ; GFX10-NEXT: s_waitcnt vmcnt(4)
65 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
66 ; GFX10-NEXT: s_waitcnt vmcnt(3)
67 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
68 ; GFX10-NEXT: s_waitcnt vmcnt(0)
69 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v8
70 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
71 ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
72 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
73 ; GFX10-NEXT: v_or_b32_e32 v3, v5, v4
74 ; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
75 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v3
76 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
77 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
78 ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0
79 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
80 ; GFX10-NEXT: s_endpgm
81 %val = load i64, ptr addrspace(1) %arrayidx, align 1
82 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
83 store i64 %ctlz, ptr addrspace(1) %out, align 8
; Count-leading-zeros of an i64 loaded with byte alignment (align 1), calling
; llvm.ctlz.i64 with its i1 operand set to false, then storing the result to
; %out. The expected code matches @ctlz_i64_poison (eight byte loads merged
; into two dwords, v_ffbh_u32 on each, clamped add of 32, v_min_u32 of the
; two counts) with one addition: a final v_min_u32 of the result against 64
; before the store.
87 define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
88 ; GFX9-LABEL: ctlz_i64:
90 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
91 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
92 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
93 ; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
94 ; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
95 ; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
96 ; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
97 ; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
98 ; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
99 ; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7]
100 ; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
101 ; GFX9-NEXT: s_waitcnt vmcnt(7)
102 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
103 ; GFX9-NEXT: s_waitcnt vmcnt(5)
104 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
105 ; GFX9-NEXT: s_waitcnt vmcnt(4)
106 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
107 ; GFX9-NEXT: s_waitcnt vmcnt(3)
108 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
109 ; GFX9-NEXT: s_waitcnt vmcnt(2)
110 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
111 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
112 ; GFX9-NEXT: s_waitcnt vmcnt(1)
113 ; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
114 ; GFX9-NEXT: s_waitcnt vmcnt(0)
115 ; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
116 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
117 ; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
118 ; GFX9-NEXT: v_ffbh_u32_e32 v2, v2
119 ; GFX9-NEXT: v_ffbh_u32_e32 v0, v0
120 ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp
121 ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0
122 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0
123 ; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
124 ; GFX9-NEXT: s_endpgm
126 ; GFX10-LABEL: ctlz_i64:
128 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
129 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
130 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
131 ; GFX10-NEXT: s_clause 0x7
132 ; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
133 ; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
134 ; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
135 ; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
136 ; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
137 ; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7]
138 ; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2
139 ; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4
140 ; GFX10-NEXT: s_waitcnt vmcnt(7)
141 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
142 ; GFX10-NEXT: s_waitcnt vmcnt(5)
143 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
144 ; GFX10-NEXT: s_waitcnt vmcnt(4)
145 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
146 ; GFX10-NEXT: s_waitcnt vmcnt(3)
147 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
148 ; GFX10-NEXT: s_waitcnt vmcnt(0)
149 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v8
150 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
151 ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
152 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
153 ; GFX10-NEXT: v_or_b32_e32 v3, v5, v4
154 ; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
155 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v3
156 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
157 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
158 ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0
159 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0
160 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
161 ; GFX10-NEXT: s_endpgm
162 %val = load i64, ptr addrspace(1) %arrayidx, align 1
163 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone
164 store i64 %ctlz, ptr addrspace(1) %out, align 8
; Count-trailing-zeros of an i64 loaded with byte alignment (align 1), calling
; llvm.cttz.i64 with its i1 operand set to true, then storing the result to
; %out. The expected code for both gfx900 and gfx1010 reads the value with
; eight global_load_ubyte loads, merges the bytes into two dwords, applies
; v_ffbl_b32 to each, adds 32 (clamped) to the count of the dword built from
; byte offsets 4-7, and takes the v_min_u32 of the two counts. Unlike
; @cttz_i64, no final v_min_u32 against 64 is expected here.
168 define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
169 ; GFX9-LABEL: cttz_i64_poison:
171 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
172 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
173 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
174 ; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
175 ; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
176 ; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
177 ; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
178 ; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
179 ; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
180 ; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7]
181 ; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
182 ; GFX9-NEXT: s_waitcnt vmcnt(7)
183 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
184 ; GFX9-NEXT: s_waitcnt vmcnt(5)
185 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
186 ; GFX9-NEXT: s_waitcnt vmcnt(4)
187 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
188 ; GFX9-NEXT: s_waitcnt vmcnt(3)
189 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
190 ; GFX9-NEXT: s_waitcnt vmcnt(2)
191 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
192 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
193 ; GFX9-NEXT: s_waitcnt vmcnt(1)
194 ; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
195 ; GFX9-NEXT: s_waitcnt vmcnt(0)
196 ; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
197 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
198 ; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
199 ; GFX9-NEXT: v_ffbl_b32_e32 v0, v0
200 ; GFX9-NEXT: v_ffbl_b32_e32 v2, v2
201 ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp
202 ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2
203 ; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
204 ; GFX9-NEXT: s_endpgm
206 ; GFX10-LABEL: cttz_i64_poison:
208 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
209 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
210 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
211 ; GFX10-NEXT: s_clause 0x7
212 ; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
213 ; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7
214 ; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6
215 ; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
216 ; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
217 ; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
218 ; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7]
219 ; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
220 ; GFX10-NEXT: s_waitcnt vmcnt(7)
221 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
222 ; GFX10-NEXT: s_waitcnt vmcnt(6)
223 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
224 ; GFX10-NEXT: s_waitcnt vmcnt(4)
225 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
226 ; GFX10-NEXT: s_waitcnt vmcnt(3)
227 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
228 ; GFX10-NEXT: s_waitcnt vmcnt(2)
229 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v6
230 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
231 ; GFX10-NEXT: s_waitcnt vmcnt(1)
232 ; GFX10-NEXT: v_or_b32_e32 v3, v4, v7
233 ; GFX10-NEXT: s_waitcnt vmcnt(0)
234 ; GFX10-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
235 ; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
236 ; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
237 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
238 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
239 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
240 ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2
241 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
242 ; GFX10-NEXT: s_endpgm
243 %val = load i64, ptr addrspace(1) %arrayidx, align 1
244 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
245 store i64 %cttz, ptr addrspace(1) %out, align 8
; Count-trailing-zeros of an i64 loaded with byte alignment (align 1), calling
; llvm.cttz.i64 with its i1 operand set to false, then storing the result to
; %out. The expected code matches @cttz_i64_poison (eight byte loads merged
; into two dwords, v_ffbl_b32 on each, clamped add of 32, v_min_u32 of the
; two counts) with one addition: a final v_min_u32 of the result against 64
; before the store.
249 define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
250 ; GFX9-LABEL: cttz_i64:
252 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
253 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
254 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
255 ; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
256 ; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6
257 ; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7
258 ; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
259 ; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
260 ; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
261 ; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7]
262 ; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
263 ; GFX9-NEXT: s_waitcnt vmcnt(7)
264 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
265 ; GFX9-NEXT: s_waitcnt vmcnt(5)
266 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
267 ; GFX9-NEXT: s_waitcnt vmcnt(4)
268 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
269 ; GFX9-NEXT: s_waitcnt vmcnt(3)
270 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
271 ; GFX9-NEXT: s_waitcnt vmcnt(2)
272 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
273 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
274 ; GFX9-NEXT: s_waitcnt vmcnt(1)
275 ; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
276 ; GFX9-NEXT: s_waitcnt vmcnt(0)
277 ; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
278 ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
279 ; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
280 ; GFX9-NEXT: v_ffbl_b32_e32 v0, v0
281 ; GFX9-NEXT: v_ffbl_b32_e32 v2, v2
282 ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp
283 ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2
284 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0
285 ; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
286 ; GFX9-NEXT: s_endpgm
288 ; GFX10-LABEL: cttz_i64:
290 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
291 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
292 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
293 ; GFX10-NEXT: s_clause 0x7
294 ; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5
295 ; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7
296 ; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6
297 ; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1
298 ; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3
299 ; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4
300 ; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7]
301 ; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2
302 ; GFX10-NEXT: s_waitcnt vmcnt(7)
303 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
304 ; GFX10-NEXT: s_waitcnt vmcnt(6)
305 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
306 ; GFX10-NEXT: s_waitcnt vmcnt(4)
307 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
308 ; GFX10-NEXT: s_waitcnt vmcnt(3)
309 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
310 ; GFX10-NEXT: s_waitcnt vmcnt(2)
311 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v6
312 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
313 ; GFX10-NEXT: s_waitcnt vmcnt(1)
314 ; GFX10-NEXT: v_or_b32_e32 v3, v4, v7
315 ; GFX10-NEXT: s_waitcnt vmcnt(0)
316 ; GFX10-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
317 ; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
318 ; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
319 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
320 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
321 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
322 ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2
323 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0
324 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
325 ; GFX10-NEXT: s_endpgm
326 %val = load i64, ptr addrspace(1) %arrayidx, align 1
327 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone
328 store i64 %cttz, ptr addrspace(1) %out, align 8
331 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: