1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
4 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
5 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL
6 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11-FLAT
7 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GFX11-GISEL
9 declare i32 @llvm.amdgcn.workitem.id.x() #1
11 declare i16 @llvm.bitreverse.i16(i16) #1
12 declare i32 @llvm.bitreverse.i32(i32) #1
13 declare i64 @llvm.bitreverse.i64(i64) #1
15 declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
16 declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1
18 declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
19 declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
; Bit-reverses an i16 kernel argument (%val) and stores the result to %out.
; The expected output reverses the whole 32-bit register with s_brev_b32 and
; then keeps the upper 16 bits, either through a right shift by 16 or, on
; gfx1100 without -global-isel, through a d16_hi store.
; Both -global-isel runs additionally expect the argument to be masked with
; 0xffff before the reverse.
21 define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #0 {
22 ; SI-LABEL: s_brev_i16:
24 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
25 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
26 ; SI-NEXT: s_mov_b32 s3, 0xf000
27 ; SI-NEXT: s_mov_b32 s2, -1
28 ; SI-NEXT: s_waitcnt lgkmcnt(0)
29 ; SI-NEXT: s_brev_b32 s4, s4
30 ; SI-NEXT: s_lshr_b32 s4, s4, 16
31 ; SI-NEXT: v_mov_b32_e32 v0, s4
32 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
35 ; FLAT-LABEL: s_brev_i16:
37 ; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c
38 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
39 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
40 ; FLAT-NEXT: s_mov_b32 s2, -1
41 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
42 ; FLAT-NEXT: s_brev_b32 s4, s4
43 ; FLAT-NEXT: s_lshr_b32 s4, s4, 16
44 ; FLAT-NEXT: v_mov_b32_e32 v0, s4
45 ; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0
48 ; GISEL-LABEL: s_brev_i16:
50 ; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
51 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
52 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
53 ; GISEL-NEXT: s_and_b32 s2, s2, 0xffff
54 ; GISEL-NEXT: s_brev_b32 s2, s2
55 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16
56 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
57 ; GISEL-NEXT: v_mov_b32_e32 v2, s2
58 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
59 ; GISEL-NEXT: flat_store_short v[0:1], v2
60 ; GISEL-NEXT: s_endpgm
62 ; GFX11-FLAT-LABEL: s_brev_i16:
63 ; GFX11-FLAT: ; %bb.0:
64 ; GFX11-FLAT-NEXT: s_clause 0x1
65 ; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c
66 ; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
67 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
69 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
70 ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
71 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
72 ; GFX11-FLAT-NEXT: s_nop 0
73 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
74 ; GFX11-FLAT-NEXT: s_endpgm
76 ; GFX11-GISEL-LABEL: s_brev_i16:
77 ; GFX11-GISEL: ; %bb.0:
78 ; GFX11-GISEL-NEXT: s_clause 0x1
79 ; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
80 ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
81 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
82 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
83 ; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
84 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
85 ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
86 ; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16
87 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
88 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
89 ; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1]
90 ; GFX11-GISEL-NEXT: s_nop 0
91 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
92 ; GFX11-GISEL-NEXT: s_endpgm
93 %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
94 store i16 %brev, ptr addrspace(1) %out
; Loads an i16 from %valptr, bit-reverses it and stores the result to %out.
; Because the value comes from memory, the expected output does the reverse
; on a vector register with v_bfrev_b32 (32 bits wide) and then keeps the
; upper 16 bits: v_lshrrev_b32 by 16 on tahiti/tonga/fiji, or a d16_hi store
; on gfx1100.
98 define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
99 ; SI-LABEL: v_brev_i16:
101 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
102 ; SI-NEXT: s_mov_b32 s7, 0xf000
103 ; SI-NEXT: s_mov_b32 s6, -1
104 ; SI-NEXT: s_mov_b32 s10, s6
105 ; SI-NEXT: s_mov_b32 s11, s7
106 ; SI-NEXT: s_waitcnt lgkmcnt(0)
107 ; SI-NEXT: s_mov_b32 s8, s2
108 ; SI-NEXT: s_mov_b32 s9, s3
109 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
110 ; SI-NEXT: s_mov_b32 s4, s0
111 ; SI-NEXT: s_mov_b32 s5, s1
112 ; SI-NEXT: s_waitcnt vmcnt(0)
113 ; SI-NEXT: v_bfrev_b32_e32 v0, v0
114 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
115 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
118 ; FLAT-LABEL: v_brev_i16:
120 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
121 ; FLAT-NEXT: s_mov_b32 s7, 0xf000
122 ; FLAT-NEXT: s_mov_b32 s6, -1
123 ; FLAT-NEXT: s_mov_b32 s10, s6
124 ; FLAT-NEXT: s_mov_b32 s11, s7
125 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
126 ; FLAT-NEXT: s_mov_b32 s8, s2
127 ; FLAT-NEXT: s_mov_b32 s9, s3
128 ; FLAT-NEXT: buffer_load_ushort v0, off, s[8:11], 0
129 ; FLAT-NEXT: s_mov_b32 s4, s0
130 ; FLAT-NEXT: s_mov_b32 s5, s1
131 ; FLAT-NEXT: s_waitcnt vmcnt(0)
132 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
133 ; FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0
134 ; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0
135 ; FLAT-NEXT: s_endpgm
137 ; GISEL-LABEL: v_brev_i16:
139 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
140 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
141 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
142 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
143 ; GISEL-NEXT: flat_load_ushort v0, v[0:1]
144 ; GISEL-NEXT: s_waitcnt vmcnt(0)
145 ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
146 ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
147 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
148 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
149 ; GISEL-NEXT: flat_store_short v[0:1], v2
150 ; GISEL-NEXT: s_endpgm
152 ; GFX11-FLAT-LABEL: v_brev_i16:
153 ; GFX11-FLAT: ; %bb.0:
154 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
155 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000
156 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1
157 ; GFX11-FLAT-NEXT: v_mov_b32_e32 v1, 0
158 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
159 ; GFX11-FLAT-NEXT: s_mov_b32 s4, s2
160 ; GFX11-FLAT-NEXT: s_mov_b32 s5, s3
161 ; GFX11-FLAT-NEXT: buffer_load_u16 v0, off, s[4:7], 0
162 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0)
163 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0
164 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
165 ; GFX11-FLAT-NEXT: s_nop 0
166 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
167 ; GFX11-FLAT-NEXT: s_endpgm
169 ; GFX11-GISEL-LABEL: v_brev_i16:
170 ; GFX11-GISEL: ; %bb.0:
171 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
172 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
173 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
174 ; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3]
175 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
176 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1
177 ; GFX11-GISEL-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
178 ; GFX11-GISEL-NEXT: s_nop 0
179 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
180 ; GFX11-GISEL-NEXT: s_endpgm
181 %val = load i16, ptr addrspace(1) %valptr
182 %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
183 store i16 %brev, ptr addrspace(1) %out
; Bit-reverses an i32 kernel argument (%val) and stores the result to %out.
; Every run configuration is expected to select a single s_brev_b32 on the
; loaded argument and then copy the result to a vector register for the store.
187 define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #0 {
188 ; SI-LABEL: s_brev_i32:
190 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
191 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
192 ; SI-NEXT: s_mov_b32 s3, 0xf000
193 ; SI-NEXT: s_mov_b32 s2, -1
194 ; SI-NEXT: s_waitcnt lgkmcnt(0)
195 ; SI-NEXT: s_brev_b32 s4, s4
196 ; SI-NEXT: v_mov_b32_e32 v0, s4
197 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
200 ; FLAT-LABEL: s_brev_i32:
202 ; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c
203 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
204 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
205 ; FLAT-NEXT: s_mov_b32 s2, -1
206 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
207 ; FLAT-NEXT: s_brev_b32 s4, s4
208 ; FLAT-NEXT: v_mov_b32_e32 v0, s4
209 ; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
210 ; FLAT-NEXT: s_endpgm
212 ; GISEL-LABEL: s_brev_i32:
214 ; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
215 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
216 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
217 ; GISEL-NEXT: s_brev_b32 s2, s2
218 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
219 ; GISEL-NEXT: v_mov_b32_e32 v2, s2
220 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
221 ; GISEL-NEXT: flat_store_dword v[0:1], v2
222 ; GISEL-NEXT: s_endpgm
224 ; GFX11-FLAT-LABEL: s_brev_i32:
225 ; GFX11-FLAT: ; %bb.0:
226 ; GFX11-FLAT-NEXT: s_clause 0x1
227 ; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c
228 ; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
229 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
230 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
231 ; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
232 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
233 ; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2
234 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
235 ; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0
236 ; GFX11-FLAT-NEXT: s_nop 0
237 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
238 ; GFX11-FLAT-NEXT: s_endpgm
240 ; GFX11-GISEL-LABEL: s_brev_i32:
241 ; GFX11-GISEL: ; %bb.0:
242 ; GFX11-GISEL-NEXT: s_clause 0x1
243 ; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
244 ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
245 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
246 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
247 ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
248 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
249 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
250 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
251 ; GFX11-GISEL-NEXT: s_nop 0
252 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
253 ; GFX11-GISEL-NEXT: s_endpgm
254 %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
255 store i32 %brev, ptr addrspace(1) %out
; Loads the i32 at index workitem.id.x of %valptr, bit-reverses it and stores
; the result to %out (the store address is not indexed by the work-item id).
; The per-work-item load keeps the value in a vector register, so every run
; configuration is expected to select a single v_bfrev_b32.
259 define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
260 ; SI-LABEL: v_brev_i32:
262 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
263 ; SI-NEXT: s_mov_b32 s7, 0xf000
264 ; SI-NEXT: s_mov_b32 s10, 0
265 ; SI-NEXT: s_mov_b32 s11, s7
266 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
267 ; SI-NEXT: s_waitcnt lgkmcnt(0)
268 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
269 ; SI-NEXT: v_mov_b32_e32 v1, 0
270 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
271 ; SI-NEXT: s_mov_b32 s6, -1
272 ; SI-NEXT: s_mov_b32 s4, s0
273 ; SI-NEXT: s_mov_b32 s5, s1
274 ; SI-NEXT: s_waitcnt vmcnt(0)
275 ; SI-NEXT: v_bfrev_b32_e32 v0, v0
276 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
279 ; FLAT-LABEL: v_brev_i32:
281 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
282 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
283 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
284 ; FLAT-NEXT: v_mov_b32_e32 v1, s3
285 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
286 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
287 ; FLAT-NEXT: flat_load_dword v0, v[0:1]
288 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
289 ; FLAT-NEXT: s_mov_b32 s2, -1
290 ; FLAT-NEXT: s_waitcnt vmcnt(0)
291 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
292 ; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
293 ; FLAT-NEXT: s_endpgm
295 ; GISEL-LABEL: v_brev_i32:
297 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
298 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
299 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
300 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
301 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
302 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
303 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
304 ; GISEL-NEXT: flat_load_dword v0, v[0:1]
305 ; GISEL-NEXT: s_waitcnt vmcnt(0)
306 ; GISEL-NEXT: v_bfrev_b32_e32 v2, v0
307 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
308 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
309 ; GISEL-NEXT: flat_store_dword v[0:1], v2
310 ; GISEL-NEXT: s_endpgm
312 ; GFX11-FLAT-LABEL: v_brev_i32:
313 ; GFX11-FLAT: ; %bb.0:
314 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
315 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
316 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
317 ; GFX11-FLAT-NEXT: global_load_b32 v0, v0, s[2:3]
318 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
319 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
320 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0)
321 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0
322 ; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0
323 ; GFX11-FLAT-NEXT: s_nop 0
324 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
325 ; GFX11-FLAT-NEXT: s_endpgm
327 ; GFX11-GISEL-LABEL: v_brev_i32:
328 ; GFX11-GISEL: ; %bb.0:
329 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
330 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
331 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
332 ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
333 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
334 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0
335 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
336 ; GFX11-GISEL-NEXT: s_nop 0
337 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
338 ; GFX11-GISEL-NEXT: s_endpgm
339 %tid = call i32 @llvm.amdgcn.workitem.id.x()
340 %gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
341 %val = load i32, ptr addrspace(1) %gep
342 %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
343 store i32 %brev, ptr addrspace(1) %out
; Bit-reverses a <2 x i32> kernel argument (%val) element-wise and stores the
; vector to %out. The expected output handles each element separately with
; its own s_brev_b32 (two in total) before a single 64-bit store.
347 define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> %val) #0 {
348 ; SI-LABEL: s_brev_v2i32:
350 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
351 ; SI-NEXT: s_mov_b32 s7, 0xf000
352 ; SI-NEXT: s_mov_b32 s6, -1
353 ; SI-NEXT: s_waitcnt lgkmcnt(0)
354 ; SI-NEXT: s_mov_b32 s4, s0
355 ; SI-NEXT: s_mov_b32 s5, s1
356 ; SI-NEXT: s_brev_b32 s0, s3
357 ; SI-NEXT: s_brev_b32 s1, s2
358 ; SI-NEXT: v_mov_b32_e32 v0, s1
359 ; SI-NEXT: v_mov_b32_e32 v1, s0
360 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
363 ; FLAT-LABEL: s_brev_v2i32:
365 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
366 ; FLAT-NEXT: s_mov_b32 s7, 0xf000
367 ; FLAT-NEXT: s_mov_b32 s6, -1
368 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
369 ; FLAT-NEXT: s_mov_b32 s4, s0
370 ; FLAT-NEXT: s_mov_b32 s5, s1
371 ; FLAT-NEXT: s_brev_b32 s0, s3
372 ; FLAT-NEXT: s_brev_b32 s1, s2
373 ; FLAT-NEXT: v_mov_b32_e32 v0, s1
374 ; FLAT-NEXT: v_mov_b32_e32 v1, s0
375 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
376 ; FLAT-NEXT: s_endpgm
378 ; GISEL-LABEL: s_brev_v2i32:
380 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
381 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
382 ; GISEL-NEXT: s_brev_b32 s2, s2
383 ; GISEL-NEXT: s_brev_b32 s3, s3
384 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
385 ; GISEL-NEXT: v_mov_b32_e32 v3, s1
386 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
387 ; GISEL-NEXT: v_mov_b32_e32 v2, s0
388 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
389 ; GISEL-NEXT: s_endpgm
391 ; GFX11-FLAT-LABEL: s_brev_v2i32:
392 ; GFX11-FLAT: ; %bb.0:
393 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
394 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000
395 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1
396 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
397 ; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
398 ; GFX11-FLAT-NEXT: s_brev_b32 s3, s3
399 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
400 ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
401 ; GFX11-FLAT-NEXT: s_mov_b32 s4, s0
402 ; GFX11-FLAT-NEXT: s_mov_b32 s5, s1
403 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
404 ; GFX11-FLAT-NEXT: s_nop 0
405 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
406 ; GFX11-FLAT-NEXT: s_endpgm
408 ; GFX11-GISEL-LABEL: s_brev_v2i32:
409 ; GFX11-GISEL: ; %bb.0:
410 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
411 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
412 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
413 ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
414 ; GFX11-GISEL-NEXT: s_brev_b32 s3, s3
415 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
416 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
417 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
418 ; GFX11-GISEL-NEXT: s_nop 0
419 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
420 ; GFX11-GISEL-NEXT: s_endpgm
421 %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
422 store <2 x i32> %brev, ptr addrspace(1) %out
; Loads the <2 x i32> at index workitem.id.x of %valptr, bit-reverses it
; element-wise and stores the vector to %out (the store address is not
; indexed by the work-item id). The expected output applies one v_bfrev_b32
; per element, in place on the two loaded registers.
426 define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
427 ; SI-LABEL: v_brev_v2i32:
429 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
430 ; SI-NEXT: s_mov_b32 s7, 0xf000
431 ; SI-NEXT: s_mov_b32 s10, 0
432 ; SI-NEXT: s_mov_b32 s11, s7
433 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
434 ; SI-NEXT: s_waitcnt lgkmcnt(0)
435 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
436 ; SI-NEXT: v_mov_b32_e32 v1, 0
437 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
438 ; SI-NEXT: s_mov_b32 s6, -1
439 ; SI-NEXT: s_mov_b32 s4, s0
440 ; SI-NEXT: s_mov_b32 s5, s1
441 ; SI-NEXT: s_waitcnt vmcnt(0)
442 ; SI-NEXT: v_bfrev_b32_e32 v1, v1
443 ; SI-NEXT: v_bfrev_b32_e32 v0, v0
444 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
447 ; FLAT-LABEL: v_brev_v2i32:
449 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
450 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
451 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
452 ; FLAT-NEXT: v_mov_b32_e32 v1, s3
453 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
454 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
455 ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
456 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
457 ; FLAT-NEXT: s_mov_b32 s2, -1
458 ; FLAT-NEXT: s_waitcnt vmcnt(0)
459 ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1
460 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
461 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
462 ; FLAT-NEXT: s_endpgm
464 ; GISEL-LABEL: v_brev_v2i32:
466 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
467 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
468 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
469 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
470 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
471 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
472 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
473 ; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
474 ; GISEL-NEXT: v_mov_b32_e32 v3, s1
475 ; GISEL-NEXT: v_mov_b32_e32 v2, s0
476 ; GISEL-NEXT: s_waitcnt vmcnt(0)
477 ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
478 ; GISEL-NEXT: v_bfrev_b32_e32 v1, v1
479 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
480 ; GISEL-NEXT: s_endpgm
482 ; GFX11-FLAT-LABEL: v_brev_v2i32:
483 ; GFX11-FLAT: ; %bb.0:
484 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
485 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
486 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
487 ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3]
488 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
489 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
490 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0)
491 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1
492 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0
493 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
494 ; GFX11-FLAT-NEXT: s_nop 0
495 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
496 ; GFX11-FLAT-NEXT: s_endpgm
498 ; GFX11-GISEL-LABEL: v_brev_v2i32:
499 ; GFX11-GISEL: ; %bb.0:
500 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
501 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
502 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
503 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
504 ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3]
505 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
506 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0
507 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1
508 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
509 ; GFX11-GISEL-NEXT: s_nop 0
510 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
511 ; GFX11-GISEL-NEXT: s_endpgm
512 %tid = call i32 @llvm.amdgcn.workitem.id.x()
513 %gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
514 %val = load <2 x i32>, ptr addrspace(1) %gep
515 %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
516 store <2 x i32> %brev, ptr addrspace(1) %out
; Bit-reverses an i64 kernel argument (%val) and stores the result to %out.
; Every run configuration is expected to select a single 64-bit s_brev_b64 on
; the register pair holding the argument.
520 define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #0 {
521 ; SI-LABEL: s_brev_i64:
523 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
524 ; SI-NEXT: s_mov_b32 s7, 0xf000
525 ; SI-NEXT: s_mov_b32 s6, -1
526 ; SI-NEXT: s_waitcnt lgkmcnt(0)
527 ; SI-NEXT: s_mov_b32 s4, s0
528 ; SI-NEXT: s_mov_b32 s5, s1
529 ; SI-NEXT: s_brev_b64 s[0:1], s[2:3]
530 ; SI-NEXT: v_mov_b32_e32 v0, s0
531 ; SI-NEXT: v_mov_b32_e32 v1, s1
532 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
535 ; FLAT-LABEL: s_brev_i64:
537 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
538 ; FLAT-NEXT: s_mov_b32 s7, 0xf000
539 ; FLAT-NEXT: s_mov_b32 s6, -1
540 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
541 ; FLAT-NEXT: s_mov_b32 s4, s0
542 ; FLAT-NEXT: s_mov_b32 s5, s1
543 ; FLAT-NEXT: s_brev_b64 s[0:1], s[2:3]
544 ; FLAT-NEXT: v_mov_b32_e32 v0, s0
545 ; FLAT-NEXT: v_mov_b32_e32 v1, s1
546 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
547 ; FLAT-NEXT: s_endpgm
549 ; GISEL-LABEL: s_brev_i64:
551 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
552 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
553 ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3]
554 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
555 ; GISEL-NEXT: v_mov_b32_e32 v3, s1
556 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
557 ; GISEL-NEXT: v_mov_b32_e32 v2, s0
558 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
559 ; GISEL-NEXT: s_endpgm
561 ; GFX11-FLAT-LABEL: s_brev_i64:
562 ; GFX11-FLAT: ; %bb.0:
563 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
564 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
565 ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[2:3]
566 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
567 ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
568 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
569 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
570 ; GFX11-FLAT-NEXT: s_nop 0
571 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
572 ; GFX11-FLAT-NEXT: s_endpgm
574 ; GFX11-GISEL-LABEL: s_brev_i64:
575 ; GFX11-GISEL: ; %bb.0:
576 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
577 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
578 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
579 ; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[2:3]
580 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
581 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
582 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
583 ; GFX11-GISEL-NEXT: s_nop 0
584 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
585 ; GFX11-GISEL-NEXT: s_endpgm
586 %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
587 store i64 %brev, ptr addrspace(1) %out
; Loads the i64 at index workitem.id.x of %valptr, bit-reverses it and stores
; the result to %out (the store address is not indexed by the work-item id).
; The expected output contains no 64-bit vector reverse: each 32-bit half is
; reversed with v_bfrev_b32 and the halves are swapped, so the reversed low
; dword of the input is stored as the high dword of the result.
591 define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
592 ; SI-LABEL: v_brev_i64:
594 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
595 ; SI-NEXT: s_mov_b32 s7, 0xf000
596 ; SI-NEXT: s_mov_b32 s10, 0
597 ; SI-NEXT: s_mov_b32 s11, s7
598 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
599 ; SI-NEXT: s_waitcnt lgkmcnt(0)
600 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
601 ; SI-NEXT: v_mov_b32_e32 v1, 0
602 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
603 ; SI-NEXT: s_mov_b32 s6, -1
604 ; SI-NEXT: s_mov_b32 s4, s0
605 ; SI-NEXT: s_mov_b32 s5, s1
606 ; SI-NEXT: s_waitcnt vmcnt(0)
607 ; SI-NEXT: v_bfrev_b32_e32 v2, v0
608 ; SI-NEXT: v_bfrev_b32_e32 v1, v1
609 ; SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
612 ; FLAT-LABEL: v_brev_i64:
614 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
615 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
616 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
617 ; FLAT-NEXT: v_mov_b32_e32 v1, s3
618 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
619 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
620 ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
621 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
622 ; FLAT-NEXT: s_mov_b32 s2, -1
623 ; FLAT-NEXT: s_waitcnt vmcnt(0)
624 ; FLAT-NEXT: v_bfrev_b32_e32 v2, v0
625 ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1
626 ; FLAT-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
627 ; FLAT-NEXT: s_endpgm
629 ; GISEL-LABEL: v_brev_i64:
631 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
632 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
633 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
634 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
635 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
636 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
637 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
638 ; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
639 ; GISEL-NEXT: v_mov_b32_e32 v4, s1
640 ; GISEL-NEXT: v_mov_b32_e32 v3, s0
641 ; GISEL-NEXT: s_waitcnt vmcnt(0)
642 ; GISEL-NEXT: v_bfrev_b32_e32 v1, v1
643 ; GISEL-NEXT: v_bfrev_b32_e32 v2, v0
644 ; GISEL-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
645 ; GISEL-NEXT: s_endpgm
647 ; GFX11-FLAT-LABEL: v_brev_i64:
648 ; GFX11-FLAT: ; %bb.0:
649 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
650 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
651 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
652 ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3]
653 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
654 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
655 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0)
656 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v2, v0
657 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1
658 ; GFX11-FLAT-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
659 ; GFX11-FLAT-NEXT: s_nop 0
660 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
661 ; GFX11-FLAT-NEXT: s_endpgm
663 ; GFX11-GISEL-LABEL: v_brev_i64:
664 ; GFX11-GISEL: ; %bb.0:
665 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
666 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
667 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
668 ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3]
669 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
670 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1
671 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v2, v0
672 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
673 ; GFX11-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1]
674 ; GFX11-GISEL-NEXT: s_nop 0
675 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
676 ; GFX11-GISEL-NEXT: s_endpgm
677 %tid = call i32 @llvm.amdgcn.workitem.id.x()
678 %gep = getelementptr i64, ptr addrspace(1) %valptr, i32 %tid
679 %val = load i64, ptr addrspace(1) %gep
680 %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
681 store i64 %brev, ptr addrspace(1) %out
; Bit-reverses a <2 x i64> kernel argument (%val) element-wise and stores the
; vector to %out. The expected output uses one s_brev_b64 per 64-bit element
; (two in total) followed by a single 128-bit store.
685 define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) #0 {
686 ; SI-LABEL: s_brev_v2i64:
688 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
689 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
690 ; SI-NEXT: s_mov_b32 s3, 0xf000
691 ; SI-NEXT: s_mov_b32 s2, -1
692 ; SI-NEXT: s_waitcnt lgkmcnt(0)
693 ; SI-NEXT: s_brev_b64 s[6:7], s[6:7]
694 ; SI-NEXT: s_brev_b64 s[4:5], s[4:5]
695 ; SI-NEXT: v_mov_b32_e32 v0, s4
696 ; SI-NEXT: v_mov_b32_e32 v1, s5
697 ; SI-NEXT: v_mov_b32_e32 v2, s6
698 ; SI-NEXT: v_mov_b32_e32 v3, s7
699 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
702 ; FLAT-LABEL: s_brev_v2i64:
704 ; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
705 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
706 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
707 ; FLAT-NEXT: s_mov_b32 s2, -1
708 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
709 ; FLAT-NEXT: s_brev_b64 s[6:7], s[6:7]
710 ; FLAT-NEXT: s_brev_b64 s[4:5], s[4:5]
711 ; FLAT-NEXT: v_mov_b32_e32 v0, s4
712 ; FLAT-NEXT: v_mov_b32_e32 v1, s5
713 ; FLAT-NEXT: v_mov_b32_e32 v2, s6
714 ; FLAT-NEXT: v_mov_b32_e32 v3, s7
715 ; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
716 ; FLAT-NEXT: s_endpgm
718 ; GISEL-LABEL: s_brev_v2i64:
720 ; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
721 ; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
722 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
723 ; GISEL-NEXT: s_brev_b64 s[0:1], s[4:5]
724 ; GISEL-NEXT: s_brev_b64 s[2:3], s[6:7]
725 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
726 ; GISEL-NEXT: v_mov_b32_e32 v4, s8
727 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
728 ; GISEL-NEXT: v_mov_b32_e32 v2, s2
729 ; GISEL-NEXT: v_mov_b32_e32 v3, s3
730 ; GISEL-NEXT: v_mov_b32_e32 v5, s9
731 ; GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
732 ; GISEL-NEXT: s_endpgm
734 ; GFX11-FLAT-LABEL: s_brev_v2i64:
735 ; GFX11-FLAT: ; %bb.0:
736 ; GFX11-FLAT-NEXT: s_clause 0x1
737 ; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
738 ; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
739 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
740 ; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[4:5]
741 ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[6:7]
742 ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
743 ; GFX11-FLAT-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
744 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
745 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
746 ; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
747 ; GFX11-FLAT-NEXT: s_nop 0
748 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
749 ; GFX11-FLAT-NEXT: s_endpgm
751 ; GFX11-GISEL-LABEL: s_brev_v2i64:
752 ; GFX11-GISEL: ; %bb.0:
753 ; GFX11-GISEL-NEXT: s_clause 0x1
754 ; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
755 ; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
756 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0
757 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
758 ; GFX11-GISEL-NEXT: s_brev_b64 s[0:1], s[4:5]
759 ; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[6:7]
760 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
761 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
762 ; GFX11-GISEL-NEXT: global_store_b128 v4, v[0:3], s[8:9]
763 ; GFX11-GISEL-NEXT: s_nop 0
764 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
765 ; GFX11-GISEL-NEXT: s_endpgm
766 %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
767 store <2 x i64> %brev, ptr addrspace(1) %out
; Loads the <2 x i64> at index workitem.id.x of %valptr, bit-reverses it
; element-wise and stores the vector to %out (the store address is not
; indexed by the work-item id). The expected output uses four v_bfrev_b32,
; one per 32-bit dword, and swaps the two dwords inside each 64-bit element
; when assembling the 128-bit store operand.
771 define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
772 ; SI-LABEL: v_brev_v2i64:
774 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
775 ; SI-NEXT: s_mov_b32 s7, 0xf000
776 ; SI-NEXT: s_mov_b32 s10, 0
777 ; SI-NEXT: s_mov_b32 s11, s7
778 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
779 ; SI-NEXT: s_waitcnt lgkmcnt(0)
780 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
781 ; SI-NEXT: v_mov_b32_e32 v1, 0
782 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
783 ; SI-NEXT: s_mov_b32 s6, -1
784 ; SI-NEXT: s_mov_b32 s4, s0
785 ; SI-NEXT: s_mov_b32 s5, s1
786 ; SI-NEXT: s_waitcnt vmcnt(0)
787 ; SI-NEXT: v_bfrev_b32_e32 v4, v2
788 ; SI-NEXT: v_bfrev_b32_e32 v3, v3
789 ; SI-NEXT: v_bfrev_b32_e32 v2, v0
790 ; SI-NEXT: v_bfrev_b32_e32 v1, v1
791 ; SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0
794 ; FLAT-LABEL: v_brev_v2i64:
796 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
797 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
798 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
799 ; FLAT-NEXT: v_mov_b32_e32 v1, s3
800 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
801 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
802 ; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
803 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
804 ; FLAT-NEXT: s_mov_b32 s2, -1
805 ; FLAT-NEXT: s_waitcnt vmcnt(0)
806 ; FLAT-NEXT: v_bfrev_b32_e32 v4, v2
807 ; FLAT-NEXT: v_bfrev_b32_e32 v3, v3
808 ; FLAT-NEXT: v_bfrev_b32_e32 v2, v0
809 ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1
810 ; FLAT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
811 ; FLAT-NEXT: s_endpgm
813 ; GISEL-LABEL: v_brev_v2i64:
815 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
816 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
817 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
818 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
819 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
820 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
821 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
822 ; GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
823 ; GISEL-NEXT: s_waitcnt vmcnt(0)
824 ; GISEL-NEXT: v_bfrev_b32_e32 v4, v1
825 ; GISEL-NEXT: v_bfrev_b32_e32 v5, v0
826 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
827 ; GISEL-NEXT: v_bfrev_b32_e32 v6, v3
828 ; GISEL-NEXT: v_bfrev_b32_e32 v7, v2
829 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
830 ; GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
831 ; GISEL-NEXT: s_endpgm
833 ; GFX11-FLAT-LABEL: v_brev_v2i64:
834 ; GFX11-FLAT: ; %bb.0:
835 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
836 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
837 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
838 ; GFX11-FLAT-NEXT: global_load_b128 v[0:3], v0, s[2:3]
839 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
840 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
841 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0)
842 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v4, v2
843 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v3, v3
844 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v2, v0
845 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1
846 ; GFX11-FLAT-NEXT: buffer_store_b128 v[1:4], off, s[0:3], 0
847 ; GFX11-FLAT-NEXT: s_nop 0
848 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
849 ; GFX11-FLAT-NEXT: s_endpgm
851 ; GFX11-GISEL-LABEL: v_brev_v2i64:
852 ; GFX11-GISEL: ; %bb.0:
853 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
854 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
855 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
856 ; GFX11-GISEL-NEXT: global_load_b128 v[0:3], v0, s[2:3]
857 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
858 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v4, v1
859 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v5, v0
860 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v6, v3
861 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v7, v2
862 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
863 ; GFX11-GISEL-NEXT: global_store_b128 v0, v[4:7], s[0:1]
864 ; GFX11-GISEL-NEXT: s_nop 0
865 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
866 ; GFX11-GISEL-NEXT: s_endpgm
867 %tid = call i32 @llvm.amdgcn.workitem.id.x()
868 %gep = getelementptr <2 x i64> , ptr addrspace(1) %valptr, i32 %tid
869 %val = load <2 x i64>, ptr addrspace(1) %gep
870 %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
871 store <2 x i64> %brev, ptr addrspace(1) %out
; Non-kernel function: truncates %arg to i16, bit-reverses the i16, bitcasts
; the result to half and extends it to float (%tmp3).
; The expected output reverses the full 32-bit register with v_bfrev_b32 and
; then feeds only the upper 16 bits to the f16-to-f32 conversion, via a right
; shift by 16 (tahiti, gfx1100) or an SDWA WORD_1 source select (tonga/fiji).
; NOTE(review): judging by the name, this presumably guards against a lost
; truncate when the i16 bitreverse is promoted to 32 bits; confirm against
; the commit that added it.
875 define float @missing_truncate_promote_bitreverse(i32 %arg) {
876 ; SI-LABEL: missing_truncate_promote_bitreverse:
878 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
879 ; SI-NEXT: v_bfrev_b32_e32 v0, v0
880 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
881 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
882 ; SI-NEXT: s_setpc_b64 s[30:31]
884 ; FLAT-LABEL: missing_truncate_promote_bitreverse:
885 ; FLAT: ; %bb.0: ; %bb
886 ; FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
887 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
888 ; FLAT-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
889 ; FLAT-NEXT: s_setpc_b64 s[30:31]
891 ; GISEL-LABEL: missing_truncate_promote_bitreverse:
892 ; GISEL: ; %bb.0: ; %bb
893 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
894 ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
895 ; GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
896 ; GISEL-NEXT: s_setpc_b64 s[30:31]
898 ; GFX11-FLAT-LABEL: missing_truncate_promote_bitreverse:
899 ; GFX11-FLAT: ; %bb.0: ; %bb
900 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
901 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0
902 ; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
903 ; GFX11-FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0
904 ; GFX11-FLAT-NEXT: v_cvt_f32_f16_e32 v0, v0
905 ; GFX11-FLAT-NEXT: s_setpc_b64 s[30:31]
907 ; GFX11-GISEL-LABEL: missing_truncate_promote_bitreverse:
908 ; GFX11-GISEL: ; %bb.0: ; %bb
909 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
910 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0
911 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
912 ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
913 ; GFX11-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
914 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
916 %tmp = trunc i32 %arg to i16
917 %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)
918 %tmp2 = bitcast i16 %tmp1 to half
919 %tmp3 = fpext half %tmp2 to float
923 attributes #0 = { nounwind }
924 attributes #1 = { nounwind readnone }