1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
4 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
5 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL
6 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11-FLAT
7 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GFX11-GISEL
; Intrinsics exercised by this test: the workitem id query plus
; llvm.bitreverse at scalar (i16, i32, i64) and vector (v2i32, v4i32, v2i64,
; v4i64) types. Every declaration carries attribute group #1.
9 declare i32 @llvm.amdgcn.workitem.id.x() #1
11 declare i16 @llvm.bitreverse.i16(i16) #1
12 declare i32 @llvm.bitreverse.i32(i32) #1
13 declare i64 @llvm.bitreverse.i64(i64) #1
15 declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
16 declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1
18 declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
19 declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
; Scalar i16 bitreverse: %val arrives as a kernel argument, is reversed with
; llvm.bitreverse.i16 and stored to %out.
; Every run is expected to use the 32-bit s_brev_b32, which leaves the
; reversed 16 bits in the upper half of the register. The checks therefore
; expect either a right shift by 16 before a 16-bit store, or (gfx1100 without
; -global-isel) a global_store_d16_hi_b16 that stores the high half directly.
; Both -global-isel runs additionally mask the input with 0xffff first.
21 define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #0 {
22 ; SI-LABEL: s_brev_i16:
24 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
25 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
26 ; SI-NEXT: s_mov_b32 s3, 0xf000
27 ; SI-NEXT: s_mov_b32 s2, -1
28 ; SI-NEXT: s_waitcnt lgkmcnt(0)
29 ; SI-NEXT: s_brev_b32 s4, s6
30 ; SI-NEXT: s_lshr_b32 s4, s4, 16
31 ; SI-NEXT: v_mov_b32_e32 v0, s4
32 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
35 ; FLAT-LABEL: s_brev_i16:
37 ; FLAT-NEXT: s_load_dword s6, s[4:5], 0x2c
38 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
39 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
40 ; FLAT-NEXT: s_mov_b32 s2, -1
41 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
42 ; FLAT-NEXT: s_brev_b32 s4, s6
43 ; FLAT-NEXT: s_lshr_b32 s4, s4, 16
44 ; FLAT-NEXT: v_mov_b32_e32 v0, s4
45 ; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0
48 ; GISEL-LABEL: s_brev_i16:
50 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
51 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
52 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
53 ; GISEL-NEXT: s_and_b32 s2, s2, 0xffff
54 ; GISEL-NEXT: s_brev_b32 s2, s2
55 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16
56 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
57 ; GISEL-NEXT: v_mov_b32_e32 v2, s2
58 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
59 ; GISEL-NEXT: flat_store_short v[0:1], v2
60 ; GISEL-NEXT: s_endpgm
62 ; GFX11-FLAT-LABEL: s_brev_i16:
63 ; GFX11-FLAT: ; %bb.0:
64 ; GFX11-FLAT-NEXT: s_clause 0x1
65 ; GFX11-FLAT-NEXT: s_load_b32 s2, s[4:5], 0x2c
66 ; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
67 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
69 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
70 ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
71 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
72 ; GFX11-FLAT-NEXT: s_endpgm
74 ; GFX11-GISEL-LABEL: s_brev_i16:
75 ; GFX11-GISEL: ; %bb.0:
76 ; GFX11-GISEL-NEXT: s_clause 0x1
77 ; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
78 ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
79 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
80 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
81 ; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
82 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
83 ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
84 ; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16
85 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
86 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
87 ; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1]
88 ; GFX11-GISEL-NEXT: s_endpgm
89 %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
90 store i16 %brev, ptr addrspace(1) %out
; i16 bitreverse of a value loaded from memory: loads one i16 straight from
; %valptr (no workitem-id indexing, unlike the wider v_* tests visible in this
; file), reverses it and stores the result to %out.
; Every run is expected to use the 32-bit v_bfrev_b32 on the loaded value and
; then take the upper 16 bits of the result: a v_lshrrev_b32 by 16 on the
; tahiti/tonga/fiji runs, a global_store_d16_hi_b16 on both gfx1100 runs.
94 define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
95 ; SI-LABEL: v_brev_i16:
97 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
98 ; SI-NEXT: s_mov_b32 s7, 0xf000
99 ; SI-NEXT: s_mov_b32 s6, -1
100 ; SI-NEXT: s_mov_b32 s10, s6
101 ; SI-NEXT: s_mov_b32 s11, s7
102 ; SI-NEXT: s_waitcnt lgkmcnt(0)
103 ; SI-NEXT: s_mov_b32 s8, s2
104 ; SI-NEXT: s_mov_b32 s9, s3
105 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
106 ; SI-NEXT: s_mov_b32 s4, s0
107 ; SI-NEXT: s_mov_b32 s5, s1
108 ; SI-NEXT: s_waitcnt vmcnt(0)
109 ; SI-NEXT: v_bfrev_b32_e32 v0, v0
110 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
111 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
114 ; FLAT-LABEL: v_brev_i16:
116 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
117 ; FLAT-NEXT: s_mov_b32 s7, 0xf000
118 ; FLAT-NEXT: s_mov_b32 s6, -1
119 ; FLAT-NEXT: s_mov_b32 s10, s6
120 ; FLAT-NEXT: s_mov_b32 s11, s7
121 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
122 ; FLAT-NEXT: s_mov_b32 s8, s2
123 ; FLAT-NEXT: s_mov_b32 s9, s3
124 ; FLAT-NEXT: buffer_load_ushort v0, off, s[8:11], 0
125 ; FLAT-NEXT: s_mov_b32 s4, s0
126 ; FLAT-NEXT: s_mov_b32 s5, s1
127 ; FLAT-NEXT: s_waitcnt vmcnt(0)
128 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
129 ; FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0
130 ; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0
131 ; FLAT-NEXT: s_endpgm
133 ; GISEL-LABEL: v_brev_i16:
135 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
136 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
137 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
138 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
139 ; GISEL-NEXT: flat_load_ushort v0, v[0:1]
140 ; GISEL-NEXT: s_waitcnt vmcnt(0)
141 ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
142 ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
143 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
144 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
145 ; GISEL-NEXT: flat_store_short v[0:1], v2
146 ; GISEL-NEXT: s_endpgm
148 ; GFX11-FLAT-LABEL: v_brev_i16:
149 ; GFX11-FLAT: ; %bb.0:
150 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
151 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000
152 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1
153 ; GFX11-FLAT-NEXT: v_mov_b32_e32 v1, 0
154 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
155 ; GFX11-FLAT-NEXT: s_mov_b32 s4, s2
156 ; GFX11-FLAT-NEXT: s_mov_b32 s5, s3
157 ; GFX11-FLAT-NEXT: buffer_load_u16 v0, off, s[4:7], 0
158 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0)
159 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0
160 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
161 ; GFX11-FLAT-NEXT: s_endpgm
163 ; GFX11-GISEL-LABEL: v_brev_i16:
164 ; GFX11-GISEL: ; %bb.0:
165 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
166 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
167 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
168 ; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3]
169 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
170 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1
171 ; GFX11-GISEL-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
172 ; GFX11-GISEL-NEXT: s_endpgm
173 %val = load i16, ptr addrspace(1) %valptr
174 %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
175 store i16 %brev, ptr addrspace(1) %out
; Scalar i32 bitreverse of a kernel argument, stored to %out.
; Every run is expected to select a single s_brev_b32 whose result is copied
; into a VGPR for the store; no shift or mask is expected around it.
179 define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #0 {
180 ; SI-LABEL: s_brev_i32:
182 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
183 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
184 ; SI-NEXT: s_mov_b32 s3, 0xf000
185 ; SI-NEXT: s_mov_b32 s2, -1
186 ; SI-NEXT: s_waitcnt lgkmcnt(0)
187 ; SI-NEXT: s_brev_b32 s4, s6
188 ; SI-NEXT: v_mov_b32_e32 v0, s4
189 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
192 ; FLAT-LABEL: s_brev_i32:
194 ; FLAT-NEXT: s_load_dword s6, s[4:5], 0x2c
195 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
196 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
197 ; FLAT-NEXT: s_mov_b32 s2, -1
198 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
199 ; FLAT-NEXT: s_brev_b32 s4, s6
200 ; FLAT-NEXT: v_mov_b32_e32 v0, s4
201 ; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
202 ; FLAT-NEXT: s_endpgm
204 ; GISEL-LABEL: s_brev_i32:
206 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
207 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
208 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
209 ; GISEL-NEXT: s_brev_b32 s2, s2
210 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
211 ; GISEL-NEXT: v_mov_b32_e32 v2, s2
212 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
213 ; GISEL-NEXT: flat_store_dword v[0:1], v2
214 ; GISEL-NEXT: s_endpgm
216 ; GFX11-FLAT-LABEL: s_brev_i32:
217 ; GFX11-FLAT: ; %bb.0:
218 ; GFX11-FLAT-NEXT: s_clause 0x1
219 ; GFX11-FLAT-NEXT: s_load_b32 s2, s[4:5], 0x2c
220 ; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
221 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
222 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
223 ; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
224 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
225 ; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2
226 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
227 ; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0
228 ; GFX11-FLAT-NEXT: s_endpgm
230 ; GFX11-GISEL-LABEL: s_brev_i32:
231 ; GFX11-GISEL: ; %bb.0:
232 ; GFX11-GISEL-NEXT: s_clause 0x1
233 ; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
234 ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
235 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
236 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
237 ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
238 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
239 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
240 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
241 ; GFX11-GISEL-NEXT: s_endpgm
242 %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
243 store i32 %brev, ptr addrspace(1) %out
; Per-workitem i32 bitreverse: loads element %tid (the workitem id in x) of
; %valptr, reverses it and stores the result to %out. Note that %out is not
; indexed, so the store address does not depend on %tid.
; Every run is expected to select a single v_bfrev_b32. The left shift by 2
; ahead of the load is consistent with scaling %tid by the 4-byte element size.
247 define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
248 ; SI-LABEL: v_brev_i32:
250 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
251 ; SI-NEXT: s_mov_b32 s7, 0xf000
252 ; SI-NEXT: s_mov_b32 s10, 0
253 ; SI-NEXT: s_mov_b32 s11, s7
254 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
255 ; SI-NEXT: s_waitcnt lgkmcnt(0)
256 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
257 ; SI-NEXT: v_mov_b32_e32 v1, 0
258 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
259 ; SI-NEXT: s_mov_b32 s6, -1
260 ; SI-NEXT: s_mov_b32 s4, s0
261 ; SI-NEXT: s_mov_b32 s5, s1
262 ; SI-NEXT: s_waitcnt vmcnt(0)
263 ; SI-NEXT: v_bfrev_b32_e32 v0, v0
264 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
267 ; FLAT-LABEL: v_brev_i32:
269 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
270 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
271 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
272 ; FLAT-NEXT: v_mov_b32_e32 v1, s3
273 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
274 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
275 ; FLAT-NEXT: flat_load_dword v0, v[0:1]
276 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
277 ; FLAT-NEXT: s_mov_b32 s2, -1
278 ; FLAT-NEXT: s_waitcnt vmcnt(0)
279 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
280 ; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
281 ; FLAT-NEXT: s_endpgm
283 ; GISEL-LABEL: v_brev_i32:
285 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
286 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
287 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
288 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
289 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
290 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
291 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
292 ; GISEL-NEXT: flat_load_dword v0, v[0:1]
293 ; GISEL-NEXT: s_waitcnt vmcnt(0)
294 ; GISEL-NEXT: v_bfrev_b32_e32 v2, v0
295 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
296 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
297 ; GISEL-NEXT: flat_store_dword v[0:1], v2
298 ; GISEL-NEXT: s_endpgm
300 ; GFX11-FLAT-LABEL: v_brev_i32:
301 ; GFX11-FLAT: ; %bb.0:
302 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
303 ; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
304 ; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1)
305 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
306 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
307 ; GFX11-FLAT-NEXT: global_load_b32 v0, v0, s[2:3]
308 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
309 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
310 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0)
311 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0
312 ; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0
313 ; GFX11-FLAT-NEXT: s_endpgm
315 ; GFX11-GISEL-LABEL: v_brev_i32:
316 ; GFX11-GISEL: ; %bb.0:
317 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
318 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
319 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
320 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
321 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
322 ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
323 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
324 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0
325 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
326 ; GFX11-GISEL-NEXT: s_endpgm
327 %tid = call i32 @llvm.amdgcn.workitem.id.x()
328 %gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
329 %val = load i32, ptr addrspace(1) %gep
330 %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
331 store i32 %brev, ptr addrspace(1) %out
; Scalar <2 x i32> bitreverse of a kernel argument, stored to %out.
; Every run is expected to split the vector into two independent s_brev_b32
; instructions, one per element, and store both results with a single
; 64-bit store.
335 define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> %val) #0 {
336 ; SI-LABEL: s_brev_v2i32:
338 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
339 ; SI-NEXT: s_mov_b32 s7, 0xf000
340 ; SI-NEXT: s_mov_b32 s6, -1
341 ; SI-NEXT: s_waitcnt lgkmcnt(0)
342 ; SI-NEXT: s_mov_b32 s4, s0
343 ; SI-NEXT: s_mov_b32 s5, s1
344 ; SI-NEXT: s_brev_b32 s0, s3
345 ; SI-NEXT: s_brev_b32 s1, s2
346 ; SI-NEXT: v_mov_b32_e32 v0, s1
347 ; SI-NEXT: v_mov_b32_e32 v1, s0
348 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
351 ; FLAT-LABEL: s_brev_v2i32:
353 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
354 ; FLAT-NEXT: s_mov_b32 s7, 0xf000
355 ; FLAT-NEXT: s_mov_b32 s6, -1
356 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
357 ; FLAT-NEXT: s_mov_b32 s4, s0
358 ; FLAT-NEXT: s_mov_b32 s5, s1
359 ; FLAT-NEXT: s_brev_b32 s0, s3
360 ; FLAT-NEXT: s_brev_b32 s1, s2
361 ; FLAT-NEXT: v_mov_b32_e32 v0, s1
362 ; FLAT-NEXT: v_mov_b32_e32 v1, s0
363 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
364 ; FLAT-NEXT: s_endpgm
366 ; GISEL-LABEL: s_brev_v2i32:
368 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
369 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
370 ; GISEL-NEXT: s_brev_b32 s2, s2
371 ; GISEL-NEXT: s_brev_b32 s3, s3
372 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
373 ; GISEL-NEXT: v_mov_b32_e32 v3, s1
374 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
375 ; GISEL-NEXT: v_mov_b32_e32 v2, s0
376 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
377 ; GISEL-NEXT: s_endpgm
379 ; GFX11-FLAT-LABEL: s_brev_v2i32:
380 ; GFX11-FLAT: ; %bb.0:
381 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
382 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000
383 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1
384 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
385 ; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
386 ; GFX11-FLAT-NEXT: s_brev_b32 s3, s3
387 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
388 ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
389 ; GFX11-FLAT-NEXT: s_mov_b32 s4, s0
390 ; GFX11-FLAT-NEXT: s_mov_b32 s5, s1
391 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
392 ; GFX11-FLAT-NEXT: s_endpgm
394 ; GFX11-GISEL-LABEL: s_brev_v2i32:
395 ; GFX11-GISEL: ; %bb.0:
396 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
397 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
398 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
399 ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
400 ; GFX11-GISEL-NEXT: s_brev_b32 s3, s3
401 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
402 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
403 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
404 ; GFX11-GISEL-NEXT: s_endpgm
405 %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
406 store <2 x i32> %brev, ptr addrspace(1) %out
; Per-workitem <2 x i32> bitreverse: loads element %tid of %valptr, reverses
; each lane and stores the vector to %out (the store address is not indexed
; by %tid).
; Every run is expected to select two v_bfrev_b32 instructions, one per
; element, each writing back to the register it read, so the element order is
; preserved in the 64-bit store.
410 define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
411 ; SI-LABEL: v_brev_v2i32:
413 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
414 ; SI-NEXT: s_mov_b32 s7, 0xf000
415 ; SI-NEXT: s_mov_b32 s10, 0
416 ; SI-NEXT: s_mov_b32 s11, s7
417 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
418 ; SI-NEXT: s_waitcnt lgkmcnt(0)
419 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
420 ; SI-NEXT: v_mov_b32_e32 v1, 0
421 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
422 ; SI-NEXT: s_mov_b32 s6, -1
423 ; SI-NEXT: s_mov_b32 s4, s0
424 ; SI-NEXT: s_mov_b32 s5, s1
425 ; SI-NEXT: s_waitcnt vmcnt(0)
426 ; SI-NEXT: v_bfrev_b32_e32 v1, v1
427 ; SI-NEXT: v_bfrev_b32_e32 v0, v0
428 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
431 ; FLAT-LABEL: v_brev_v2i32:
433 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
434 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
435 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
436 ; FLAT-NEXT: v_mov_b32_e32 v1, s3
437 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
438 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
439 ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
440 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
441 ; FLAT-NEXT: s_mov_b32 s2, -1
442 ; FLAT-NEXT: s_waitcnt vmcnt(0)
443 ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1
444 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
445 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
446 ; FLAT-NEXT: s_endpgm
448 ; GISEL-LABEL: v_brev_v2i32:
450 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
451 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
452 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
453 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
454 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
455 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
456 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
457 ; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
458 ; GISEL-NEXT: v_mov_b32_e32 v3, s1
459 ; GISEL-NEXT: v_mov_b32_e32 v2, s0
460 ; GISEL-NEXT: s_waitcnt vmcnt(0)
461 ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
462 ; GISEL-NEXT: v_bfrev_b32_e32 v1, v1
463 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
464 ; GISEL-NEXT: s_endpgm
466 ; GFX11-FLAT-LABEL: v_brev_v2i32:
467 ; GFX11-FLAT: ; %bb.0:
468 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
469 ; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
470 ; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1)
471 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
472 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
473 ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3]
474 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
475 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
476 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0)
477 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1
478 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0
479 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
480 ; GFX11-FLAT-NEXT: s_endpgm
482 ; GFX11-GISEL-LABEL: v_brev_v2i32:
483 ; GFX11-GISEL: ; %bb.0:
484 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
485 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
486 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
487 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
488 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
489 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
490 ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3]
491 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
492 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0
493 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1
494 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
495 ; GFX11-GISEL-NEXT: s_endpgm
496 %tid = call i32 @llvm.amdgcn.workitem.id.x()
497 %gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
498 %val = load <2 x i32>, ptr addrspace(1) %gep
499 %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
500 store <2 x i32> %brev, ptr addrspace(1) %out
; Scalar i64 bitreverse of a kernel argument, stored to %out.
; Every run is expected to select a single 64-bit s_brev_b64 on the SGPR pair
; holding %val, then copy both halves to VGPRs for one 64-bit store.
504 define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #0 {
505 ; SI-LABEL: s_brev_i64:
507 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
508 ; SI-NEXT: s_mov_b32 s7, 0xf000
509 ; SI-NEXT: s_mov_b32 s6, -1
510 ; SI-NEXT: s_waitcnt lgkmcnt(0)
511 ; SI-NEXT: s_mov_b32 s4, s0
512 ; SI-NEXT: s_mov_b32 s5, s1
513 ; SI-NEXT: s_brev_b64 s[0:1], s[2:3]
514 ; SI-NEXT: v_mov_b32_e32 v0, s0
515 ; SI-NEXT: v_mov_b32_e32 v1, s1
516 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
519 ; FLAT-LABEL: s_brev_i64:
521 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
522 ; FLAT-NEXT: s_mov_b32 s7, 0xf000
523 ; FLAT-NEXT: s_mov_b32 s6, -1
524 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
525 ; FLAT-NEXT: s_mov_b32 s4, s0
526 ; FLAT-NEXT: s_mov_b32 s5, s1
527 ; FLAT-NEXT: s_brev_b64 s[0:1], s[2:3]
528 ; FLAT-NEXT: v_mov_b32_e32 v0, s0
529 ; FLAT-NEXT: v_mov_b32_e32 v1, s1
530 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
531 ; FLAT-NEXT: s_endpgm
533 ; GISEL-LABEL: s_brev_i64:
535 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
536 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
537 ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3]
538 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
539 ; GISEL-NEXT: v_mov_b32_e32 v3, s1
540 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
541 ; GISEL-NEXT: v_mov_b32_e32 v2, s0
542 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
543 ; GISEL-NEXT: s_endpgm
545 ; GFX11-FLAT-LABEL: s_brev_i64:
546 ; GFX11-FLAT: ; %bb.0:
547 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
548 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
549 ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[2:3]
550 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
551 ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
552 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
553 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
554 ; GFX11-FLAT-NEXT: s_endpgm
556 ; GFX11-GISEL-LABEL: s_brev_i64:
557 ; GFX11-GISEL: ; %bb.0:
558 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
559 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
560 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
561 ; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[2:3]
562 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
563 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
564 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
565 ; GFX11-GISEL-NEXT: s_endpgm
566 %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
567 store i64 %brev, ptr addrspace(1) %out
; Per-workitem i64 bitreverse: loads element %tid of %valptr, reverses it and
; stores the result to %out (the store address is not indexed by %tid).
; No 64-bit VALU reverse is expected. Every run checks two v_bfrev_b32
; instructions with the halves exchanged: the reversed high dword of the input
; becomes the low dword of the stored pair and vice versa (e.g. v2 = bfrev(v0),
; v1 = bfrev(v1), then v[1:2] is stored).
571 define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
572 ; SI-LABEL: v_brev_i64:
574 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
575 ; SI-NEXT: s_mov_b32 s7, 0xf000
576 ; SI-NEXT: s_mov_b32 s10, 0
577 ; SI-NEXT: s_mov_b32 s11, s7
578 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
579 ; SI-NEXT: s_waitcnt lgkmcnt(0)
580 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
581 ; SI-NEXT: v_mov_b32_e32 v1, 0
582 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
583 ; SI-NEXT: s_mov_b32 s6, -1
584 ; SI-NEXT: s_mov_b32 s4, s0
585 ; SI-NEXT: s_mov_b32 s5, s1
586 ; SI-NEXT: s_waitcnt vmcnt(0)
587 ; SI-NEXT: v_bfrev_b32_e32 v2, v0
588 ; SI-NEXT: v_bfrev_b32_e32 v1, v1
589 ; SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
592 ; FLAT-LABEL: v_brev_i64:
594 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
595 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
596 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
597 ; FLAT-NEXT: v_mov_b32_e32 v1, s3
598 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
599 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
600 ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
601 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
602 ; FLAT-NEXT: s_mov_b32 s2, -1
603 ; FLAT-NEXT: s_waitcnt vmcnt(0)
604 ; FLAT-NEXT: v_bfrev_b32_e32 v2, v0
605 ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1
606 ; FLAT-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
607 ; FLAT-NEXT: s_endpgm
609 ; GISEL-LABEL: v_brev_i64:
611 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
612 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
613 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
614 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
615 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
616 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
617 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
618 ; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
619 ; GISEL-NEXT: v_mov_b32_e32 v4, s1
620 ; GISEL-NEXT: v_mov_b32_e32 v3, s0
621 ; GISEL-NEXT: s_waitcnt vmcnt(0)
622 ; GISEL-NEXT: v_bfrev_b32_e32 v1, v1
623 ; GISEL-NEXT: v_bfrev_b32_e32 v2, v0
624 ; GISEL-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
625 ; GISEL-NEXT: s_endpgm
627 ; GFX11-FLAT-LABEL: v_brev_i64:
628 ; GFX11-FLAT: ; %bb.0:
629 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
630 ; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
631 ; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1)
632 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
633 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
634 ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3]
635 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
636 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
637 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0)
638 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v2, v0
639 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1
640 ; GFX11-FLAT-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
641 ; GFX11-FLAT-NEXT: s_endpgm
643 ; GFX11-GISEL-LABEL: v_brev_i64:
644 ; GFX11-GISEL: ; %bb.0:
645 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
646 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
647 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
648 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
649 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
650 ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3]
651 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
652 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1
653 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v2, v0
654 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
655 ; GFX11-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1]
656 ; GFX11-GISEL-NEXT: s_endpgm
657 %tid = call i32 @llvm.amdgcn.workitem.id.x()
658 %gep = getelementptr i64, ptr addrspace(1) %valptr, i32 %tid
659 %val = load i64, ptr addrspace(1) %gep
660 %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
661 store i64 %brev, ptr addrspace(1) %out
; Scalar <2 x i64> bitreverse of a kernel argument, stored to %out.
; Every run is expected to select two s_brev_b64 instructions, one per 64-bit
; element, each operating in place on its SGPR pair, followed by a single
; 128-bit store.
665 define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) #0 {
666 ; SI-LABEL: s_brev_v2i64:
668 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
669 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
670 ; SI-NEXT: s_mov_b32 s7, 0xf000
671 ; SI-NEXT: s_mov_b32 s6, -1
672 ; SI-NEXT: s_waitcnt lgkmcnt(0)
673 ; SI-NEXT: s_brev_b64 s[2:3], s[2:3]
674 ; SI-NEXT: s_brev_b64 s[0:1], s[0:1]
675 ; SI-NEXT: v_mov_b32_e32 v0, s0
676 ; SI-NEXT: v_mov_b32_e32 v1, s1
677 ; SI-NEXT: v_mov_b32_e32 v2, s2
678 ; SI-NEXT: v_mov_b32_e32 v3, s3
679 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
682 ; FLAT-LABEL: s_brev_v2i64:
684 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
685 ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
686 ; FLAT-NEXT: s_mov_b32 s7, 0xf000
687 ; FLAT-NEXT: s_mov_b32 s6, -1
688 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
689 ; FLAT-NEXT: s_brev_b64 s[2:3], s[2:3]
690 ; FLAT-NEXT: s_brev_b64 s[0:1], s[0:1]
691 ; FLAT-NEXT: v_mov_b32_e32 v0, s0
692 ; FLAT-NEXT: v_mov_b32_e32 v1, s1
693 ; FLAT-NEXT: v_mov_b32_e32 v2, s2
694 ; FLAT-NEXT: v_mov_b32_e32 v3, s3
695 ; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
696 ; FLAT-NEXT: s_endpgm
698 ; GISEL-LABEL: s_brev_v2i64:
700 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
701 ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
702 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
703 ; GISEL-NEXT: s_brev_b64 s[0:1], s[0:1]
704 ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3]
705 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
706 ; GISEL-NEXT: v_mov_b32_e32 v4, s4
707 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
708 ; GISEL-NEXT: v_mov_b32_e32 v2, s2
709 ; GISEL-NEXT: v_mov_b32_e32 v3, s3
710 ; GISEL-NEXT: v_mov_b32_e32 v5, s5
711 ; GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
712 ; GISEL-NEXT: s_endpgm
714 ; GFX11-FLAT-LABEL: s_brev_v2i64:
715 ; GFX11-FLAT: ; %bb.0:
716 ; GFX11-FLAT-NEXT: s_clause 0x1
717 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
718 ; GFX11-FLAT-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
719 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000
720 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1
721 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
722 ; GFX11-FLAT-NEXT: s_brev_b64 s[0:1], s[0:1]
723 ; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[2:3]
724 ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
725 ; GFX11-FLAT-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
726 ; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
727 ; GFX11-FLAT-NEXT: s_endpgm
729 ; GFX11-GISEL-LABEL: s_brev_v2i64:
730 ; GFX11-GISEL: ; %bb.0:
731 ; GFX11-GISEL-NEXT: s_clause 0x1
732 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
733 ; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
734 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0
735 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
736 ; GFX11-GISEL-NEXT: s_brev_b64 s[0:1], s[0:1]
737 ; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[2:3]
738 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
739 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
740 ; GFX11-GISEL-NEXT: global_store_b128 v4, v[0:3], s[4:5]
741 ; GFX11-GISEL-NEXT: s_endpgm
742 %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
743 store <2 x i64> %brev, ptr addrspace(1) %out
; Per-workitem <2 x i64> bitreverse: loads element %tid of %valptr, reverses
; each 64-bit lane and stores the vector to %out (the store address is not
; indexed by %tid).
; Every run is expected to select four v_bfrev_b32 instructions. Within each
; 64-bit element the two dwords are exchanged as well as reversed, so the
; stored register tuple is built from bfrev(in1), bfrev(in0), bfrev(in3),
; bfrev(in2) in that order.
747 define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
748 ; SI-LABEL: v_brev_v2i64:
750 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
751 ; SI-NEXT: s_mov_b32 s7, 0xf000
752 ; SI-NEXT: s_mov_b32 s10, 0
753 ; SI-NEXT: s_mov_b32 s11, s7
754 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
755 ; SI-NEXT: s_waitcnt lgkmcnt(0)
756 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
757 ; SI-NEXT: v_mov_b32_e32 v1, 0
758 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
759 ; SI-NEXT: s_mov_b32 s6, -1
760 ; SI-NEXT: s_mov_b32 s4, s0
761 ; SI-NEXT: s_mov_b32 s5, s1
762 ; SI-NEXT: s_waitcnt vmcnt(0)
763 ; SI-NEXT: v_bfrev_b32_e32 v4, v2
764 ; SI-NEXT: v_bfrev_b32_e32 v3, v3
765 ; SI-NEXT: v_bfrev_b32_e32 v2, v0
766 ; SI-NEXT: v_bfrev_b32_e32 v1, v1
767 ; SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0
770 ; FLAT-LABEL: v_brev_v2i64:
772 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
773 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
774 ; FLAT-NEXT: s_waitcnt lgkmcnt(0)
775 ; FLAT-NEXT: v_mov_b32_e32 v1, s3
776 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
777 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
778 ; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
779 ; FLAT-NEXT: s_mov_b32 s3, 0xf000
780 ; FLAT-NEXT: s_mov_b32 s2, -1
781 ; FLAT-NEXT: s_waitcnt vmcnt(0)
782 ; FLAT-NEXT: v_bfrev_b32_e32 v4, v2
783 ; FLAT-NEXT: v_bfrev_b32_e32 v3, v3
784 ; FLAT-NEXT: v_bfrev_b32_e32 v2, v0
785 ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1
786 ; FLAT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
787 ; FLAT-NEXT: s_endpgm
789 ; GISEL-LABEL: v_brev_v2i64:
791 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
792 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
793 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
794 ; GISEL-NEXT: v_mov_b32_e32 v0, s2
795 ; GISEL-NEXT: v_mov_b32_e32 v1, s3
796 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
797 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
798 ; GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
799 ; GISEL-NEXT: s_waitcnt vmcnt(0)
800 ; GISEL-NEXT: v_bfrev_b32_e32 v4, v1
801 ; GISEL-NEXT: v_bfrev_b32_e32 v5, v0
802 ; GISEL-NEXT: v_mov_b32_e32 v0, s0
803 ; GISEL-NEXT: v_bfrev_b32_e32 v6, v3
804 ; GISEL-NEXT: v_bfrev_b32_e32 v7, v2
805 ; GISEL-NEXT: v_mov_b32_e32 v1, s1
806 ; GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
807 ; GISEL-NEXT: s_endpgm
809 ; GFX11-FLAT-LABEL: v_brev_v2i64:
810 ; GFX11-FLAT: ; %bb.0:
811 ; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
812 ; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
813 ; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1)
814 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
815 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
816 ; GFX11-FLAT-NEXT: global_load_b128 v[0:3], v0, s[2:3]
817 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
818 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
819 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0)
820 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v4, v2
821 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v3, v3
822 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v2, v0
823 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1
824 ; GFX11-FLAT-NEXT: buffer_store_b128 v[1:4], off, s[0:3], 0
825 ; GFX11-FLAT-NEXT: s_endpgm
827 ; GFX11-GISEL-LABEL: v_brev_v2i64:
828 ; GFX11-GISEL: ; %bb.0:
829 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
830 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
831 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
832 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
833 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
834 ; GFX11-GISEL-NEXT: global_load_b128 v[0:3], v0, s[2:3]
835 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
836 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v4, v1
837 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v5, v0
838 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v6, v3
839 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v7, v2
840 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
841 ; GFX11-GISEL-NEXT: global_store_b128 v0, v[4:7], s[0:1]
842 ; GFX11-GISEL-NEXT: s_endpgm
843 %tid = call i32 @llvm.amdgcn.workitem.id.x()
844 %gep = getelementptr <2 x i64> , ptr addrspace(1) %valptr, i32 %tid
845 %val = load <2 x i64>, ptr addrspace(1) %gep
846 %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
847 store <2 x i64> %brev, ptr addrspace(1) %out
; Non-kernel function (no amdgpu_kernel, returns via s_setpc_b64): truncates
; an i32 argument to i16, bit-reverses it, reinterprets the bits as half and
; extends to float.
; Every run is expected to apply v_bfrev_b32 to the full 32-bit input and feed
; the upper 16 bits of the result to the f16-to-f32 convert: via an explicit
; right shift by 16 on tahiti and gfx1100, or via an SDWA convert with
; src0_sel:WORD_1 on the tonga/fiji runs.
; NOTE(review): judging by the name, this is presumably a regression test for
; a truncate that went missing when the i16 bitreverse was promoted to i32;
; confirm against the commit that introduced it.
851 define float @missing_truncate_promote_bitreverse(i32 %arg) {
852 ; SI-LABEL: missing_truncate_promote_bitreverse:
854 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
855 ; SI-NEXT: v_bfrev_b32_e32 v0, v0
856 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
857 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
858 ; SI-NEXT: s_setpc_b64 s[30:31]
860 ; FLAT-LABEL: missing_truncate_promote_bitreverse:
861 ; FLAT: ; %bb.0: ; %bb
862 ; FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
863 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0
864 ; FLAT-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
865 ; FLAT-NEXT: s_setpc_b64 s[30:31]
867 ; GISEL-LABEL: missing_truncate_promote_bitreverse:
868 ; GISEL: ; %bb.0: ; %bb
869 ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870 ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0
871 ; GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
872 ; GISEL-NEXT: s_setpc_b64 s[30:31]
874 ; GFX11-FLAT-LABEL: missing_truncate_promote_bitreverse:
875 ; GFX11-FLAT: ; %bb.0: ; %bb
876 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
877 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0
878 ; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
879 ; GFX11-FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0
880 ; GFX11-FLAT-NEXT: v_cvt_f32_f16_e32 v0, v0
881 ; GFX11-FLAT-NEXT: s_setpc_b64 s[30:31]
883 ; GFX11-GISEL-LABEL: missing_truncate_promote_bitreverse:
884 ; GFX11-GISEL: ; %bb.0: ; %bb
885 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
886 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0
887 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
888 ; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v0
889 ; GFX11-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
890 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
892 %tmp = trunc i32 %arg to i16
893 %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)
894 %tmp2 = bitcast i16 %tmp1 to half
895 %tmp3 = fpext half %tmp2 to float
; Attribute groups referenced by this file: #0 is attached to the
; amdgpu_kernel test definitions, #1 to the intrinsic declarations and to the
; bitreverse call sites inside those kernels.
899 attributes #0 = { nounwind }
900 attributes #1 = { nounwind readnone }