1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; Unsigned bitfield-extract idiom with per-work-item operands loaded from memory:
;   %bfe = lshr (shl %src, 32 - %width), 32 - %width
; The shl has a single use here, and the checks for both run lines expect the
; shift pair to be selected as one v_bfe_u32 with offset 0.
; NOTE(review): %in1.gep is computed but never used; %width is loaded from
; %in0.gep, the same address as %src. Presumably %in1.gep was intended -- confirm
; before changing, since the autogenerated checks would need to be regenerated.
5 define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
6 ; SI-LABEL: v_ubfe_sub_i32:
8 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
9 ; SI-NEXT: s_mov_b32 s7, 0xf000
10 ; SI-NEXT: s_mov_b32 s6, 0
11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
12 ; SI-NEXT: v_mov_b32_e32 v1, 0
13 ; SI-NEXT: s_waitcnt lgkmcnt(0)
14 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
15 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
16 ; SI-NEXT: s_waitcnt vmcnt(0)
17 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
18 ; SI-NEXT: s_waitcnt vmcnt(0)
19 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
20 ; SI-NEXT: v_bfe_u32 v2, v2, 0, v3
21 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
24 ; VI-LABEL: v_ubfe_sub_i32:
26 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
27 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
28 ; VI-NEXT: s_waitcnt lgkmcnt(0)
29 ; VI-NEXT: v_mov_b32_e32 v1, s3
30 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
31 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
32 ; VI-NEXT: flat_load_dword v3, v[0:1] glc
33 ; VI-NEXT: s_waitcnt vmcnt(0)
34 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
35 ; VI-NEXT: s_waitcnt vmcnt(0)
36 ; VI-NEXT: v_mov_b32_e32 v1, s1
37 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
38 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
39 ; VI-NEXT: v_bfe_u32 v2, v3, 0, v4
40 ; VI-NEXT: flat_store_dword v[0:1], v2
42 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
43 %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
44 %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
45 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
46 %src = load volatile i32, ptr addrspace(1) %in0.gep
47 %width = load volatile i32, ptr addrspace(1) %in0.gep
; Shift left, then logically right, by the same amount (32 - %width).
48 %sub = sub i32 32, %width
49 %shl = shl i32 %src, %sub
50 %bfe = lshr i32 %shl, %sub
51 store i32 %bfe, ptr addrspace(1) %out.gep
; Same shl/lshr-by-(32 - %width) idiom as the single-use vector test, but %shl
; is additionally stored with a volatile store, giving it a second use. The
; checks for both run lines expect the separate subtract, left shift and
; logical right shift to be kept rather than a v_bfe_u32.
; NOTE(review): %in1.gep is computed but never used; %width is loaded from
; %in0.gep, the same address as %src. Presumably %in1.gep was intended -- confirm
; before changing, since the autogenerated checks would need to be regenerated.
55 define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
56 ; SI-LABEL: v_ubfe_sub_multi_use_shl_i32:
58 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
59 ; SI-NEXT: s_mov_b32 s6, 0
60 ; SI-NEXT: s_mov_b32 s7, 0xf000
61 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
62 ; SI-NEXT: v_mov_b32_e32 v1, 0
63 ; SI-NEXT: s_waitcnt lgkmcnt(0)
64 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
65 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
66 ; SI-NEXT: s_waitcnt vmcnt(0)
67 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
68 ; SI-NEXT: s_waitcnt vmcnt(0)
69 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
70 ; SI-NEXT: s_mov_b32 s6, -1
71 ; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
72 ; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
73 ; SI-NEXT: v_lshrrev_b32_e32 v3, v3, v2
74 ; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
75 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
76 ; SI-NEXT: s_waitcnt vmcnt(0)
79 ; VI-LABEL: v_ubfe_sub_multi_use_shl_i32:
81 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
82 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
83 ; VI-NEXT: s_waitcnt lgkmcnt(0)
84 ; VI-NEXT: v_mov_b32_e32 v1, s3
85 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
86 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
87 ; VI-NEXT: flat_load_dword v3, v[0:1] glc
88 ; VI-NEXT: s_waitcnt vmcnt(0)
89 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
90 ; VI-NEXT: s_waitcnt vmcnt(0)
91 ; VI-NEXT: v_mov_b32_e32 v1, s1
92 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
93 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
94 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
95 ; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
96 ; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
97 ; VI-NEXT: flat_store_dword v[0:1], v2
98 ; VI-NEXT: flat_store_dword v[0:1], v3
99 ; VI-NEXT: s_waitcnt vmcnt(0)
101 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
102 %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
103 %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
104 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
105 %src = load volatile i32, ptr addrspace(1) %in0.gep
106 %width = load volatile i32, ptr addrspace(1) %in0.gep
; Shift left, then logically right, by the same amount (32 - %width).
107 %sub = sub i32 32, %width
108 %shl = shl i32 %src, %sub
109 %bfe = lshr i32 %shl, %sub
110 store i32 %bfe, ptr addrspace(1) %out.gep
; Second use of the intermediate %shl; the store is volatile and targets undef.
111 store volatile i32 %shl, ptr addrspace(1) undef
; Unsigned shl/lshr-by-(32 - %width) idiom where %src and %width are i32 kernel
; arguments rather than loaded per-work-item values. The checks for both run
; lines expect scalar s_sub_i32 / s_lshl_b32 / s_lshr_b32 instructions, i.e. no
; bitfield-extract instruction is formed for this variant.
115 define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
116 ; SI-LABEL: s_ubfe_sub_i32:
118 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
119 ; SI-NEXT: s_mov_b32 s7, 0xf000
120 ; SI-NEXT: s_mov_b32 s6, 0
121 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
122 ; SI-NEXT: s_waitcnt lgkmcnt(0)
123 ; SI-NEXT: s_sub_i32 s3, 32, s3
124 ; SI-NEXT: s_lshl_b32 s2, s2, s3
125 ; SI-NEXT: s_lshr_b32 s2, s2, s3
126 ; SI-NEXT: v_mov_b32_e32 v1, 0
127 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
128 ; SI-NEXT: v_mov_b32_e32 v2, s2
129 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
132 ; VI-LABEL: s_ubfe_sub_i32:
134 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
135 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
136 ; VI-NEXT: s_waitcnt lgkmcnt(0)
137 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
138 ; VI-NEXT: s_sub_i32 s0, 32, s3
139 ; VI-NEXT: v_mov_b32_e32 v1, s1
140 ; VI-NEXT: s_lshl_b32 s1, s2, s0
141 ; VI-NEXT: s_lshr_b32 s0, s1, s0
142 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
143 ; VI-NEXT: v_mov_b32_e32 v2, s0
144 ; VI-NEXT: flat_store_dword v[0:1], v2
146 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
147 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
; Shift left, then logically right, by the same amount (32 - %width).
148 %sub = sub i32 32, %width
149 %shl = shl i32 %src, %sub
150 %bfe = lshr i32 %shl, %sub
151 store i32 %bfe, ptr addrspace(1) %out.gep
; Kernel-argument variant of the unsigned shl/lshr-by-(32 - %width) idiom in
; which %shl has a second use (a volatile store). The checks for both run lines
; expect scalar s_sub_i32 / s_lshl_b32 / s_lshr_b32, with the shifted
; intermediate and the final result each written by their own store.
155 define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
156 ; SI-LABEL: s_ubfe_sub_multi_use_shl_i32:
158 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
159 ; SI-NEXT: s_mov_b32 s6, 0
160 ; SI-NEXT: s_mov_b32 s7, 0xf000
161 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
162 ; SI-NEXT: s_waitcnt lgkmcnt(0)
163 ; SI-NEXT: s_sub_i32 s3, 32, s3
164 ; SI-NEXT: s_lshl_b32 s2, s2, s3
165 ; SI-NEXT: s_lshr_b32 s3, s2, s3
166 ; SI-NEXT: v_mov_b32_e32 v1, 0
167 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
168 ; SI-NEXT: v_mov_b32_e32 v2, s3
169 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
170 ; SI-NEXT: s_mov_b32 s6, -1
171 ; SI-NEXT: v_mov_b32_e32 v0, s2
172 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
173 ; SI-NEXT: s_waitcnt vmcnt(0)
176 ; VI-LABEL: s_ubfe_sub_multi_use_shl_i32:
178 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
179 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
180 ; VI-NEXT: s_waitcnt lgkmcnt(0)
181 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
182 ; VI-NEXT: s_sub_i32 s0, 32, s3
183 ; VI-NEXT: v_mov_b32_e32 v1, s1
184 ; VI-NEXT: s_lshl_b32 s1, s2, s0
185 ; VI-NEXT: s_lshr_b32 s0, s1, s0
186 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
187 ; VI-NEXT: v_mov_b32_e32 v2, s0
188 ; VI-NEXT: flat_store_dword v[0:1], v2
189 ; VI-NEXT: v_mov_b32_e32 v0, s1
190 ; VI-NEXT: flat_store_dword v[0:1], v0
191 ; VI-NEXT: s_waitcnt vmcnt(0)
193 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
194 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
; Shift left, then logically right, by the same amount (32 - %width).
195 %sub = sub i32 32, %width
196 %shl = shl i32 %src, %sub
197 %bfe = lshr i32 %shl, %sub
198 store i32 %bfe, ptr addrspace(1) %out.gep
; Second use of the intermediate %shl; the store is volatile and targets undef.
199 store volatile i32 %shl, ptr addrspace(1) undef
; Signed bitfield-extract idiom with per-work-item operands loaded from memory:
;   %bfe = ashr (shl %src, 32 - %width), 32 - %width
; The shl has a single use here, and the checks for both run lines expect the
; shift pair to be selected as one v_bfe_i32 with offset 0.
; NOTE(review): %in1.gep is computed but never used; %width is loaded from
; %in0.gep, the same address as %src. Presumably %in1.gep was intended -- confirm
; before changing, since the autogenerated checks would need to be regenerated.
203 define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
204 ; SI-LABEL: v_sbfe_sub_i32:
206 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
207 ; SI-NEXT: s_mov_b32 s7, 0xf000
208 ; SI-NEXT: s_mov_b32 s6, 0
209 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
210 ; SI-NEXT: v_mov_b32_e32 v1, 0
211 ; SI-NEXT: s_waitcnt lgkmcnt(0)
212 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
213 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
214 ; SI-NEXT: s_waitcnt vmcnt(0)
215 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
216 ; SI-NEXT: s_waitcnt vmcnt(0)
217 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
218 ; SI-NEXT: v_bfe_i32 v2, v2, 0, v3
219 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
222 ; VI-LABEL: v_sbfe_sub_i32:
224 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
225 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
226 ; VI-NEXT: s_waitcnt lgkmcnt(0)
227 ; VI-NEXT: v_mov_b32_e32 v1, s3
228 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
229 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
230 ; VI-NEXT: flat_load_dword v3, v[0:1] glc
231 ; VI-NEXT: s_waitcnt vmcnt(0)
232 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
233 ; VI-NEXT: s_waitcnt vmcnt(0)
234 ; VI-NEXT: v_mov_b32_e32 v1, s1
235 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
236 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
237 ; VI-NEXT: v_bfe_i32 v2, v3, 0, v4
238 ; VI-NEXT: flat_store_dword v[0:1], v2
240 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
241 %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
242 %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
243 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
244 %src = load volatile i32, ptr addrspace(1) %in0.gep
245 %width = load volatile i32, ptr addrspace(1) %in0.gep
; Shift left, then arithmetically right, by the same amount (32 - %width).
246 %sub = sub i32 32, %width
247 %shl = shl i32 %src, %sub
248 %bfe = ashr i32 %shl, %sub
249 store i32 %bfe, ptr addrspace(1) %out.gep
; Same shl/ashr-by-(32 - %width) idiom as the single-use signed vector test, but
; %shl is additionally stored with a volatile store, giving it a second use.
; The checks for both run lines expect the separate subtract, left shift and
; arithmetic right shift to be kept rather than a v_bfe_i32.
; NOTE(review): %in1.gep is computed but never used; %width is loaded from
; %in0.gep, the same address as %src. Presumably %in1.gep was intended -- confirm
; before changing, since the autogenerated checks would need to be regenerated.
253 define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
254 ; SI-LABEL: v_sbfe_sub_multi_use_shl_i32:
256 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
257 ; SI-NEXT: s_mov_b32 s6, 0
258 ; SI-NEXT: s_mov_b32 s7, 0xf000
259 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
260 ; SI-NEXT: v_mov_b32_e32 v1, 0
261 ; SI-NEXT: s_waitcnt lgkmcnt(0)
262 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
263 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
264 ; SI-NEXT: s_waitcnt vmcnt(0)
265 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
266 ; SI-NEXT: s_waitcnt vmcnt(0)
267 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
268 ; SI-NEXT: s_mov_b32 s6, -1
269 ; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
270 ; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
271 ; SI-NEXT: v_ashrrev_i32_e32 v3, v3, v2
272 ; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
273 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
274 ; SI-NEXT: s_waitcnt vmcnt(0)
277 ; VI-LABEL: v_sbfe_sub_multi_use_shl_i32:
279 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
280 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
281 ; VI-NEXT: s_waitcnt lgkmcnt(0)
282 ; VI-NEXT: v_mov_b32_e32 v1, s3
283 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
284 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
285 ; VI-NEXT: flat_load_dword v3, v[0:1] glc
286 ; VI-NEXT: s_waitcnt vmcnt(0)
287 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
288 ; VI-NEXT: s_waitcnt vmcnt(0)
289 ; VI-NEXT: v_mov_b32_e32 v1, s1
290 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
291 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
292 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
293 ; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
294 ; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
295 ; VI-NEXT: flat_store_dword v[0:1], v2
296 ; VI-NEXT: flat_store_dword v[0:1], v3
297 ; VI-NEXT: s_waitcnt vmcnt(0)
299 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
300 %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
301 %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
302 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
303 %src = load volatile i32, ptr addrspace(1) %in0.gep
304 %width = load volatile i32, ptr addrspace(1) %in0.gep
; Shift left, then arithmetically right, by the same amount (32 - %width).
305 %sub = sub i32 32, %width
306 %shl = shl i32 %src, %sub
307 %bfe = ashr i32 %shl, %sub
308 store i32 %bfe, ptr addrspace(1) %out.gep
; Second use of the intermediate %shl; the store is volatile and targets undef.
309 store volatile i32 %shl, ptr addrspace(1) undef
; Signed shl/ashr-by-(32 - %width) idiom where %src and %width are i32 kernel
; arguments rather than loaded per-work-item values. The checks for both run
; lines expect scalar s_sub_i32 / s_lshl_b32 / s_ashr_i32 instructions, i.e. no
; bitfield-extract instruction is formed for this variant.
313 define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
314 ; SI-LABEL: s_sbfe_sub_i32:
316 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
317 ; SI-NEXT: s_mov_b32 s7, 0xf000
318 ; SI-NEXT: s_mov_b32 s6, 0
319 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
320 ; SI-NEXT: s_waitcnt lgkmcnt(0)
321 ; SI-NEXT: s_sub_i32 s3, 32, s3
322 ; SI-NEXT: s_lshl_b32 s2, s2, s3
323 ; SI-NEXT: s_ashr_i32 s2, s2, s3
324 ; SI-NEXT: v_mov_b32_e32 v1, 0
325 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
326 ; SI-NEXT: v_mov_b32_e32 v2, s2
327 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
330 ; VI-LABEL: s_sbfe_sub_i32:
332 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
333 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
334 ; VI-NEXT: s_waitcnt lgkmcnt(0)
335 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
336 ; VI-NEXT: s_sub_i32 s0, 32, s3
337 ; VI-NEXT: v_mov_b32_e32 v1, s1
338 ; VI-NEXT: s_lshl_b32 s1, s2, s0
339 ; VI-NEXT: s_ashr_i32 s0, s1, s0
340 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
341 ; VI-NEXT: v_mov_b32_e32 v2, s0
342 ; VI-NEXT: flat_store_dword v[0:1], v2
344 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
345 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
; Shift left, then arithmetically right, by the same amount (32 - %width).
346 %sub = sub i32 32, %width
347 %shl = shl i32 %src, %sub
348 %bfe = ashr i32 %shl, %sub
349 store i32 %bfe, ptr addrspace(1) %out.gep
; Kernel-argument variant of the signed shl/ashr-by-(32 - %width) idiom in which
; %shl has a second use (a volatile store). The checks for both run lines
; expect scalar s_sub_i32 / s_lshl_b32 / s_ashr_i32, with the shifted
; intermediate and the final result each written by their own store.
353 define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
354 ; SI-LABEL: s_sbfe_sub_multi_use_shl_i32:
356 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
357 ; SI-NEXT: s_mov_b32 s6, 0
358 ; SI-NEXT: s_mov_b32 s7, 0xf000
359 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
360 ; SI-NEXT: s_waitcnt lgkmcnt(0)
361 ; SI-NEXT: s_sub_i32 s3, 32, s3
362 ; SI-NEXT: s_lshl_b32 s2, s2, s3
363 ; SI-NEXT: s_ashr_i32 s3, s2, s3
364 ; SI-NEXT: v_mov_b32_e32 v1, 0
365 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
366 ; SI-NEXT: v_mov_b32_e32 v2, s3
367 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
368 ; SI-NEXT: s_mov_b32 s6, -1
369 ; SI-NEXT: v_mov_b32_e32 v0, s2
370 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
371 ; SI-NEXT: s_waitcnt vmcnt(0)
374 ; VI-LABEL: s_sbfe_sub_multi_use_shl_i32:
376 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
377 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
378 ; VI-NEXT: s_waitcnt lgkmcnt(0)
379 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
380 ; VI-NEXT: s_sub_i32 s0, 32, s3
381 ; VI-NEXT: v_mov_b32_e32 v1, s1
382 ; VI-NEXT: s_lshl_b32 s1, s2, s0
383 ; VI-NEXT: s_ashr_i32 s0, s1, s0
384 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
385 ; VI-NEXT: v_mov_b32_e32 v2, s0
386 ; VI-NEXT: flat_store_dword v[0:1], v2
387 ; VI-NEXT: v_mov_b32_e32 v0, s1
388 ; VI-NEXT: flat_store_dword v[0:1], v0
389 ; VI-NEXT: s_waitcnt vmcnt(0)
391 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
392 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
; Shift left, then arithmetically right, by the same amount (32 - %width).
393 %sub = sub i32 32, %width
394 %shl = shl i32 %src, %sub
395 %bfe = ashr i32 %shl, %sub
396 store i32 %bfe, ptr addrspace(1) %out.gep
; Second use of the intermediate %shl; the store is volatile and targets undef.
397 store volatile i32 %shl, ptr addrspace(1) undef
; ashr(or(shl(a, 17), shl(b, 17)), 17) with both values loaded through the
; kernel's pointer arguments (no work-item id is involved). Both shl amounts
; equal the ashr amount, and the checks for both run lines expect a plain
; s_or_b32 of the loaded values followed by a single s_bfe_i32.
; NOTE(review): the 0xf0000 operand presumably encodes width 15 (= 32 - 17) at
; offset 0 -- verify against the s_bfe_i32 operand encoding in the ISA manual.
401 define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
402 ; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
404 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
405 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
406 ; SI-NEXT: s_waitcnt lgkmcnt(0)
407 ; SI-NEXT: s_load_dword s2, s[6:7], 0x0
408 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0
409 ; SI-NEXT: s_mov_b32 s7, 0xf000
410 ; SI-NEXT: s_waitcnt lgkmcnt(0)
411 ; SI-NEXT: s_or_b32 s0, s2, s0
412 ; SI-NEXT: s_bfe_i32 s0, s0, 0xf0000
413 ; SI-NEXT: s_mov_b32 s6, -1
414 ; SI-NEXT: v_mov_b32_e32 v0, s0
415 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
418 ; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
420 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
421 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
422 ; VI-NEXT: s_waitcnt lgkmcnt(0)
423 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
424 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0
425 ; VI-NEXT: v_mov_b32_e32 v0, s4
426 ; VI-NEXT: v_mov_b32_e32 v1, s5
427 ; VI-NEXT: s_waitcnt lgkmcnt(0)
428 ; VI-NEXT: s_or_b32 s0, s2, s0
429 ; VI-NEXT: s_bfe_i32 s0, s0, 0xf0000
430 ; VI-NEXT: v_mov_b32_e32 v2, s0
431 ; VI-NEXT: flat_store_dword v[0:1], v2
433 %a0 = load i32, ptr addrspace(1) %in0
434 %b0 = load i32, ptr addrspace(1) %in1
; Both operands are shifted left by 17, combined, then shifted back by 17.
435 %a1 = shl i32 %a0, 17
436 %b1 = shl i32 %b0, 17
437 %or = or i32 %a1, %b1
438 %result = ashr i32 %or, 17
439 store i32 %result, ptr addrspace(1) %out
443 ; TODO: Fold ashr(or(shl(x,c1),shl(y,c2)),c1) -> sign_extend_inreg(or(x,shl(y,c2-c1))) iff c2 >= c1.
; ashr(or(shl(a, 17), shl(b, 19)), 17): the two shl amounts differ, and the
; second one (19) is larger than the ashr amount (17). The checks for both run
; lines currently expect both s_lshl_b32 instructions, the s_or_b32 and the
; s_ashr_i32 to remain, i.e. no bitfield-extract instruction is formed.
444 define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) {
445 ; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
447 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
448 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
449 ; SI-NEXT: s_waitcnt lgkmcnt(0)
450 ; SI-NEXT: s_load_dword s2, s[6:7], 0x0
451 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0
452 ; SI-NEXT: s_mov_b32 s7, 0xf000
453 ; SI-NEXT: s_waitcnt lgkmcnt(0)
454 ; SI-NEXT: s_lshl_b32 s1, s2, 17
455 ; SI-NEXT: s_lshl_b32 s0, s0, 19
456 ; SI-NEXT: s_or_b32 s0, s1, s0
457 ; SI-NEXT: s_ashr_i32 s0, s0, 17
458 ; SI-NEXT: s_mov_b32 s6, -1
459 ; SI-NEXT: v_mov_b32_e32 v0, s0
460 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
463 ; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
465 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
466 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
467 ; VI-NEXT: s_waitcnt lgkmcnt(0)
468 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
469 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0
470 ; VI-NEXT: v_mov_b32_e32 v0, s4
471 ; VI-NEXT: v_mov_b32_e32 v1, s5
472 ; VI-NEXT: s_waitcnt lgkmcnt(0)
473 ; VI-NEXT: s_lshl_b32 s1, s2, 17
474 ; VI-NEXT: s_lshl_b32 s0, s0, 19
475 ; VI-NEXT: s_or_b32 s0, s1, s0
476 ; VI-NEXT: s_ashr_i32 s0, s0, 17
477 ; VI-NEXT: v_mov_b32_e32 v2, s0
478 ; VI-NEXT: flat_store_dword v[0:1], v2
480 %a0 = load i32, ptr addrspace(1) %x
481 %b0 = load i32, ptr addrspace(1) %y
; %a0 is shifted by 17 (matching the ashr below); %b0 is shifted by 19.
482 %a1 = shl i32 %a0, 17
483 %b1 = shl i32 %b0, 19
484 %or = or i32 %a1, %b1
485 %result = ashr i32 %or, 17
486 store i32 %result, ptr addrspace(1) %out
490 ; Don't fold: the other shl amount (16) is smaller than the ashr amount (17), so the c2 >= c1 condition does not hold.
; ashr(or(shl(a, 17), shl(b, 16)), 17): the second shl amount (16) is smaller
; than the ashr amount (17). The checks for both run lines expect both
; s_lshl_b32 instructions, the s_or_b32 and the s_ashr_i32 to remain, i.e. no
; bitfield-extract instruction is formed.
491 define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) {
492 ; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
494 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
495 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
496 ; SI-NEXT: s_waitcnt lgkmcnt(0)
497 ; SI-NEXT: s_load_dword s2, s[6:7], 0x0
498 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0
499 ; SI-NEXT: s_mov_b32 s7, 0xf000
500 ; SI-NEXT: s_waitcnt lgkmcnt(0)
501 ; SI-NEXT: s_lshl_b32 s1, s2, 17
502 ; SI-NEXT: s_lshl_b32 s0, s0, 16
503 ; SI-NEXT: s_or_b32 s0, s1, s0
504 ; SI-NEXT: s_ashr_i32 s0, s0, 17
505 ; SI-NEXT: s_mov_b32 s6, -1
506 ; SI-NEXT: v_mov_b32_e32 v0, s0
507 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
510 ; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
512 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
513 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
514 ; VI-NEXT: s_waitcnt lgkmcnt(0)
515 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
516 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0
517 ; VI-NEXT: v_mov_b32_e32 v0, s4
518 ; VI-NEXT: v_mov_b32_e32 v1, s5
519 ; VI-NEXT: s_waitcnt lgkmcnt(0)
520 ; VI-NEXT: s_lshl_b32 s1, s2, 17
521 ; VI-NEXT: s_lshl_b32 s0, s0, 16
522 ; VI-NEXT: s_or_b32 s0, s1, s0
523 ; VI-NEXT: s_ashr_i32 s0, s0, 17
524 ; VI-NEXT: v_mov_b32_e32 v2, s0
525 ; VI-NEXT: flat_store_dword v[0:1], v2
527 %a0 = load i32, ptr addrspace(1) %x
528 %b0 = load i32, ptr addrspace(1) %y
; %a0 is shifted by 17 (matching the ashr below); %b0 is shifted by only 16.
529 %a1 = shl i32 %a0, 17
530 %b1 = shl i32 %b0, 16
531 %or = or i32 %a1, %b1
532 %result = ashr i32 %or, 17
533 store i32 %result, ptr addrspace(1) %out
537 declare i32 @llvm.amdgcn.workitem.id.x() #0
539 attributes #0 = { nounwind readnone }
540 attributes #1 = { nounwind }