; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
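
; Test codegen for bit-field-extract style patterns: a shift left by
; (32 - width) followed by a logical (ubfe) or arithmetic (sbfe) shift right
; by the same amount, in both VALU and SALU forms, plus
; ashr(or(shl(x), shl(y))) variants that can fold to s_bfe_i32.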
define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_ubfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ubfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
  %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %src = load volatile i32, ptr addrspace(1) %in0.gep
  %width = load volatile i32, ptr addrspace(1) %in0.gep
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = lshr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_ubfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, v3, v2
; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ubfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
  %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %src = load volatile i32, ptr addrspace(1) %in0.gep
  %width = load volatile i32, ptr addrspace(1) %in0.gep
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = lshr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  store volatile i32 %shl, ptr addrspace(1) undef
  ret void
}

define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_ubfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s3, 32, s3
; SI-NEXT: s_lshl_b32 s2, s2, s3
; SI-NEXT: s_lshr_b32 s2, s2, s3
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ubfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: s_sub_i32 s0, 32, s3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b32 s1, s2, s0
; VI-NEXT: s_lshr_b32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = lshr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_ubfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s3, 32, s3
; SI-NEXT: s_lshl_b32 s2, s2, s3
; SI-NEXT: s_lshr_b32 s3, s2, s3
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ubfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: s_sub_i32 s0, 32, s3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b32 s1, s2, s0
; VI-NEXT: s_lshr_b32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: flat_store_dword v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = lshr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  store volatile i32 %shl, ptr addrspace(1) undef
  ret void
}

define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_sbfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_ashrrev_i32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sbfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
  %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %src = load volatile i32, ptr addrspace(1) %in0.gep
  %width = load volatile i32, ptr addrspace(1) %in0.gep
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = ashr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_sbfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_ashrrev_i32_e32 v3, v3, v2
; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sbfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
  %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %src = load volatile i32, ptr addrspace(1) %in0.gep
  %width = load volatile i32, ptr addrspace(1) %in0.gep
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = ashr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  store volatile i32 %shl, ptr addrspace(1) undef
  ret void
}

define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_sbfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s3, 32, s3
; SI-NEXT: s_lshl_b32 s2, s2, s3
; SI-NEXT: s_ashr_i32 s2, s2, s3
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sbfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: s_sub_i32 s0, 32, s3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b32 s1, s2, s0
; VI-NEXT: s_ashr_i32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = ashr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_sbfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s3, 32, s3
; SI-NEXT: s_lshl_b32 s2, s2, s3
; SI-NEXT: s_ashr_i32 s3, s2, s3
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sbfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: s_sub_i32 s0, 32, s3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b32 s1, s2, s0
; VI-NEXT: s_ashr_i32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: flat_store_dword v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = ashr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  store volatile i32 %shl, ptr addrspace(1) undef
  ret void
}

define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
; SI-NEXT: s_load_dword s4, s[4:5], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_or_b32 s2, s2, s4
; SI-NEXT: s_bfe_i32 s4, s2, 0xf0000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b32 s0, s2, s3
; VI-NEXT: s_bfe_i32 s0, s0, 0xf0000
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %a0 = load i32, ptr addrspace(1) %in0
  %b0 = load i32, ptr addrspace(1) %in1
  %a1 = shl i32 %a0, 17
  %b1 = shl i32 %b0, 17
  %or = or i32 %a1, %b1
  %result = ashr i32 %or, 17
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; TODO ashr(or(shl(x,c1),shl(y,c2)),c1) -> sign_extend_inreg(or(x,shl(y,c2-c1))) iff c2 >= c1
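; For example, with c1 = 17 and c2 = 19 (the case below):
;   ashr(or(shl(x,17), shl(y,19)), 17) == sign_extend_inreg(or(x, shl(y,2)), i15)
; since the ashr keeps bits 17..31 of the or, which hold x | (y << 2) in the
; low 15 bits of the result.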
define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) {
; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
; SI-NEXT: s_load_dword s4, s[4:5], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s2, s2, 17
; SI-NEXT: s_lshl_b32 s4, s4, 19
; SI-NEXT: s_or_b32 s2, s2, s4
; SI-NEXT: s_ashr_i32 s4, s2, 17
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s0, s2, 17
; VI-NEXT: s_lshl_b32 s1, s3, 19
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_ashr_i32 s0, s0, 17
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %a0 = load i32, ptr addrspace(1) %x
  %b0 = load i32, ptr addrspace(1) %y
  %a1 = shl i32 %a0, 17
  %b1 = shl i32 %b0, 19
  %or = or i32 %a1, %b1
  %result = ashr i32 %or, 17
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; Don't fold: the 'other shl' amount (c2 = 16) is less than the outer ashr
; amount (c1 = 17), so the c2 >= c1 condition above does not hold.
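; Concretely, ashr(or(shl(x,17), shl(y,16)), 17) is
; sign_extend_inreg(or(x, lshr(y,1)), i15): bits of %y would have to shift
; right into the field, which is not an or-of-shl of the original operands.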
define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) {
; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
; SI-NEXT: s_load_dword s4, s[4:5], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s2, s2, 17
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_or_b32 s2, s2, s4
; SI-NEXT: s_ashr_i32 s4, s2, 17
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s0, s2, 17
; VI-NEXT: s_lshl_b32 s1, s3, 16
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_ashr_i32 s0, s0, 17
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %a0 = load i32, ptr addrspace(1) %x
  %b0 = load i32, ptr addrspace(1) %y
  %a1 = shl i32 %a0, 17
  %b1 = shl i32 %b0, 16
  %or = or i32 %a1, %b1
  %result = ashr i32 %or, 17
  store i32 %result, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }