1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
8 ; Test optimization to reduce shifts to narrower sizes.
; Scalar (inreg argument) case. The mask 1073741823 (0x3fffffff) clears the
; top two bits of %x, so the zero-extended value shifted left by 2 still fits
; in 32 bits. The checks below expect a 32-bit s_lshl_b32 with the high result
; register s1 simply set to 0.
10 define amdgpu_ps i64 @s_shl_i64_zext_i32(i32 inreg %x) {
11 ; GCN-LABEL: s_shl_i64_zext_i32:
13 ; GCN-NEXT: s_andn2_b32 s0, s0, -2.0
14 ; GCN-NEXT: s_lshl_b32 s0, s0, 2
15 ; GCN-NEXT: s_mov_b32 s1, 0
16 ; GCN-NEXT: ; return to shader part epilog
18 ; GFX10-LABEL: s_shl_i64_zext_i32:
20 ; GFX10-NEXT: s_andn2_b32 s0, s0, -2.0
21 ; GFX10-NEXT: s_mov_b32 s1, 0
22 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
23 ; GFX10-NEXT: ; return to shader part epilog
25 ; GFX11-LABEL: s_shl_i64_zext_i32:
27 ; GFX11-NEXT: s_and_not1_b32 s0, s0, -2.0
28 ; GFX11-NEXT: s_mov_b32 s1, 0
29 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2
30 ; GFX11-NEXT: ; return to shader part epilog
31 %and = and i32 %x, 1073741823
32 %ext = zext i32 %and to i64
33 %shl = shl i64 %ext, 2
; Non-inreg argument variant: %x is masked with 1073741823 (0x3fffffff),
; zero-extended to i64 and shifted left by 2. Because the top two bits are
; cleared the shift cannot leave the low 32 bits; the checks below expect a
; 32-bit v_lshlrev_b32 and a zero written to the high half (v1).
37 define i64 @v_shl_i64_zext_i32(i32 %x) {
38 ; GCN-LABEL: v_shl_i64_zext_i32:
40 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41 ; GCN-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0
42 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
43 ; GCN-NEXT: v_mov_b32_e32 v1, 0
44 ; GCN-NEXT: s_setpc_b64 s[30:31]
46 ; GFX10-LABEL: v_shl_i64_zext_i32:
48 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49 ; GFX10-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0
50 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
51 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
52 ; GFX10-NEXT: s_setpc_b64 s[30:31]
54 ; GFX11-LABEL: v_shl_i64_zext_i32:
56 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3fffffff, v0
58 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
59 ; GFX11-NEXT: s_setpc_b64 s[30:31]
60 %and = and i32 %x, 1073741823
61 %ext = zext i32 %and to i64
62 %shl = shl i64 %ext, 2
; Scalar sign-extend case. The mask 536870911 (0x1fffffff) clears the top
; three bits of %x, so the masked value has a zero sign bit and the shift by 2
; stays within 32 bits. The checks below expect a 32-bit s_lshl_b32 and a
; zero high half (s1) rather than a 64-bit shift.
66 define amdgpu_ps i64 @s_shl_i64_sext_i32(i32 inreg %x) {
67 ; GCN-LABEL: s_shl_i64_sext_i32:
69 ; GCN-NEXT: s_and_b32 s0, s0, 0x1fffffff
70 ; GCN-NEXT: s_lshl_b32 s0, s0, 2
71 ; GCN-NEXT: s_mov_b32 s1, 0
72 ; GCN-NEXT: ; return to shader part epilog
74 ; GFX10PLUS-LABEL: s_shl_i64_sext_i32:
76 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x1fffffff
77 ; GFX10PLUS-NEXT: s_mov_b32 s1, 0
78 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2
79 ; GFX10PLUS-NEXT: ; return to shader part epilog
80 %and = and i32 %x, 536870911
81 %ext = sext i32 %and to i64
82 %shl = shl i64 %ext, 2
; Non-inreg sign-extend variant: %x is masked with 536870911 (0x1fffffff),
; sign-extended to i64 and shifted left by 2. The mask leaves the sign bit
; clear and two bits of headroom, and the checks below expect a 32-bit
; v_lshlrev_b32 with the high half (v1) set to 0.
86 define i64 @v_shl_i64_sext_i32(i32 %x) {
87 ; GCN-LABEL: v_shl_i64_sext_i32:
89 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90 ; GCN-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
91 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
92 ; GCN-NEXT: v_mov_b32_e32 v1, 0
93 ; GCN-NEXT: s_setpc_b64 s[30:31]
95 ; GFX10-LABEL: v_shl_i64_sext_i32:
97 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98 ; GFX10-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
99 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
100 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
101 ; GFX10-NEXT: s_setpc_b64 s[30:31]
103 ; GFX11-LABEL: v_shl_i64_sext_i32:
105 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x1fffffff, v0
107 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
108 ; GFX11-NEXT: s_setpc_b64 s[30:31]
109 %and = and i32 %x, 536870911
110 %ext = sext i32 %and to i64
111 %shl = shl i64 %ext, 2
; Negative test for the narrowing (scalar, zero-extend). The mask 2147483647
; (0x7fffffff) only clears bit 31, so shifting left by 2 can carry bits into
; the upper 32 bits. The checks below therefore keep the 64-bit s_lshl_b64.
115 define amdgpu_ps i64 @s_shl_i64_zext_i32_overflow(i32 inreg %x) {
116 ; GCN-LABEL: s_shl_i64_zext_i32_overflow:
118 ; GCN-NEXT: s_bitset0_b32 s0, 31
119 ; GCN-NEXT: s_mov_b32 s1, 0
120 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
121 ; GCN-NEXT: ; return to shader part epilog
123 ; GFX10PLUS-LABEL: s_shl_i64_zext_i32_overflow:
124 ; GFX10PLUS: ; %bb.0:
125 ; GFX10PLUS-NEXT: s_mov_b32 s1, 0
126 ; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31
127 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
128 ; GFX10PLUS-NEXT: ; return to shader part epilog
129 %and = and i32 %x, 2147483647
130 %ext = zext i32 %and to i64
131 %shl = shl i64 %ext, 2
; Negative test for the narrowing (non-inreg argument, zero-extend). With the
; mask 2147483647 (0x7fffffff) the shift by 2 can overflow 32 bits, so every
; check block below keeps a 64-bit shift: v_lshl_b64 in the GFX7 checks and
; v_lshlrev_b64 in the others.
135 define i64 @v_shl_i64_zext_i32_overflow(i32 %x) {
136 ; GFX7-LABEL: v_shl_i64_zext_i32_overflow:
138 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
140 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
141 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
142 ; GFX7-NEXT: s_setpc_b64 s[30:31]
144 ; GFX8-LABEL: v_shl_i64_zext_i32_overflow:
146 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147 ; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
148 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
149 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
150 ; GFX8-NEXT: s_setpc_b64 s[30:31]
152 ; GFX9-LABEL: v_shl_i64_zext_i32_overflow:
154 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
156 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
157 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
158 ; GFX9-NEXT: s_setpc_b64 s[30:31]
160 ; GFX10-LABEL: v_shl_i64_zext_i32_overflow:
162 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
163 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
164 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
165 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
166 ; GFX10-NEXT: s_setpc_b64 s[30:31]
168 ; GFX11-LABEL: v_shl_i64_zext_i32_overflow:
170 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7fffffff, v0
172 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
173 ; GFX11-NEXT: s_setpc_b64 s[30:31]
174 %and = and i32 %x, 2147483647
175 %ext = zext i32 %and to i64
176 %shl = shl i64 %ext, 2
; Negative test for the narrowing (scalar, sign-extend). The mask 2147483647
; (0x7fffffff) leaves 31 value bits, so the shift by 2 can overflow 32 bits.
; The checks below keep the sign-extension (s_ashr_i32 by 31 into s1) and the
; 64-bit s_lshl_b64.
180 define amdgpu_ps i64 @s_shl_i64_sext_i32_overflow(i32 inreg %x) {
181 ; GCN-LABEL: s_shl_i64_sext_i32_overflow:
183 ; GCN-NEXT: s_bitset0_b32 s0, 31
184 ; GCN-NEXT: s_ashr_i32 s1, s0, 31
185 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
186 ; GCN-NEXT: ; return to shader part epilog
188 ; GFX10PLUS-LABEL: s_shl_i64_sext_i32_overflow:
189 ; GFX10PLUS: ; %bb.0:
190 ; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31
191 ; GFX10PLUS-NEXT: s_ashr_i32 s1, s0, 31
192 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
193 ; GFX10PLUS-NEXT: ; return to shader part epilog
194 %and = and i32 %x, 2147483647
195 %ext = sext i32 %and to i64
196 %shl = shl i64 %ext, 2
; Negative test for the narrowing (non-inreg argument, sign-extend). With the
; mask 2147483647 (0x7fffffff) the shift by 2 can overflow 32 bits, so the
; checks below keep v_ashrrev_i32 by 31 for the high half followed by a
; 64-bit shift (v_lshl_b64 in the GFX7 checks, v_lshlrev_b64 elsewhere).
200 define i64 @v_shl_i64_sext_i32_overflow(i32 %x) {
201 ; GFX7-LABEL: v_shl_i64_sext_i32_overflow:
203 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
205 ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
206 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
207 ; GFX7-NEXT: s_setpc_b64 s[30:31]
209 ; GFX8-LABEL: v_shl_i64_sext_i32_overflow:
211 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212 ; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
213 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
214 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
215 ; GFX8-NEXT: s_setpc_b64 s[30:31]
217 ; GFX9-LABEL: v_shl_i64_sext_i32_overflow:
219 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
221 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
222 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
223 ; GFX9-NEXT: s_setpc_b64 s[30:31]
225 ; GFX10PLUS-LABEL: v_shl_i64_sext_i32_overflow:
226 ; GFX10PLUS: ; %bb.0:
227 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
229 ; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v0
230 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
231 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
232 %and = and i32 %x, 2147483647
233 %ext = sext i32 %and to i64
234 %shl = shl i64 %ext, 2
; Kernel test: the index is (workitem.id.x & 6) * 7 (mul nuw nsw),
; zero-extended to i64 and used as an i32 element index for a store of 0
; through %arg. Every check block below shows v_mul_u32_u24 feeding a 64-bit
; shift by 2 (v_lshl_b64 in the GFX7 checks, v_lshlrev_b64 elsewhere), i.e.
; the address shift is not narrowed to 32 bits in the recorded output.
238 define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) {
239 ; GFX7-LABEL: mulu24_shl64:
240 ; GFX7: ; %bb.0: ; %bb
241 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
242 ; GFX7-NEXT: v_and_b32_e32 v0, 6, v0
243 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, 7, v0
244 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
245 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[0:1], 2
246 ; GFX7-NEXT: s_mov_b32 s2, 0
247 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
248 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
249 ; GFX7-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64
250 ; GFX7-NEXT: s_endpgm
252 ; GFX8-LABEL: mulu24_shl64:
253 ; GFX8: ; %bb.0: ; %bb
254 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
255 ; GFX8-NEXT: v_and_b32_e32 v0, 6, v0
256 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 7, v0
257 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
258 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
259 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
260 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
261 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
262 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
263 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
264 ; GFX8-NEXT: flat_store_dword v[2:3], v1
265 ; GFX8-NEXT: s_endpgm
267 ; GFX9-LABEL: mulu24_shl64:
268 ; GFX9: ; %bb.0: ; %bb
269 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
270 ; GFX9-NEXT: v_and_b32_e32 v0, 6, v0
271 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 7, v0
272 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
273 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
274 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
275 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
276 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
277 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
278 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
279 ; GFX9-NEXT: global_store_dword v[2:3], v1, off
280 ; GFX9-NEXT: s_endpgm
282 ; GFX10-LABEL: mulu24_shl64:
283 ; GFX10: ; %bb.0: ; %bb
284 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
285 ; GFX10-NEXT: v_and_b32_e32 v0, 6, v0
286 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
287 ; GFX10-NEXT: v_mul_u32_u24_e32 v0, 7, v0
288 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
289 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
290 ; GFX10-NEXT: v_mov_b32_e32 v5, s1
291 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
292 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
293 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
294 ; GFX10-NEXT: global_store_dword v[2:3], v1, off
295 ; GFX10-NEXT: s_endpgm
297 ; GFX11-LABEL: mulu24_shl64:
298 ; GFX11: ; %bb.0: ; %bb
299 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
300 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 6, v0
301 ; GFX11-NEXT: v_mul_u32_u24_e32 v0, 7, v0
302 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
303 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
304 ; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
305 ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
306 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
307 ; GFX11-NEXT: global_store_b32 v[2:3], v1, off
308 ; GFX11-NEXT: s_nop 0
309 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
310 ; GFX11-NEXT: s_endpgm
312 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
313 %tmp1 = and i32 %tmp, 6
314 %mulconv = mul nuw nsw i32 %tmp1, 7
315 %tmp2 = zext i32 %mulconv to i64
316 %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp2
317 store i32 0, ptr addrspace(1) %tmp3, align 4
; Kernel test: loads an i32 from %arg1 indexed by the sign-extended workitem
; id, ORs it with -8388608 (0xff800000), multiplies by -7 (nsw), zero-extends
; the product to i64, shifts it left by 3 (nuw nsw) and stores the i64 to %arg
; at the same index. Every check block below shows v_mul_i32_i24 followed by a
; 64-bit shift by 3 (v_lshl_b64 in the GFX7 checks, v_lshlrev_b64 elsewhere).
321 define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1) {
322 ; GFX7-LABEL: muli24_shl64:
323 ; GFX7: ; %bb.0: ; %bb
324 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
325 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0
326 ; GFX7-NEXT: v_mov_b32_e32 v2, 0
327 ; GFX7-NEXT: s_mov_b32 s6, 0
328 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
329 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
331 ; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64
332 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
333 ; GFX7-NEXT: s_waitcnt vmcnt(0)
334 ; GFX7-NEXT: v_or_b32_e32 v1, 0xff800000, v1
335 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, -7, v1
336 ; GFX7-NEXT: v_lshl_b64 v[3:4], v[1:2], 3
337 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v0
338 ; GFX7-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64
339 ; GFX7-NEXT: s_endpgm
341 ; GFX8-LABEL: muli24_shl64:
342 ; GFX8: ; %bb.0: ; %bb
343 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
344 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
345 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v0
346 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
348 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
349 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
350 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
351 ; GFX8-NEXT: flat_load_dword v4, v[1:2]
352 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
353 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
354 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
355 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
356 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
357 ; GFX8-NEXT: s_waitcnt vmcnt(0)
358 ; GFX8-NEXT: v_or_b32_e32 v0, 0xff800000, v4
359 ; GFX8-NEXT: v_mul_i32_i24_e32 v0, -7, v0
360 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
361 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
362 ; GFX8-NEXT: s_endpgm
364 ; GFX9-LABEL: muli24_shl64:
365 ; GFX9: ; %bb.0: ; %bb
366 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
367 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
368 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
369 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
370 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
371 ; GFX9-NEXT: global_load_dword v1, v1, s[2:3]
372 ; GFX9-NEXT: s_waitcnt vmcnt(0)
373 ; GFX9-NEXT: v_or_b32_e32 v1, 0xff800000, v1
374 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, -7, v1
375 ; GFX9-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2]
376 ; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
377 ; GFX9-NEXT: s_endpgm
379 ; GFX10-LABEL: muli24_shl64:
380 ; GFX10: ; %bb.0: ; %bb
381 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
382 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0
383 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
384 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
385 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
386 ; GFX10-NEXT: global_load_dword v1, v1, s[2:3]
387 ; GFX10-NEXT: s_waitcnt vmcnt(0)
388 ; GFX10-NEXT: v_or_b32_e32 v1, 0xff800000, v1
389 ; GFX10-NEXT: v_mul_i32_i24_e32 v1, -7, v1
390 ; GFX10-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2]
391 ; GFX10-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
392 ; GFX10-NEXT: s_endpgm
394 ; GFX11-LABEL: muli24_shl64:
395 ; GFX11: ; %bb.0: ; %bb
396 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
397 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
398 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
399 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
400 ; GFX11-NEXT: global_load_b32 v1, v1, s[2:3]
401 ; GFX11-NEXT: s_waitcnt vmcnt(0)
402 ; GFX11-NEXT: v_or_b32_e32 v1, 0xff800000, v1
403 ; GFX11-NEXT: v_mul_i32_i24_e32 v1, -7, v1
404 ; GFX11-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2]
405 ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[0:1]
406 ; GFX11-NEXT: s_nop 0
407 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
408 ; GFX11-NEXT: s_endpgm
410 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
411 %tmp2 = sext i32 %tmp to i64
412 %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp2
413 %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
414 %tmp5 = or i32 %tmp4, -8388608
415 %tmp6 = mul nsw i32 %tmp5, -7
416 %tmp7 = zext i32 %tmp6 to i64
417 %tmp8 = shl nuw nsw i64 %tmp7, 3
418 %tmp9 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp2
419 store i64 %tmp8, ptr addrspace(1) %tmp9, align 8
; Scalar <2 x i32> case: each lane is masked with 1073741823 (0x3fffffff),
; zero-extended to i64 and shifted left by 2. The checks below expect one
; 64-bit s_and_b64 for both lanes, then a 32-bit s_lshl_b32 per lane with the
; high halves (s1 and s3) set to 0.
423 define amdgpu_ps <2 x i64> @s_shl_v2i64_zext_v2i32(<2 x i32> inreg %x) {
424 ; GCN-LABEL: s_shl_v2i64_zext_v2i32:
426 ; GCN-NEXT: s_brev_b32 s2, -4
427 ; GCN-NEXT: s_mov_b32 s3, s2
428 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
429 ; GCN-NEXT: s_lshl_b32 s0, s0, 2
430 ; GCN-NEXT: s_lshl_b32 s2, s1, 2
431 ; GCN-NEXT: s_mov_b32 s1, 0
432 ; GCN-NEXT: s_mov_b32 s3, 0
433 ; GCN-NEXT: ; return to shader part epilog
435 ; GFX10PLUS-LABEL: s_shl_v2i64_zext_v2i32:
436 ; GFX10PLUS: ; %bb.0:
437 ; GFX10PLUS-NEXT: s_brev_b32 s2, -4
438 ; GFX10PLUS-NEXT: s_mov_b32 s3, s2
439 ; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
440 ; GFX10PLUS-NEXT: s_mov_b32 s3, 0
441 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2
442 ; GFX10PLUS-NEXT: s_lshl_b32 s2, s1, 2
443 ; GFX10PLUS-NEXT: s_mov_b32 s1, 0
444 ; GFX10PLUS-NEXT: ; return to shader part epilog
445 %and = and <2 x i32> %x, <i32 1073741823, i32 1073741823>
446 %ext = zext <2 x i32> %and to <2 x i64>
447 %shl = shl <2 x i64> %ext, <i64 2, i64 2>
; Non-inreg <2 x i32> case: each lane is masked with 1073741823 (0x3fffffff),
; zero-extended to i64 and shifted left by 2. The checks below expect a
; 32-bit v_lshlrev_b32 per lane (results in v0 and v2) with the high halves
; (v1 and v3) set to 0.
451 define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
452 ; GCN-LABEL: v_shl_v2i64_zext_v2i32:
454 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
455 ; GCN-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0
456 ; GCN-NEXT: v_and_b32_e32 v1, 0x3fffffff, v1
457 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
458 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v1
459 ; GCN-NEXT: v_mov_b32_e32 v1, 0
460 ; GCN-NEXT: v_mov_b32_e32 v3, 0
461 ; GCN-NEXT: s_setpc_b64 s[30:31]
463 ; GFX10-LABEL: v_shl_v2i64_zext_v2i32:
465 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
466 ; GFX10-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0
467 ; GFX10-NEXT: v_and_b32_e32 v1, 0x3fffffff, v1
468 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
469 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
470 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
471 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
472 ; GFX10-NEXT: s_setpc_b64 s[30:31]
474 ; GFX11-LABEL: v_shl_v2i64_zext_v2i32:
476 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
477 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0
478 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3fffffff, v1
479 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 2, v1
480 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
481 ; GFX11-NEXT: s_setpc_b64 s[30:31]
482 %and = and <2 x i32> %x, <i32 1073741823, i32 1073741823>
483 %ext = zext <2 x i32> %and to <2 x i64>
484 %shl = shl <2 x i64> %ext, <i64 2, i64 2>
; Scalar <2 x i32> sign-extend case: each lane is masked with 536870911
; (0x1fffffff), sign-extended to i64 and shifted left by 2. The mask clears
; the sign bit of each lane; the checks below expect a 32-bit s_lshl_b32 per
; lane with the high halves (s1 and s3) set to 0.
488 define amdgpu_ps <2 x i64> @s_shl_v2i64_sext_v2i32(<2 x i32> inreg %x) {
489 ; GCN-LABEL: s_shl_v2i64_sext_v2i32:
491 ; GCN-NEXT: s_brev_b32 s2, -8
492 ; GCN-NEXT: s_mov_b32 s3, s2
493 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
494 ; GCN-NEXT: s_lshl_b32 s0, s0, 2
495 ; GCN-NEXT: s_lshl_b32 s2, s1, 2
496 ; GCN-NEXT: s_mov_b32 s1, 0
497 ; GCN-NEXT: s_mov_b32 s3, 0
498 ; GCN-NEXT: ; return to shader part epilog
500 ; GFX10PLUS-LABEL: s_shl_v2i64_sext_v2i32:
501 ; GFX10PLUS: ; %bb.0:
502 ; GFX10PLUS-NEXT: s_brev_b32 s2, -8
503 ; GFX10PLUS-NEXT: s_mov_b32 s3, s2
504 ; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
505 ; GFX10PLUS-NEXT: s_mov_b32 s3, 0
506 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2
507 ; GFX10PLUS-NEXT: s_lshl_b32 s2, s1, 2
508 ; GFX10PLUS-NEXT: s_mov_b32 s1, 0
509 ; GFX10PLUS-NEXT: ; return to shader part epilog
510 %and = and <2 x i32> %x, <i32 536870911, i32 536870911>
511 %ext = sext <2 x i32> %and to <2 x i64>
512 %shl = shl <2 x i64> %ext, <i64 2, i64 2>
; Non-inreg <2 x i32> sign-extend case: each lane is masked with 536870911
; (0x1fffffff), sign-extended to i64 and shifted left by 2. The checks below
; expect a 32-bit v_lshlrev_b32 per lane (results in v0 and v2) with the high
; halves (v1 and v3) set to 0.
516 define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
517 ; GCN-LABEL: v_shl_v2i64_sext_v2i32:
519 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520 ; GCN-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
521 ; GCN-NEXT: v_and_b32_e32 v1, 0x1fffffff, v1
522 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
523 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v1
524 ; GCN-NEXT: v_mov_b32_e32 v1, 0
525 ; GCN-NEXT: v_mov_b32_e32 v3, 0
526 ; GCN-NEXT: s_setpc_b64 s[30:31]
528 ; GFX10-LABEL: v_shl_v2i64_sext_v2i32:
530 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
531 ; GFX10-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
532 ; GFX10-NEXT: v_and_b32_e32 v1, 0x1fffffff, v1
533 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
534 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
535 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
536 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
537 ; GFX10-NEXT: s_setpc_b64 s[30:31]
539 ; GFX11-LABEL: v_shl_v2i64_sext_v2i32:
541 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
542 ; GFX11-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
543 ; GFX11-NEXT: v_and_b32_e32 v1, 0x1fffffff, v1
544 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 2, v1
545 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
546 ; GFX11-NEXT: s_setpc_b64 s[30:31]
547 %and = and <2 x i32> %x, <i32 536870911, i32 536870911>
548 %ext = sext <2 x i32> %and to <2 x i64>
549 %shl = shl <2 x i64> %ext, <i64 2, i64 2>
; Scalar i16 -> i32 case: the mask 16383 (0x3fff) clears the top two bits of
; the i16 input, so the zero-extended value shifted left by 2 still fits in
; 16 bits. All check blocks below use s_and_b32 followed by s_lshl_b32; the
; GFX7 checks additionally re-mask the result with 0xffff.
553 define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) {
554 ; GFX7-LABEL: s_shl_i32_zext_i16:
556 ; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff
557 ; GFX7-NEXT: s_lshl_b32 s0, s0, 2
558 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
559 ; GFX7-NEXT: ; return to shader part epilog
561 ; GFX8-LABEL: s_shl_i32_zext_i16:
563 ; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff
564 ; GFX8-NEXT: s_lshl_b32 s0, s0, 2
565 ; GFX8-NEXT: ; return to shader part epilog
567 ; GFX9-LABEL: s_shl_i32_zext_i16:
569 ; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff
570 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
571 ; GFX9-NEXT: ; return to shader part epilog
573 ; GFX10PLUS-LABEL: s_shl_i32_zext_i16:
574 ; GFX10PLUS: ; %bb.0:
575 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff
576 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2
577 ; GFX10PLUS-NEXT: ; return to shader part epilog
578 %and = and i16 %x, 16383
579 %ext = zext i16 %and to i32
580 %shl = shl i32 %ext, 2
; Non-inreg i16 -> i32 case: the input is masked with 16383 (0x3fff),
; zero-extended to i32 and shifted left by 2. The recorded output differs per
; target: the GFX7 checks use a 32-bit shift followed by a 0xffff mask, the
; GFX8 and GFX9 checks use a 16-bit v_lshlrev_b16 alone, and the GFX10PLUS
; checks use v_lshlrev_b16 followed by a 0xffff mask.
584 define i32 @v_shl_i32_zext_i16(i16 %x) {
585 ; GFX7-LABEL: v_shl_i32_zext_i16:
587 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
588 ; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff, v0
589 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
590 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
591 ; GFX7-NEXT: s_setpc_b64 s[30:31]
593 ; GFX8-LABEL: v_shl_i32_zext_i16:
595 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
596 ; GFX8-NEXT: v_and_b32_e32 v0, 0x3fff, v0
597 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 2, v0
598 ; GFX8-NEXT: s_setpc_b64 s[30:31]
600 ; GFX9-LABEL: v_shl_i32_zext_i16:
602 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
603 ; GFX9-NEXT: v_and_b32_e32 v0, 0x3fff, v0
604 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 2, v0
605 ; GFX9-NEXT: s_setpc_b64 s[30:31]
607 ; GFX10PLUS-LABEL: v_shl_i32_zext_i16:
608 ; GFX10PLUS: ; %bb.0:
609 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
610 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x3fff, v0
611 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 2, v0
612 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
613 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
614 %and = and i16 %x, 16383
615 %ext = zext i16 %and to i32
616 %shl = shl i32 %ext, 2
; Scalar <2 x i16> -> <2 x i32> case: each lane is masked with 16383 (0x3fff),
; zero-extended to i32 and shifted left by 2. The recorded output differs per
; target: the GFX7 checks re-mask both results with 0xffff, the GFX8 checks
; shift each lane with s_lshl_b32, and the GFX9 and GFX10PLUS checks shift,
; repack with s_pack_ll_b32_b16 and then split the halves again.
620 define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
621 ; GFX7-LABEL: s_shl_v2i32_zext_v2i16:
623 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
624 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
625 ; GFX7-NEXT: s_or_b32 s0, s1, s0
626 ; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff
627 ; GFX7-NEXT: s_lshr_b32 s1, s0, 16
628 ; GFX7-NEXT: s_lshl_b32 s0, s0, 2
629 ; GFX7-NEXT: s_lshl_b32 s1, s1, 2
630 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
631 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
632 ; GFX7-NEXT: ; return to shader part epilog
634 ; GFX8-LABEL: s_shl_v2i32_zext_v2i16:
636 ; GFX8-NEXT: s_movk_i32 s2, 0x3fff
637 ; GFX8-NEXT: s_mov_b32 s3, s2
638 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
639 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
640 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
641 ; GFX8-NEXT: s_lshl_b32 s0, s0, 2
642 ; GFX8-NEXT: s_lshl_b32 s1, s1, 2
643 ; GFX8-NEXT: ; return to shader part epilog
645 ; GFX9-LABEL: s_shl_v2i32_zext_v2i16:
647 ; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff3fff
648 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
649 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x20002
650 ; GFX9-NEXT: s_lshl_b32 s1, s1, 2
651 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
652 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
653 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
654 ; GFX9-NEXT: ; return to shader part epilog
656 ; GFX10PLUS-LABEL: s_shl_v2i32_zext_v2i16:
657 ; GFX10PLUS: ; %bb.0:
658 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff3fff
659 ; GFX10PLUS-NEXT: s_lshr_b32 s1, s0, 16
660 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 0x20002
661 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 2
662 ; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s0, s1
663 ; GFX10PLUS-NEXT: s_and_b32 s0, s1, 0xffff
664 ; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, 16
665 ; GFX10PLUS-NEXT: ; return to shader part epilog
666 %and = and <2 x i16> %x, <i16 16383, i16 16383>
667 %ext = zext <2 x i16> %and to <2 x i32>
668 %shl = shl <2 x i32> %ext, <i32 2, i32 2>
672 ; FIXME: This doesn't do what we want. The pre-legalizer combiner
673 ; fails to handle the vector splat. The post-legalizer sees the zext
674 ; legalized into the and. This is probably not that important, since
675 ; we really do this combine in the machine level for lowered
; Non-inreg <2 x i16> -> <2 x i32> case: each lane is masked with 16383
; (0x3fff), zero-extended to i32 and shifted left by 2. The recorded output
; differs per target: the GFX7 checks use 32-bit shifts plus 0xffff masks, the
; GFX8 checks use v_lshlrev_b16 (one in SDWA form), and the GFX9 and GFX10PLUS
; checks use a packed v_pk_lshlrev_b16 and then split the two halves.
677 define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
678 ; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
680 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
681 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
682 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
683 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
684 ; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0
685 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
686 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
687 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v1
688 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
689 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
690 ; GFX7-NEXT: s_setpc_b64 s[30:31]
692 ; GFX8-LABEL: v_shl_v2i32_zext_v2i16:
694 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
695 ; GFX8-NEXT: v_and_b32_e32 v1, 0x3fff3fff, v0
696 ; GFX8-NEXT: v_mov_b32_e32 v2, 2
697 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 2, v1
698 ; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
699 ; GFX8-NEXT: s_setpc_b64 s[30:31]
701 ; GFX9-LABEL: v_shl_v2i32_zext_v2i16:
703 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
704 ; GFX9-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0
705 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 2, v0 op_sel_hi:[0,1]
706 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
707 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
708 ; GFX9-NEXT: s_setpc_b64 s[30:31]
710 ; GFX10PLUS-LABEL: v_shl_v2i32_zext_v2i16:
711 ; GFX10PLUS: ; %bb.0:
712 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
713 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0
714 ; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v1, 2, v0 op_sel_hi:[0,1]
715 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v1
716 ; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v1, 16, v1
717 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
718 %and = and <2 x i16> %x, <i16 16383, i16 16383>
719 %ext = zext <2 x i16> %and to <2 x i32>
720 %shl = shl <2 x i32> %ext, <i32 2, i32 2>
; Declaration of the workitem-id intrinsic called by the two kernel tests
; (mulu24_shl64 and muli24_shl64), together with the attribute group #0 that
; the declaration refers to.
724 declare i32 @llvm.amdgcn.workitem.id.x() #0
726 attributes #0 = { nounwind readnone speculatable willreturn }