1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
8 ; Test optimization to reduce shifts to narrower sizes.
; Scalar-register variant: %x is masked with 1073741823 (2^30 - 1, i.e. the top
; two bits are cleared), zero-extended to i64 and shifted left by 2. No set bit
; can reach the high half, so the checks expect a 32-bit s_lshl_b32 on s0 with
; s1 simply set to 0. The mask appears as s_andn2_b32 / s_and_not1_b32 with the
; operand -2.0, whose bit pattern is 0xc0000000.
10 define amdgpu_ps i64 @s_shl_i64_zext_i32(i32 inreg %x) {
11 ; GCN-LABEL: s_shl_i64_zext_i32:
13 ; GCN-NEXT: s_andn2_b32 s0, s0, -2.0
14 ; GCN-NEXT: s_lshl_b32 s0, s0, 2
15 ; GCN-NEXT: s_mov_b32 s1, 0
16 ; GCN-NEXT: ; return to shader part epilog
18 ; GFX10-LABEL: s_shl_i64_zext_i32:
20 ; GFX10-NEXT: s_andn2_b32 s0, s0, -2.0
21 ; GFX10-NEXT: s_mov_b32 s1, 0
22 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
23 ; GFX10-NEXT: ; return to shader part epilog
25 ; GFX11-LABEL: s_shl_i64_zext_i32:
27 ; GFX11-NEXT: s_and_not1_b32 s0, s0, -2.0
28 ; GFX11-NEXT: s_mov_b32 s1, 0
29 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2
30 ; GFX11-NEXT: ; return to shader part epilog
31 %and = and i32 %x, 1073741823
32 %ext = zext i32 %and to i64
33 %shl = shl i64 %ext, 2
; Vector-register variant of the zext case: same IR (mask with 1073741823,
; zext to i64, shl by 2) but with a non-inreg argument. The checks expect the
; mask as v_and_b32 with 0x3fffffff, a 32-bit v_lshlrev_b32 on v0, and the high
; half v1 set to 0 (fused into a v_dual_mov_b32 on GFX11).
37 define i64 @v_shl_i64_zext_i32(i32 %x) {
38 ; GCN-LABEL: v_shl_i64_zext_i32:
40 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41 ; GCN-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0
42 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
43 ; GCN-NEXT: v_mov_b32_e32 v1, 0
44 ; GCN-NEXT: s_setpc_b64 s[30:31]
46 ; GFX10-LABEL: v_shl_i64_zext_i32:
48 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49 ; GFX10-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0
50 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
51 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
52 ; GFX10-NEXT: s_setpc_b64 s[30:31]
54 ; GFX11-LABEL: v_shl_i64_zext_i32:
56 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3fffffff, v0
58 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
59 ; GFX11-NEXT: s_setpc_b64 s[30:31]
60 %and = and i32 %x, 1073741823
61 %ext = zext i32 %and to i64
62 %shl = shl i64 %ext, 2
; Scalar-register sext case: %x is masked with 536870911 (0x1fffffff, so the
; top three bits including the sign bit are cleared), sign-extended to i64 and
; shifted left by 2. With the sign bit known zero the extension contributes no
; high bits, and the checks expect a 32-bit s_lshl_b32 with s1 set to 0.
66 define amdgpu_ps i64 @s_shl_i64_sext_i32(i32 inreg %x) {
67 ; GCN-LABEL: s_shl_i64_sext_i32:
69 ; GCN-NEXT: s_and_b32 s0, s0, 0x1fffffff
70 ; GCN-NEXT: s_lshl_b32 s0, s0, 2
71 ; GCN-NEXT: s_mov_b32 s1, 0
72 ; GCN-NEXT: ; return to shader part epilog
74 ; GFX10PLUS-LABEL: s_shl_i64_sext_i32:
76 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x1fffffff
77 ; GFX10PLUS-NEXT: s_mov_b32 s1, 0
78 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2
79 ; GFX10PLUS-NEXT: ; return to shader part epilog
80 %and = and i32 %x, 536870911
81 %ext = sext i32 %and to i64
82 %shl = shl i64 %ext, 2
; Vector-register sext case: same IR as the scalar sext test (mask with
; 536870911, sext to i64, shl by 2) with a non-inreg argument. The checks
; expect v_and_b32 with 0x1fffffff, a 32-bit v_lshlrev_b32 on v0, and v1 set
; to 0 rather than a computed sign extension.
86 define i64 @v_shl_i64_sext_i32(i32 %x) {
87 ; GCN-LABEL: v_shl_i64_sext_i32:
89 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90 ; GCN-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
91 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
92 ; GCN-NEXT: v_mov_b32_e32 v1, 0
93 ; GCN-NEXT: s_setpc_b64 s[30:31]
95 ; GFX10-LABEL: v_shl_i64_sext_i32:
97 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98 ; GFX10-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
99 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
100 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
101 ; GFX10-NEXT: s_setpc_b64 s[30:31]
103 ; GFX11-LABEL: v_shl_i64_sext_i32:
105 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x1fffffff, v0
107 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
108 ; GFX11-NEXT: s_setpc_b64 s[30:31]
109 %and = and i32 %x, 536870911
110 %ext = sext i32 %and to i64
111 %shl = shl i64 %ext, 2
; Negative test, scalar zext: the mask 2147483647 clears only bit 31
; (s_bitset0_b32 s0, 31 in the checks), so a shift by 2 can move bits 29 and 30
; into the high half. The checks therefore expect the full 64-bit s_lshl_b64 on
; s[0:1] to be kept.
115 define amdgpu_ps i64 @s_shl_i64_zext_i32_overflow(i32 inreg %x) {
116 ; GCN-LABEL: s_shl_i64_zext_i32_overflow:
118 ; GCN-NEXT: s_bitset0_b32 s0, 31
119 ; GCN-NEXT: s_mov_b32 s1, 0
120 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
121 ; GCN-NEXT: ; return to shader part epilog
123 ; GFX10PLUS-LABEL: s_shl_i64_zext_i32_overflow:
124 ; GFX10PLUS: ; %bb.0:
125 ; GFX10PLUS-NEXT: s_mov_b32 s1, 0
126 ; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31
127 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
128 ; GFX10PLUS-NEXT: ; return to shader part epilog
129 %and = and i32 %x, 2147483647
130 %ext = zext i32 %and to i64
131 %shl = shl i64 %ext, 2
; Negative test, vector zext: only bit 31 is masked off (0x7fffffff), so the
; shifted value may not fit in 32 bits. The checks expect a 64-bit shift on
; v[0:1] for every target: v_lshl_b64 on GFX7 and v_lshlrev_b64 on GFX8 and
; later, with v1 zeroed beforehand.
135 define i64 @v_shl_i64_zext_i32_overflow(i32 %x) {
136 ; GFX7-LABEL: v_shl_i64_zext_i32_overflow:
138 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
140 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
141 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
142 ; GFX7-NEXT: s_setpc_b64 s[30:31]
144 ; GFX8-LABEL: v_shl_i64_zext_i32_overflow:
146 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147 ; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
148 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
149 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
150 ; GFX8-NEXT: s_setpc_b64 s[30:31]
152 ; GFX9-LABEL: v_shl_i64_zext_i32_overflow:
154 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
156 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
157 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
158 ; GFX9-NEXT: s_setpc_b64 s[30:31]
160 ; GFX10-LABEL: v_shl_i64_zext_i32_overflow:
162 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
163 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
164 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
165 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
166 ; GFX10-NEXT: s_setpc_b64 s[30:31]
168 ; GFX11-LABEL: v_shl_i64_zext_i32_overflow:
170 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7fffffff, v0
172 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
173 ; GFX11-NEXT: s_setpc_b64 s[30:31]
174 %and = and i32 %x, 2147483647
175 %ext = zext i32 %and to i64
176 %shl = shl i64 %ext, 2
; Negative test, scalar sext: only bit 31 is masked off before the sign
; extension, so the shift by 2 is not narrowed. The checks expect the high half
; to be produced with s_ashr_i32 s1, s0, 31 followed by a 64-bit s_lshl_b64.
180 define amdgpu_ps i64 @s_shl_i64_sext_i32_overflow(i32 inreg %x) {
181 ; GCN-LABEL: s_shl_i64_sext_i32_overflow:
183 ; GCN-NEXT: s_bitset0_b32 s0, 31
184 ; GCN-NEXT: s_ashr_i32 s1, s0, 31
185 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
186 ; GCN-NEXT: ; return to shader part epilog
188 ; GFX10PLUS-LABEL: s_shl_i64_sext_i32_overflow:
189 ; GFX10PLUS: ; %bb.0:
190 ; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31
191 ; GFX10PLUS-NEXT: s_ashr_i32 s1, s0, 31
192 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
193 ; GFX10PLUS-NEXT: ; return to shader part epilog
194 %and = and i32 %x, 2147483647
195 %ext = sext i32 %and to i64
196 %shl = shl i64 %ext, 2
; Negative test, vector sext: same IR as the scalar overflow sext test with a
; non-inreg argument. The checks expect v_and_b32 with 0x7fffffff, the high
; half computed by v_ashrrev_i32 v1, 31, v0, and a 64-bit shift (v_lshl_b64 on
; GFX7, v_lshlrev_b64 on the other targets).
200 define i64 @v_shl_i64_sext_i32_overflow(i32 %x) {
201 ; GFX7-LABEL: v_shl_i64_sext_i32_overflow:
203 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
205 ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
206 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
207 ; GFX7-NEXT: s_setpc_b64 s[30:31]
209 ; GFX8-LABEL: v_shl_i64_sext_i32_overflow:
211 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212 ; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
213 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
214 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
215 ; GFX8-NEXT: s_setpc_b64 s[30:31]
217 ; GFX9-LABEL: v_shl_i64_sext_i32_overflow:
219 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
221 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
222 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
223 ; GFX9-NEXT: s_setpc_b64 s[30:31]
225 ; GFX10PLUS-LABEL: v_shl_i64_sext_i32_overflow:
226 ; GFX10PLUS: ; %bb.0:
227 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
229 ; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v0
230 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
231 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
232 %and = and i32 %x, 2147483647
233 %ext = sext i32 %and to i64
234 %shl = shl i64 %ext, 2
; Kernel test: the result of llvm.amdgcn.workitem.id.x is masked with 6 and
; multiplied by 7 (so the index is at most 42), zero-extended to i64 and used
; to index an i32 array through %arg; a 0 is stored to that element. The checks
; show v_mul_u32_u24 for the multiply and a 64-bit shift by 2 (v_lshl_b64 on
; GFX7, v_lshlrev_b64 elsewhere) feeding the store address.
; NOTE(review): the shift is not narrowed to 32 bits here even though the index
; is small -- presumably a combine that does not fire for this pattern; confirm.
238 define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) {
239 ; GFX7-LABEL: mulu24_shl64:
240 ; GFX7: ; %bb.0: ; %bb
241 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
242 ; GFX7-NEXT: v_and_b32_e32 v0, 6, v0
243 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, 7, v0
244 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
245 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[0:1], 2
246 ; GFX7-NEXT: s_mov_b32 s2, 0
247 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
248 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
249 ; GFX7-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64
250 ; GFX7-NEXT: s_endpgm
252 ; GFX8-LABEL: mulu24_shl64:
253 ; GFX8: ; %bb.0: ; %bb
254 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
255 ; GFX8-NEXT: v_and_b32_e32 v0, 6, v0
256 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 7, v0
257 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
258 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
259 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
260 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
261 ; GFX8-NEXT: v_mov_b32_e32 v4, s0
262 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
263 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
264 ; GFX8-NEXT: flat_store_dword v[2:3], v1
265 ; GFX8-NEXT: s_endpgm
267 ; GFX9-LABEL: mulu24_shl64:
268 ; GFX9: ; %bb.0: ; %bb
269 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
270 ; GFX9-NEXT: v_and_b32_e32 v0, 6, v0
271 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 7, v0
272 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
273 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
274 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
275 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
276 ; GFX9-NEXT: v_mov_b32_e32 v4, s0
277 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
278 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
279 ; GFX9-NEXT: global_store_dword v[2:3], v1, off
280 ; GFX9-NEXT: s_endpgm
282 ; GFX10-LABEL: mulu24_shl64:
283 ; GFX10: ; %bb.0: ; %bb
284 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
285 ; GFX10-NEXT: v_and_b32_e32 v0, 6, v0
286 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
287 ; GFX10-NEXT: v_mul_u32_u24_e32 v0, 7, v0
288 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
289 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
290 ; GFX10-NEXT: v_mov_b32_e32 v5, s1
291 ; GFX10-NEXT: v_mov_b32_e32 v4, s0
292 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
293 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
294 ; GFX10-NEXT: global_store_dword v[2:3], v1, off
295 ; GFX10-NEXT: s_endpgm
297 ; GFX11-LABEL: mulu24_shl64:
298 ; GFX11: ; %bb.0: ; %bb
299 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
300 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 6, v0
301 ; GFX11-NEXT: v_mul_u32_u24_e32 v0, 7, v0
302 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
303 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
304 ; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
305 ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
306 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
307 ; GFX11-NEXT: global_store_b32 v[2:3], v1, off
308 ; GFX11-NEXT: s_nop 0
309 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
310 ; GFX11-NEXT: s_endpgm
312 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
313 %tmp1 = and i32 %tmp, 6
314 %mulconv = mul nuw nsw i32 %tmp1, 7
315 %tmp2 = zext i32 %mulconv to i64
316 %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp2
317 store i32 0, ptr addrspace(1) %tmp3, align 4
; Kernel test: loads an i32 from %arg1 at the index returned by
; llvm.amdgcn.workitem.id.x, ORs it with -8388608 (0xff800000 in the checks),
; multiplies by -7, zero-extends the product to i64, shifts it left by 3 and
; stores the i64 to %arg at the same index. The checks show v_mul_i32_i24 for
; the multiply and a 64-bit shift by 3 (v_lshl_b64 on GFX7, v_lshlrev_b64
; elsewhere) on every target.
321 define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1) {
322 ; GFX7-LABEL: muli24_shl64:
323 ; GFX7: ; %bb.0: ; %bb
324 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
325 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0
326 ; GFX7-NEXT: v_mov_b32_e32 v2, 0
327 ; GFX7-NEXT: s_mov_b32 s6, 0
328 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
329 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
331 ; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64
332 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
333 ; GFX7-NEXT: s_waitcnt vmcnt(0)
334 ; GFX7-NEXT: v_or_b32_e32 v1, 0xff800000, v1
335 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, -7, v1
336 ; GFX7-NEXT: v_lshl_b64 v[3:4], v[1:2], 3
337 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v0
338 ; GFX7-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64
339 ; GFX7-NEXT: s_endpgm
341 ; GFX8-LABEL: muli24_shl64:
342 ; GFX8: ; %bb.0: ; %bb
343 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
344 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
345 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v0
346 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
348 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
349 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
350 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
351 ; GFX8-NEXT: flat_load_dword v4, v[1:2]
352 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
353 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
354 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
355 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
356 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
357 ; GFX8-NEXT: s_waitcnt vmcnt(0)
358 ; GFX8-NEXT: v_or_b32_e32 v0, 0xff800000, v4
359 ; GFX8-NEXT: v_mul_i32_i24_e32 v0, -7, v0
360 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
361 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
362 ; GFX8-NEXT: s_endpgm
364 ; GFX9-LABEL: muli24_shl64:
365 ; GFX9: ; %bb.0: ; %bb
366 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
367 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
368 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
369 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
370 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
371 ; GFX9-NEXT: global_load_dword v1, v1, s[2:3]
372 ; GFX9-NEXT: s_waitcnt vmcnt(0)
373 ; GFX9-NEXT: v_or_b32_e32 v1, 0xff800000, v1
374 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, -7, v1
375 ; GFX9-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2]
376 ; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
377 ; GFX9-NEXT: s_endpgm
379 ; GFX10-LABEL: muli24_shl64:
380 ; GFX10: ; %bb.0: ; %bb
381 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
382 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0
383 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
384 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
385 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
386 ; GFX10-NEXT: global_load_dword v1, v1, s[2:3]
387 ; GFX10-NEXT: s_waitcnt vmcnt(0)
388 ; GFX10-NEXT: v_or_b32_e32 v1, 0xff800000, v1
389 ; GFX10-NEXT: v_mul_i32_i24_e32 v1, -7, v1
390 ; GFX10-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2]
391 ; GFX10-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
392 ; GFX10-NEXT: s_endpgm
394 ; GFX11-LABEL: muli24_shl64:
395 ; GFX11: ; %bb.0: ; %bb
396 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
397 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0
398 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v2
399 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v2
400 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
401 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
402 ; GFX11-NEXT: s_waitcnt vmcnt(0)
403 ; GFX11-NEXT: v_or_b32_e32 v0, 0xff800000, v0
404 ; GFX11-NEXT: v_mul_i32_i24_e32 v0, -7, v0
405 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
406 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
407 ; GFX11-NEXT: s_nop 0
408 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
409 ; GFX11-NEXT: s_endpgm
411 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
412 %tmp2 = sext i32 %tmp to i64
413 %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp2
414 %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
415 %tmp5 = or i32 %tmp4, -8388608
416 %tmp6 = mul nsw i32 %tmp5, -7
417 %tmp7 = zext i32 %tmp6 to i64
418 %tmp8 = shl nuw nsw i64 %tmp7, 3
419 %tmp9 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp2
420 store i64 %tmp8, ptr addrspace(1) %tmp9, align 8
; Scalar-register <2 x i32> zext case: both lanes are masked with 1073741823,
; zero-extended to <2 x i64> and shifted left by 2. The checks build the mask
; with s_brev_b32 s2, -4 (the bit reversal of 0xfffffffc is 0x3fffffff), apply
; it with one s_and_b64, then expect a 32-bit s_lshl_b32 per lane with the high
; halves s1 and s3 set to 0.
424 define amdgpu_ps <2 x i64> @s_shl_v2i64_zext_v2i32(<2 x i32> inreg %x) {
425 ; GCN-LABEL: s_shl_v2i64_zext_v2i32:
427 ; GCN-NEXT: s_brev_b32 s2, -4
428 ; GCN-NEXT: s_mov_b32 s3, s2
429 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
430 ; GCN-NEXT: s_lshl_b32 s0, s0, 2
431 ; GCN-NEXT: s_lshl_b32 s2, s1, 2
432 ; GCN-NEXT: s_mov_b32 s1, 0
433 ; GCN-NEXT: s_mov_b32 s3, 0
434 ; GCN-NEXT: ; return to shader part epilog
436 ; GFX10PLUS-LABEL: s_shl_v2i64_zext_v2i32:
437 ; GFX10PLUS: ; %bb.0:
438 ; GFX10PLUS-NEXT: s_brev_b32 s2, -4
439 ; GFX10PLUS-NEXT: s_mov_b32 s3, s2
440 ; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
441 ; GFX10PLUS-NEXT: s_mov_b32 s3, 0
442 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2
443 ; GFX10PLUS-NEXT: s_lshl_b32 s2, s1, 2
444 ; GFX10PLUS-NEXT: s_mov_b32 s1, 0
445 ; GFX10PLUS-NEXT: ; return to shader part epilog
446 %and = and <2 x i32> %x, <i32 1073741823, i32 1073741823>
447 %ext = zext <2 x i32> %and to <2 x i64>
448 %shl = shl <2 x i64> %ext, <i64 2, i64 2>
; Vector-register <2 x i32> zext case: same IR as the scalar vector test with a
; non-inreg argument. The checks expect each lane to be masked with 0x3fffffff
; and shifted with a 32-bit v_lshlrev_b32 (results in v0 and v2), while the
; high halves v1 and v3 are set to 0.
452 define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
453 ; GCN-LABEL: v_shl_v2i64_zext_v2i32:
455 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
456 ; GCN-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0
457 ; GCN-NEXT: v_and_b32_e32 v1, 0x3fffffff, v1
458 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
459 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v1
460 ; GCN-NEXT: v_mov_b32_e32 v1, 0
461 ; GCN-NEXT: v_mov_b32_e32 v3, 0
462 ; GCN-NEXT: s_setpc_b64 s[30:31]
464 ; GFX10-LABEL: v_shl_v2i64_zext_v2i32:
466 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467 ; GFX10-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0
468 ; GFX10-NEXT: v_and_b32_e32 v1, 0x3fffffff, v1
469 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
470 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
471 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
472 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
473 ; GFX10-NEXT: s_setpc_b64 s[30:31]
475 ; GFX11-LABEL: v_shl_v2i64_zext_v2i32:
477 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
478 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0
479 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3fffffff, v1
480 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 2, v1
481 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
482 ; GFX11-NEXT: s_setpc_b64 s[30:31]
483 %and = and <2 x i32> %x, <i32 1073741823, i32 1073741823>
484 %ext = zext <2 x i32> %and to <2 x i64>
485 %shl = shl <2 x i64> %ext, <i64 2, i64 2>
; Scalar-register <2 x i32> sext case: both lanes are masked with 536870911,
; sign-extended to <2 x i64> and shifted left by 2. The checks build the mask
; with s_brev_b32 s2, -8 (the bit reversal of 0xfffffff8 is 0x1fffffff) and
; expect a 32-bit s_lshl_b32 per lane with the high halves s1 and s3 set to 0.
489 define amdgpu_ps <2 x i64> @s_shl_v2i64_sext_v2i32(<2 x i32> inreg %x) {
490 ; GCN-LABEL: s_shl_v2i64_sext_v2i32:
492 ; GCN-NEXT: s_brev_b32 s2, -8
493 ; GCN-NEXT: s_mov_b32 s3, s2
494 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
495 ; GCN-NEXT: s_lshl_b32 s0, s0, 2
496 ; GCN-NEXT: s_lshl_b32 s2, s1, 2
497 ; GCN-NEXT: s_mov_b32 s1, 0
498 ; GCN-NEXT: s_mov_b32 s3, 0
499 ; GCN-NEXT: ; return to shader part epilog
501 ; GFX10PLUS-LABEL: s_shl_v2i64_sext_v2i32:
502 ; GFX10PLUS: ; %bb.0:
503 ; GFX10PLUS-NEXT: s_brev_b32 s2, -8
504 ; GFX10PLUS-NEXT: s_mov_b32 s3, s2
505 ; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
506 ; GFX10PLUS-NEXT: s_mov_b32 s3, 0
507 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2
508 ; GFX10PLUS-NEXT: s_lshl_b32 s2, s1, 2
509 ; GFX10PLUS-NEXT: s_mov_b32 s1, 0
510 ; GFX10PLUS-NEXT: ; return to shader part epilog
511 %and = and <2 x i32> %x, <i32 536870911, i32 536870911>
512 %ext = sext <2 x i32> %and to <2 x i64>
513 %shl = shl <2 x i64> %ext, <i64 2, i64 2>
; Vector-register <2 x i32> sext case: same IR as the scalar vector sext test
; with a non-inreg argument. The checks expect each lane to be masked with
; 0x1fffffff and shifted with a 32-bit v_lshlrev_b32 (results in v0 and v2),
; while the high halves v1 and v3 are set to 0.
517 define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
518 ; GCN-LABEL: v_shl_v2i64_sext_v2i32:
520 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
521 ; GCN-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
522 ; GCN-NEXT: v_and_b32_e32 v1, 0x1fffffff, v1
523 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
524 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v1
525 ; GCN-NEXT: v_mov_b32_e32 v1, 0
526 ; GCN-NEXT: v_mov_b32_e32 v3, 0
527 ; GCN-NEXT: s_setpc_b64 s[30:31]
529 ; GFX10-LABEL: v_shl_v2i64_sext_v2i32:
531 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
532 ; GFX10-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
533 ; GFX10-NEXT: v_and_b32_e32 v1, 0x1fffffff, v1
534 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
535 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
536 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
537 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
538 ; GFX10-NEXT: s_setpc_b64 s[30:31]
540 ; GFX11-LABEL: v_shl_v2i64_sext_v2i32:
542 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
543 ; GFX11-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
544 ; GFX11-NEXT: v_and_b32_e32 v1, 0x1fffffff, v1
545 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 2, v1
546 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
547 ; GFX11-NEXT: s_setpc_b64 s[30:31]
548 %and = and <2 x i32> %x, <i32 536870911, i32 536870911>
549 %ext = sext <2 x i32> %and to <2 x i64>
550 %shl = shl <2 x i64> %ext, <i64 2, i64 2>
; Scalar-register i16 -> i32 case: %x is masked with 16383 (0x3fff in the
; checks), zero-extended to i32 and shifted left by 2. Every target is expected
; to emit s_and_b32 followed by s_lshl_b32; GFX7 additionally re-masks the
; result with 0xffff.
554 define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) {
555 ; GFX7-LABEL: s_shl_i32_zext_i16:
557 ; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff
558 ; GFX7-NEXT: s_lshl_b32 s0, s0, 2
559 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
560 ; GFX7-NEXT: ; return to shader part epilog
562 ; GFX8-LABEL: s_shl_i32_zext_i16:
564 ; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff
565 ; GFX8-NEXT: s_lshl_b32 s0, s0, 2
566 ; GFX8-NEXT: ; return to shader part epilog
568 ; GFX9-LABEL: s_shl_i32_zext_i16:
570 ; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff
571 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
572 ; GFX9-NEXT: ; return to shader part epilog
574 ; GFX10PLUS-LABEL: s_shl_i32_zext_i16:
575 ; GFX10PLUS: ; %bb.0:
576 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff
577 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2
578 ; GFX10PLUS-NEXT: ; return to shader part epilog
579 %and = and i16 %x, 16383
580 %ext = zext i16 %and to i32
581 %shl = shl i32 %ext, 2
; Vector-register i16 -> i32 case: same IR as the scalar i16 test with a
; non-inreg argument. Expected code differs per target: GFX7 uses a 32-bit
; v_lshlrev_b32 and re-masks with 0xffff; GFX8 and GFX9 use a 16-bit
; v_lshlrev_b16_e32 with no trailing mask; the GFX10PLUS checks use
; v_lshlrev_b16 followed by an and with 0xffff.
585 define i32 @v_shl_i32_zext_i16(i16 %x) {
586 ; GFX7-LABEL: v_shl_i32_zext_i16:
588 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
589 ; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff, v0
590 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
591 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
592 ; GFX7-NEXT: s_setpc_b64 s[30:31]
594 ; GFX8-LABEL: v_shl_i32_zext_i16:
596 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
597 ; GFX8-NEXT: v_and_b32_e32 v0, 0x3fff, v0
598 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 2, v0
599 ; GFX8-NEXT: s_setpc_b64 s[30:31]
601 ; GFX9-LABEL: v_shl_i32_zext_i16:
603 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
604 ; GFX9-NEXT: v_and_b32_e32 v0, 0x3fff, v0
605 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 2, v0
606 ; GFX9-NEXT: s_setpc_b64 s[30:31]
608 ; GFX10PLUS-LABEL: v_shl_i32_zext_i16:
609 ; GFX10PLUS: ; %bb.0:
610 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
611 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x3fff, v0
612 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 2, v0
613 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
614 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
615 %and = and i16 %x, 16383
616 %ext = zext i16 %and to i32
617 %shl = shl i32 %ext, 2
; Scalar-register <2 x i16> -> <2 x i32> case: both lanes are masked with
; 16383, zero-extended to <2 x i32> and shifted left by 2. Expected code
; differs per target: GFX7 and GFX8 shift each lane separately with
; s_lshl_b32, whereas the GFX9 and GFX10PLUS checks repack the shifted lanes
; with s_pack_ll_b32_b16 and then split them again with a shift right by 16
; and an and with 0xffff.
621 define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
622 ; GFX7-LABEL: s_shl_v2i32_zext_v2i16:
624 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
625 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
626 ; GFX7-NEXT: s_or_b32 s0, s1, s0
627 ; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff
628 ; GFX7-NEXT: s_lshr_b32 s1, s0, 16
629 ; GFX7-NEXT: s_lshl_b32 s0, s0, 2
630 ; GFX7-NEXT: s_lshl_b32 s1, s1, 2
631 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
632 ; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
633 ; GFX7-NEXT: ; return to shader part epilog
635 ; GFX8-LABEL: s_shl_v2i32_zext_v2i16:
637 ; GFX8-NEXT: s_movk_i32 s2, 0x3fff
638 ; GFX8-NEXT: s_mov_b32 s3, s2
639 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
640 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
641 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
642 ; GFX8-NEXT: s_lshl_b32 s0, s0, 2
643 ; GFX8-NEXT: s_lshl_b32 s1, s1, 2
644 ; GFX8-NEXT: ; return to shader part epilog
646 ; GFX9-LABEL: s_shl_v2i32_zext_v2i16:
648 ; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff3fff
649 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
650 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x20002
651 ; GFX9-NEXT: s_lshl_b32 s1, s1, 2
652 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
653 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
654 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
655 ; GFX9-NEXT: ; return to shader part epilog
657 ; GFX10PLUS-LABEL: s_shl_v2i32_zext_v2i16:
658 ; GFX10PLUS: ; %bb.0:
659 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff3fff
660 ; GFX10PLUS-NEXT: s_lshr_b32 s1, s0, 16
661 ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 0x20002
662 ; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 2
663 ; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s0, s1
664 ; GFX10PLUS-NEXT: s_and_b32 s0, s1, 0xffff
665 ; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, 16
666 ; GFX10PLUS-NEXT: ; return to shader part epilog
667 %and = and <2 x i16> %x, <i16 16383, i16 16383>
668 %ext = zext <2 x i16> %and to <2 x i32>
669 %shl = shl <2 x i32> %ext, <i32 2, i32 2>
673 ; FIXME: This doesn't do what we want. The pre-legalizer combiner
674 ; fails to handle the vector splat. The post-legalizer sees the zext
675 ; legalized into the and. This is probably not that important, since
676 ; we really do this combine at the machine level for lowered getelementptrs.
; Vector-register <2 x i16> -> <2 x i32> case: both lanes are masked with
; 16383, zero-extended to <2 x i32> and shifted left by 2. Expected code
; differs per target: GFX7 uses 32-bit shifts and re-masks each lane with
; 0xffff; GFX8 uses v_lshlrev_b16 for the low lane and an SDWA v_lshlrev_b16
; selecting WORD_1 for the high lane; the GFX9 and GFX10PLUS checks use a
; packed v_pk_lshlrev_b16 and then unpack the lanes with a shift right by 16
; and an and with 0xffff.
678 define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
679 ; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
681 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
682 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
683 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
684 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
685 ; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0
686 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
687 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
688 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v1
689 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
690 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
691 ; GFX7-NEXT: s_setpc_b64 s[30:31]
693 ; GFX8-LABEL: v_shl_v2i32_zext_v2i16:
695 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
696 ; GFX8-NEXT: v_and_b32_e32 v1, 0x3fff3fff, v0
697 ; GFX8-NEXT: v_mov_b32_e32 v2, 2
698 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 2, v1
699 ; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
700 ; GFX8-NEXT: s_setpc_b64 s[30:31]
702 ; GFX9-LABEL: v_shl_v2i32_zext_v2i16:
704 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
705 ; GFX9-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0
706 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 2, v0 op_sel_hi:[0,1]
707 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
708 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
709 ; GFX9-NEXT: s_setpc_b64 s[30:31]
711 ; GFX10PLUS-LABEL: v_shl_v2i32_zext_v2i16:
712 ; GFX10PLUS: ; %bb.0:
713 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
714 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0
715 ; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v1, 2, v0 op_sel_hi:[0,1]
716 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v1
717 ; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v1, 16, v1
718 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
719 %and = and <2 x i16> %x, <i16 16383, i16 16383>
720 %ext = zext <2 x i16> %and to <2 x i32>
721 %shl = shl <2 x i32> %ext, <i32 2, i32 2>
; Declaration of the intrinsic called by the mulu24_shl64 and muli24_shl64
; kernels, together with the attribute group (#0) attached to it.
725 declare i32 @llvm.amdgcn.workitem.id.x() #0
727 attributes #0 = { nounwind readnone speculatable willreturn }