1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=SI
3 ; RUN: llc < %s -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI
4 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefixes=EG
6 declare i32 @llvm.amdgcn.workitem.id.x() #0
8 declare i32 @llvm.amdgcn.workgroup.id.x() #0
10 define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
11 ; SI-LABEL: shl_v2i32:
13 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
14 ; SI-NEXT: s_mov_b32 s3, 0xf000
15 ; SI-NEXT: s_mov_b32 s2, -1
16 ; SI-NEXT: s_mov_b32 s10, s2
17 ; SI-NEXT: s_mov_b32 s11, s3
18 ; SI-NEXT: s_waitcnt lgkmcnt(0)
19 ; SI-NEXT: s_mov_b32 s8, s6
20 ; SI-NEXT: s_mov_b32 s9, s7
21 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
22 ; SI-NEXT: s_mov_b32 s0, s4
23 ; SI-NEXT: s_mov_b32 s1, s5
24 ; SI-NEXT: s_waitcnt vmcnt(0)
25 ; SI-NEXT: v_lshl_b32_e32 v1, v1, v3
26 ; SI-NEXT: v_lshl_b32_e32 v0, v0, v2
27 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
30 ; VI-LABEL: shl_v2i32:
32 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
33 ; VI-NEXT: s_mov_b32 s3, 0xf000
34 ; VI-NEXT: s_mov_b32 s2, -1
35 ; VI-NEXT: s_waitcnt lgkmcnt(0)
36 ; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
37 ; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
38 ; VI-NEXT: s_mov_b32 s0, s4
39 ; VI-NEXT: s_mov_b32 s1, s5
40 ; VI-NEXT: s_waitcnt lgkmcnt(0)
41 ; VI-NEXT: s_lshl_b32 s4, s9, s7
42 ; VI-NEXT: s_lshl_b32 s5, s8, s6
43 ; VI-NEXT: v_mov_b32_e32 v0, s5
44 ; VI-NEXT: v_mov_b32_e32 v1, s4
45 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
48 ; EG-LABEL: shl_v2i32:
50 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
52 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
53 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
56 ; EG-NEXT: Fetch clause starting at 6:
57 ; EG-NEXT: VTX_READ_64 T1.XY, T0.X, 8, #1
58 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
59 ; EG-NEXT: ALU clause starting at 10:
60 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
61 ; EG-NEXT: ALU clause starting at 11:
62 ; EG-NEXT: LSHL * T0.Y, T0.Y, T1.Y,
63 ; EG-NEXT: LSHL T0.X, T0.X, T1.X,
64 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
65 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
66 %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
67 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
68 %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
69 %result = shl <2 x i32> %a, %b
70 store <2 x i32> %result, <2 x i32> addrspace(1)* %out
71 ret void
72 }
74 define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
75 ; SI-LABEL: shl_v4i32:
77 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
78 ; SI-NEXT: s_mov_b32 s3, 0xf000
79 ; SI-NEXT: s_mov_b32 s2, -1
80 ; SI-NEXT: s_mov_b32 s10, s2
81 ; SI-NEXT: s_mov_b32 s11, s3
82 ; SI-NEXT: s_waitcnt lgkmcnt(0)
83 ; SI-NEXT: s_mov_b32 s8, s6
84 ; SI-NEXT: s_mov_b32 s9, s7
85 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
86 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
87 ; SI-NEXT: s_mov_b32 s0, s4
88 ; SI-NEXT: s_mov_b32 s1, s5
89 ; SI-NEXT: s_waitcnt vmcnt(0)
90 ; SI-NEXT: v_lshl_b32_e32 v3, v3, v7
91 ; SI-NEXT: v_lshl_b32_e32 v2, v2, v6
92 ; SI-NEXT: v_lshl_b32_e32 v1, v1, v5
93 ; SI-NEXT: v_lshl_b32_e32 v0, v0, v4
94 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
97 ; VI-LABEL: shl_v4i32:
99 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
100 ; VI-NEXT: s_mov_b32 s3, 0xf000
101 ; VI-NEXT: s_mov_b32 s2, -1
102 ; VI-NEXT: s_waitcnt lgkmcnt(0)
103 ; VI-NEXT: s_mov_b32 s0, s4
104 ; VI-NEXT: s_mov_b32 s1, s5
105 ; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
106 ; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10
107 ; VI-NEXT: s_waitcnt lgkmcnt(0)
108 ; VI-NEXT: s_lshl_b32 s7, s11, s7
109 ; VI-NEXT: s_lshl_b32 s6, s10, s6
110 ; VI-NEXT: s_lshl_b32 s5, s9, s5
111 ; VI-NEXT: s_lshl_b32 s4, s8, s4
112 ; VI-NEXT: v_mov_b32_e32 v0, s4
113 ; VI-NEXT: v_mov_b32_e32 v1, s5
114 ; VI-NEXT: v_mov_b32_e32 v2, s6
115 ; VI-NEXT: v_mov_b32_e32 v3, s7
116 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
119 ; EG-LABEL: shl_v4i32:
121 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
123 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
124 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
127 ; EG-NEXT: Fetch clause starting at 6:
128 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
129 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
130 ; EG-NEXT: ALU clause starting at 10:
131 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
132 ; EG-NEXT: ALU clause starting at 11:
133 ; EG-NEXT: LSHL * T0.W, T0.W, T1.W,
134 ; EG-NEXT: LSHL * T0.Z, T0.Z, T1.Z,
135 ; EG-NEXT: LSHL * T0.Y, T0.Y, T1.Y,
136 ; EG-NEXT: LSHL T0.X, T0.X, T1.X,
137 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
138 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
139 %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
140 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
141 %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
142 %result = shl <4 x i32> %a, %b
143 store <4 x i32> %result, <4 x i32> addrspace(1)* %out
144 ret void
145 }
147 define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
150 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
151 ; SI-NEXT: s_mov_b32 s3, 0xf000
152 ; SI-NEXT: s_mov_b32 s2, -1
153 ; SI-NEXT: s_waitcnt lgkmcnt(0)
154 ; SI-NEXT: s_mov_b32 s0, s4
155 ; SI-NEXT: s_mov_b32 s1, s5
156 ; SI-NEXT: s_mov_b32 s4, s6
157 ; SI-NEXT: s_mov_b32 s5, s7
158 ; SI-NEXT: s_mov_b32 s6, s2
159 ; SI-NEXT: s_mov_b32 s7, s3
160 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
161 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:2
162 ; SI-NEXT: s_waitcnt vmcnt(0)
163 ; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
164 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
169 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
170 ; VI-NEXT: s_mov_b32 s3, 0xf000
171 ; VI-NEXT: s_mov_b32 s2, -1
172 ; VI-NEXT: s_waitcnt lgkmcnt(0)
173 ; VI-NEXT: s_mov_b32 s0, s4
174 ; VI-NEXT: s_mov_b32 s1, s5
175 ; VI-NEXT: s_mov_b32 s4, s6
176 ; VI-NEXT: s_mov_b32 s5, s7
177 ; VI-NEXT: s_mov_b32 s6, s2
178 ; VI-NEXT: s_mov_b32 s7, s3
179 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
180 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:2
181 ; VI-NEXT: s_waitcnt vmcnt(0)
182 ; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
183 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
188 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
190 ; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
191 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
194 ; EG-NEXT: Fetch clause starting at 6:
195 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
196 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
197 ; EG-NEXT: ALU clause starting at 10:
198 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
199 ; EG-NEXT: ALU clause starting at 11:
200 ; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
201 ; EG-NEXT: LSHL * T1.W, T0.X, T1.X,
202 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
203 ; EG-NEXT: AND_INT T1.W, PS, literal.x,
204 ; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
205 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
206 ; EG-NEXT: LSHL T0.X, PV.W, PS,
207 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
208 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
209 ; EG-NEXT: MOV T0.Y, 0.0,
210 ; EG-NEXT: MOV * T0.Z, 0.0,
211 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
212 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
213 %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
214 %a = load i16, i16 addrspace(1)* %in
215 %b = load i16, i16 addrspace(1)* %b_ptr
216 %result = shl i16 %a, %b
217 store i16 %result, i16 addrspace(1)* %out
218 ret void
219 }
221 define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
222 ; SI-LABEL: shl_i16_v_s:
224 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
225 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
226 ; SI-NEXT: s_mov_b32 s3, 0xf000
227 ; SI-NEXT: s_mov_b32 s2, -1
228 ; SI-NEXT: s_waitcnt lgkmcnt(0)
229 ; SI-NEXT: s_mov_b32 s0, s4
230 ; SI-NEXT: s_mov_b32 s1, s5
231 ; SI-NEXT: s_mov_b32 s4, s6
232 ; SI-NEXT: s_mov_b32 s5, s7
233 ; SI-NEXT: s_mov_b32 s6, s2
234 ; SI-NEXT: s_mov_b32 s7, s3
235 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
236 ; SI-NEXT: s_and_b32 s8, s8, 0xffff
237 ; SI-NEXT: s_waitcnt vmcnt(0)
238 ; SI-NEXT: v_lshlrev_b32_e32 v0, s8, v0
239 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
242 ; VI-LABEL: shl_i16_v_s:
244 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
245 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
246 ; VI-NEXT: s_mov_b32 s3, 0xf000
247 ; VI-NEXT: s_mov_b32 s2, -1
248 ; VI-NEXT: s_waitcnt lgkmcnt(0)
249 ; VI-NEXT: s_mov_b32 s0, s4
250 ; VI-NEXT: s_mov_b32 s1, s5
251 ; VI-NEXT: s_mov_b32 s4, s6
252 ; VI-NEXT: s_mov_b32 s5, s7
253 ; VI-NEXT: s_mov_b32 s6, s2
254 ; VI-NEXT: s_mov_b32 s7, s3
255 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
256 ; VI-NEXT: s_and_b32 s4, s8, 0xffff
257 ; VI-NEXT: s_waitcnt vmcnt(0)
258 ; VI-NEXT: v_lshlrev_b32_e32 v0, s4, v0
259 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
262 ; EG-LABEL: shl_i16_v_s:
264 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
266 ; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
267 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
270 ; EG-NEXT: Fetch clause starting at 6:
271 ; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
272 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
273 ; EG-NEXT: ALU clause starting at 10:
274 ; EG-NEXT: MOV T0.X, 0.0,
275 ; EG-NEXT: MOV * T1.X, KC0[2].Z,
276 ; EG-NEXT: ALU clause starting at 12:
277 ; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
278 ; EG-NEXT: LSHL * T1.W, T1.X, T0.X,
279 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
280 ; EG-NEXT: AND_INT T1.W, PS, literal.x,
281 ; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
282 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
283 ; EG-NEXT: LSHL T0.X, PV.W, PS,
284 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
285 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
286 ; EG-NEXT: MOV T0.Y, 0.0,
287 ; EG-NEXT: MOV * T0.Z, 0.0,
288 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
289 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
290 %a = load i16, i16 addrspace(1)* %in
291 %result = shl i16 %a, %b
292 store i16 %result, i16 addrspace(1)* %out
293 ret void
294 }
296 define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
297 ; SI-LABEL: shl_i16_v_compute_s:
299 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
300 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
301 ; SI-NEXT: s_mov_b32 s3, 0xf000
302 ; SI-NEXT: s_mov_b32 s2, -1
303 ; SI-NEXT: s_waitcnt lgkmcnt(0)
304 ; SI-NEXT: s_mov_b32 s0, s4
305 ; SI-NEXT: s_mov_b32 s1, s5
306 ; SI-NEXT: s_mov_b32 s4, s6
307 ; SI-NEXT: s_mov_b32 s5, s7
308 ; SI-NEXT: s_mov_b32 s6, s2
309 ; SI-NEXT: s_mov_b32 s7, s3
310 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
311 ; SI-NEXT: s_add_i32 s8, s8, 3
312 ; SI-NEXT: s_and_b32 s4, s8, 0xffff
313 ; SI-NEXT: s_waitcnt vmcnt(0)
314 ; SI-NEXT: v_lshlrev_b32_e32 v0, s4, v0
315 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
318 ; VI-LABEL: shl_i16_v_compute_s:
320 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
321 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
322 ; VI-NEXT: s_mov_b32 s3, 0xf000
323 ; VI-NEXT: s_mov_b32 s2, -1
324 ; VI-NEXT: s_waitcnt lgkmcnt(0)
325 ; VI-NEXT: s_mov_b32 s0, s4
326 ; VI-NEXT: s_mov_b32 s1, s5
327 ; VI-NEXT: s_mov_b32 s4, s6
328 ; VI-NEXT: s_mov_b32 s5, s7
329 ; VI-NEXT: s_mov_b32 s6, s2
330 ; VI-NEXT: s_mov_b32 s7, s3
331 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
332 ; VI-NEXT: s_add_i32 s8, s8, 3
333 ; VI-NEXT: s_and_b32 s4, s8, 0xffff
334 ; VI-NEXT: s_waitcnt vmcnt(0)
335 ; VI-NEXT: v_lshlrev_b32_e32 v0, s4, v0
336 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
339 ; EG-LABEL: shl_i16_v_compute_s:
341 ; EG-NEXT: ALU 0, @12, KC0[], KC1[]
343 ; EG-NEXT: ALU 0, @13, KC0[CB0:0-32], KC1[]
345 ; EG-NEXT: ALU 15, @14, KC0[CB0:0-32], KC1[]
346 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
349 ; EG-NEXT: Fetch clause starting at 8:
350 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
351 ; EG-NEXT: Fetch clause starting at 10:
352 ; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
353 ; EG-NEXT: ALU clause starting at 12:
354 ; EG-NEXT: MOV * T0.X, 0.0,
355 ; EG-NEXT: ALU clause starting at 13:
356 ; EG-NEXT: MOV * T1.X, KC0[2].Z,
357 ; EG-NEXT: ALU clause starting at 14:
358 ; EG-NEXT: ADD_INT * T0.W, T0.X, literal.x,
359 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
360 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
361 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
362 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
363 ; EG-NEXT: LSHL * T0.W, T1.X, PV.W,
364 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
365 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
366 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
367 ; EG-NEXT: LSHL T0.X, PV.W, PS,
368 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
369 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
370 ; EG-NEXT: MOV T0.Y, 0.0,
371 ; EG-NEXT: MOV * T0.Z, 0.0,
372 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
373 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
374 %a = load i16, i16 addrspace(1)* %in
375 %b.add = add i16 %b, 3
376 %result = shl i16 %a, %b.add
377 store i16 %result, i16 addrspace(1)* %out
378 ret void
379 }
381 define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
382 ; SI-LABEL: shl_i16_computed_amount:
384 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
385 ; SI-NEXT: s_mov_b32 s3, 0xf000
386 ; SI-NEXT: s_mov_b32 s2, -1
387 ; SI-NEXT: s_mov_b32 s10, s2
388 ; SI-NEXT: s_mov_b32 s11, s3
389 ; SI-NEXT: s_waitcnt lgkmcnt(0)
390 ; SI-NEXT: s_mov_b32 s8, s6
391 ; SI-NEXT: s_mov_b32 s9, s7
392 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
393 ; SI-NEXT: v_mov_b32_e32 v1, 0
394 ; SI-NEXT: s_mov_b32 s14, 0
395 ; SI-NEXT: s_mov_b32 s15, s3
396 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
397 ; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
398 ; SI-NEXT: s_waitcnt vmcnt(0)
399 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 glc
400 ; SI-NEXT: s_waitcnt vmcnt(0)
401 ; SI-NEXT: s_mov_b32 s0, s4
402 ; SI-NEXT: s_mov_b32 s1, s5
403 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0
404 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
405 ; SI-NEXT: v_lshl_b32_e32 v0, v2, v0
406 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
409 ; VI-LABEL: shl_i16_computed_amount:
411 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
412 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
413 ; VI-NEXT: s_mov_b32 s3, 0xf000
414 ; VI-NEXT: s_mov_b32 s2, -1
415 ; VI-NEXT: s_mov_b32 s10, s2
416 ; VI-NEXT: s_waitcnt lgkmcnt(0)
417 ; VI-NEXT: v_mov_b32_e32 v1, s7
418 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
419 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
420 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
421 ; VI-NEXT: s_mov_b32 s8, s6
422 ; VI-NEXT: s_mov_b32 s9, s7
423 ; VI-NEXT: s_mov_b32 s11, s3
424 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
425 ; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
426 ; VI-NEXT: s_waitcnt vmcnt(0)
427 ; VI-NEXT: flat_load_ushort v0, v[0:1] glc
428 ; VI-NEXT: s_waitcnt vmcnt(0)
429 ; VI-NEXT: s_mov_b32 s0, s4
430 ; VI-NEXT: s_mov_b32 s1, s5
431 ; VI-NEXT: v_add_u16_e32 v0, 3, v0
432 ; VI-NEXT: v_lshlrev_b16_e32 v0, v0, v2
433 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
436 ; EG-LABEL: shl_i16_computed_amount:
438 ; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
440 ; EG-NEXT: ALU 1, @13, KC0[CB0:0-32], KC1[]
442 ; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
443 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
446 ; EG-NEXT: Fetch clause starting at 8:
447 ; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
448 ; EG-NEXT: Fetch clause starting at 10:
449 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
450 ; EG-NEXT: ALU clause starting at 12:
451 ; EG-NEXT: MOV * T1.X, KC0[2].Z,
452 ; EG-NEXT: ALU clause starting at 13:
453 ; EG-NEXT: LSHL * T0.W, T0.X, 1,
454 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
455 ; EG-NEXT: ALU clause starting at 15:
456 ; EG-NEXT: ADD_INT * T0.W, T0.X, literal.x,
457 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
458 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
459 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
460 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
461 ; EG-NEXT: LSHL * T0.W, T1.X, PV.W,
462 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
463 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
464 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
465 ; EG-NEXT: LSHL T0.X, PV.W, PS,
466 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
467 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
468 ; EG-NEXT: MOV T0.Y, 0.0,
469 ; EG-NEXT: MOV * T0.Z, 0.0,
470 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
471 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
472 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
473 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid
474 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
475 %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i16 1
476 %a = load volatile i16, i16 addrspace(1)* %in
477 %b = load volatile i16, i16 addrspace(1)* %b_ptr
478 %b.add = add i16 %b, 3
479 %result = shl i16 %a, %b.add
480 store i16 %result, i16 addrspace(1)* %out
481 ret void
482 }
484 define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
485 ; SI-LABEL: shl_i16_i_s:
487 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
488 ; SI-NEXT: s_load_dword s0, s[0:1], 0xb
489 ; SI-NEXT: s_mov_b32 s7, 0xf000
490 ; SI-NEXT: s_mov_b32 s6, -1
491 ; SI-NEXT: s_waitcnt lgkmcnt(0)
492 ; SI-NEXT: s_lshl_b32 s0, s0, 12
493 ; SI-NEXT: v_mov_b32_e32 v0, s0
494 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
497 ; VI-LABEL: shl_i16_i_s:
499 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
500 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
501 ; VI-NEXT: s_mov_b32 s7, 0xf000
502 ; VI-NEXT: s_mov_b32 s6, -1
503 ; VI-NEXT: s_waitcnt lgkmcnt(0)
504 ; VI-NEXT: s_lshl_b32 s0, s0, 12
505 ; VI-NEXT: v_mov_b32_e32 v0, s0
506 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
509 ; EG-LABEL: shl_i16_i_s:
511 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
513 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
514 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
517 ; EG-NEXT: Fetch clause starting at 6:
518 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
519 ; EG-NEXT: ALU clause starting at 8:
520 ; EG-NEXT: MOV * T0.X, 0.0,
521 ; EG-NEXT: ALU clause starting at 9:
522 ; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x,
523 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
524 ; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
525 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
526 ; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
527 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
528 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
529 ; EG-NEXT: 61440(8.609578e-41), 3(4.203895e-45)
530 ; EG-NEXT: LSHL T0.X, PV.W, PS,
531 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
532 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
533 ; EG-NEXT: MOV T0.Y, 0.0,
534 ; EG-NEXT: MOV * T0.Z, 0.0,
535 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
536 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
537 %result = shl i16 %a, 12
538 store i16 %result, i16 addrspace(1)* %out
539 ret void
540 }
542 define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
543 ; SI-LABEL: shl_v2i16:
545 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
546 ; SI-NEXT: s_mov_b32 s3, 0xf000
547 ; SI-NEXT: s_mov_b32 s2, -1
548 ; SI-NEXT: s_mov_b32 s10, s2
549 ; SI-NEXT: s_mov_b32 s11, s3
550 ; SI-NEXT: s_waitcnt lgkmcnt(0)
551 ; SI-NEXT: s_mov_b32 s8, s6
552 ; SI-NEXT: s_mov_b32 s9, s7
553 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
554 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
555 ; SI-NEXT: v_mov_b32_e32 v1, 0
556 ; SI-NEXT: s_mov_b32 s14, 0
557 ; SI-NEXT: s_mov_b32 s15, s3
558 ; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
559 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
560 ; SI-NEXT: s_mov_b32 s6, 0xffff
561 ; SI-NEXT: s_mov_b32 s0, s4
562 ; SI-NEXT: s_mov_b32 s1, s5
563 ; SI-NEXT: s_waitcnt vmcnt(1)
564 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
565 ; SI-NEXT: s_waitcnt vmcnt(0)
566 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
567 ; SI-NEXT: v_and_b32_e32 v0, s6, v0
568 ; SI-NEXT: v_lshl_b32_e32 v0, v2, v0
569 ; SI-NEXT: v_lshl_b32_e32 v1, v1, v3
570 ; SI-NEXT: v_and_b32_e32 v0, s6, v0
571 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
572 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
573 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
576 ; VI-LABEL: shl_v2i16:
578 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
579 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
580 ; VI-NEXT: s_waitcnt lgkmcnt(0)
581 ; VI-NEXT: v_mov_b32_e32 v1, s3
582 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
583 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
584 ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
585 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
586 ; VI-NEXT: flat_load_dword v0, v[0:1]
587 ; VI-NEXT: s_load_dword s4, s[2:3], 0x0
588 ; VI-NEXT: s_mov_b32 s3, 0xf000
589 ; VI-NEXT: s_mov_b32 s2, -1
590 ; VI-NEXT: s_waitcnt lgkmcnt(0)
591 ; VI-NEXT: s_lshr_b32 s5, s4, 16
592 ; VI-NEXT: v_mov_b32_e32 v1, s5
593 ; VI-NEXT: s_waitcnt vmcnt(0)
594 ; VI-NEXT: v_lshlrev_b16_e64 v2, v0, s4
595 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
596 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
597 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
600 ; EG-LABEL: shl_v2i16:
602 ; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
604 ; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
606 ; EG-NEXT: ALU 12, @16, KC0[CB0:0-32], KC1[]
607 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
610 ; EG-NEXT: Fetch clause starting at 8:
611 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1
612 ; EG-NEXT: Fetch clause starting at 10:
613 ; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
614 ; EG-NEXT: ALU clause starting at 12:
615 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
616 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
617 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
618 ; EG-NEXT: ALU clause starting at 15:
619 ; EG-NEXT: MOV * T7.X, KC0[2].Z,
620 ; EG-NEXT: ALU clause starting at 16:
621 ; EG-NEXT: AND_INT T0.Y, T0.X, literal.x,
622 ; EG-NEXT: AND_INT T0.Z, T7.X, literal.x, BS:VEC_120/SCL_212
623 ; EG-NEXT: LSHR T0.W, T0.X, literal.y,
624 ; EG-NEXT: LSHR * T1.W, T7.X, literal.y,
625 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
626 ; EG-NEXT: LSHL T0.W, PS, PV.W,
627 ; EG-NEXT: LSHL * T1.W, PV.Z, PV.Y,
628 ; EG-NEXT: AND_INT T1.W, PS, literal.x,
629 ; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
630 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
631 ; EG-NEXT: OR_INT T0.X, PV.W, PS,
632 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
633 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
634 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
635 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
636 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
637 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
638 %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
639 %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
640 %result = shl <2 x i16> %a, %b
641 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
642 ret void
643 }
645 define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
646 ; SI-LABEL: shl_v4i16:
648 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
649 ; SI-NEXT: s_mov_b32 s3, 0xf000
650 ; SI-NEXT: s_mov_b32 s2, 0
651 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
652 ; SI-NEXT: v_mov_b32_e32 v1, 0
653 ; SI-NEXT: s_waitcnt lgkmcnt(0)
654 ; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
655 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
656 ; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
657 ; SI-NEXT: s_mov_b32 s0, 0xffff
658 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
659 ; SI-NEXT: s_waitcnt vmcnt(1)
660 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
661 ; SI-NEXT: s_waitcnt vmcnt(0)
662 ; SI-NEXT: v_and_b32_e32 v8, s0, v4
663 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
664 ; SI-NEXT: v_and_b32_e32 v9, s0, v5
665 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
666 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
667 ; SI-NEXT: v_lshl_b32_e32 v5, v7, v5
668 ; SI-NEXT: v_lshl_b32_e32 v3, v3, v9
669 ; SI-NEXT: v_lshl_b32_e32 v4, v6, v4
670 ; SI-NEXT: v_lshl_b32_e32 v2, v2, v8
671 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
672 ; SI-NEXT: v_and_b32_e32 v3, s0, v3
673 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
674 ; SI-NEXT: v_and_b32_e32 v2, s0, v2
675 ; SI-NEXT: v_or_b32_e32 v3, v3, v5
676 ; SI-NEXT: v_or_b32_e32 v2, v2, v4
677 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
680 ; VI-LABEL: shl_v4i16:
682 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
683 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
684 ; VI-NEXT: s_waitcnt lgkmcnt(0)
685 ; VI-NEXT: v_mov_b32_e32 v1, s3
686 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
687 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
688 ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
689 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
690 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
691 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
692 ; VI-NEXT: v_mov_b32_e32 v5, s1
693 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
694 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
695 ; VI-NEXT: s_waitcnt vmcnt(0)
696 ; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
697 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
698 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0
699 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
700 ; VI-NEXT: v_or_b32_e32 v1, v6, v1
701 ; VI-NEXT: v_or_b32_e32 v0, v3, v0
702 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
705 ; EG-LABEL: shl_v4i16:
707 ; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
709 ; EG-NEXT: ALU 3, @15, KC0[], KC1[]
711 ; EG-NEXT: ALU 49, @19, KC0[CB0:0-32], KC1[]
712 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
715 ; EG-NEXT: Fetch clause starting at 8:
716 ; EG-NEXT: VTX_READ_64 T10.XY, T0.X, 0, #1
717 ; EG-NEXT: Fetch clause starting at 10:
718 ; EG-NEXT: VTX_READ_64 T10.XY, T0.X, 8, #1
719 ; EG-NEXT: ALU clause starting at 12:
720 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
721 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
722 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
723 ; EG-NEXT: ALU clause starting at 15:
724 ; EG-NEXT: MOV T4.X, T10.X,
725 ; EG-NEXT: MOV * T5.X, T10.Y,
726 ; EG-NEXT: MOV T0.Y, PV.X,
727 ; EG-NEXT: MOV * T0.Z, PS,
728 ; EG-NEXT: ALU clause starting at 19:
729 ; EG-NEXT: MOV T2.X, T10.X,
730 ; EG-NEXT: MOV * T3.X, T10.Y,
731 ; EG-NEXT: MOV T0.X, T6.X,
732 ; EG-NEXT: MOV * T1.Y, PV.X,
733 ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
734 ; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x,
735 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
736 ; EG-NEXT: LSHL * T1.W, PS, PV.W,
737 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
738 ; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
739 ; EG-NEXT: 65535(9.183409e-41), -65536(nan)
740 ; EG-NEXT: OR_INT * T1.W, PS, PV.W,
741 ; EG-NEXT: MOV T0.X, T3.X,
742 ; EG-NEXT: MOV * T6.X, PV.W,
743 ; EG-NEXT: MOV T1.Z, PS,
744 ; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
745 ; EG-NEXT: LSHR * T2.W, T0.Y, literal.x,
746 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
747 ; EG-NEXT: LSHL T1.W, PS, PV.W,
748 ; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x,
749 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
750 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
751 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
752 ; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
753 ; EG-NEXT: MOV T6.X, PV.W,
754 ; EG-NEXT: MOV T0.Y, T7.X,
755 ; EG-NEXT: AND_INT T1.W, T0.X, literal.x, BS:VEC_120/SCL_212
756 ; EG-NEXT: AND_INT * T2.W, T0.Z, literal.x,
757 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
758 ; EG-NEXT: LSHL T1.W, PS, PV.W,
759 ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
760 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
761 ; EG-NEXT: AND_INT * T1.W, PV.W, literal.x,
762 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
763 ; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
764 ; EG-NEXT: MOV * T7.X, PV.W,
765 ; EG-NEXT: MOV T0.Y, PV.X,
766 ; EG-NEXT: LSHR T1.W, T0.X, literal.x,
767 ; EG-NEXT: LSHR * T2.W, T0.Z, literal.x,
768 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
769 ; EG-NEXT: LSHL * T1.W, PS, PV.W,
770 ; EG-NEXT: AND_INT T0.Z, T0.Y, literal.x,
771 ; EG-NEXT: LSHL T1.W, PV.W, literal.y,
772 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
773 ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
774 ; EG-NEXT: LSHR T0.X, PS, literal.x,
775 ; EG-NEXT: OR_INT * T10.Y, PV.Z, PV.W,
776 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
777 ; EG-NEXT: MOV T7.X, PV.Y,
778 ; EG-NEXT: MOV * T10.X, T6.X,
779 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
780 %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
781 %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
782 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
783 %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep
784 %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
785 %result = shl <4 x i16> %a, %b
786 store <4 x i16> %result, <4 x i16> addrspace(1)* %gep.out
787 ret void
788 }
790 define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
793 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
794 ; SI-NEXT: s_mov_b32 s3, 0xf000
795 ; SI-NEXT: s_mov_b32 s2, -1
796 ; SI-NEXT: s_mov_b32 s10, s2
797 ; SI-NEXT: s_mov_b32 s11, s3
798 ; SI-NEXT: s_waitcnt lgkmcnt(0)
799 ; SI-NEXT: s_mov_b32 s8, s6
800 ; SI-NEXT: s_mov_b32 s9, s7
801 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
802 ; SI-NEXT: s_mov_b32 s0, s4
803 ; SI-NEXT: s_mov_b32 s1, s5
804 ; SI-NEXT: s_waitcnt vmcnt(0)
805 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
806 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
811 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
812 ; VI-NEXT: s_mov_b32 s7, 0xf000
813 ; VI-NEXT: s_mov_b32 s6, -1
814 ; VI-NEXT: s_waitcnt lgkmcnt(0)
815 ; VI-NEXT: s_mov_b32 s4, s0
816 ; VI-NEXT: s_mov_b32 s5, s1
817 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
818 ; VI-NEXT: s_waitcnt lgkmcnt(0)
819 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
820 ; VI-NEXT: v_mov_b32_e32 v0, s0
821 ; VI-NEXT: v_mov_b32_e32 v1, s1
822 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
827 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
829 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
830 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
833 ; EG-NEXT: Fetch clause starting at 6:
834 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
835 ; EG-NEXT: ALU clause starting at 8:
836 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
837 ; EG-NEXT: ALU clause starting at 9:
838 ; EG-NEXT: AND_INT T1.Y, T0.Z, literal.x,
839 ; EG-NEXT: LSHR T1.Z, T0.Y, 1,
840 ; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
841 ; EG-NEXT: NOT_INT * T1.W, T0.Z,
842 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
843 ; EG-NEXT: BIT_ALIGN_INT T1.Z, PV.Z, PV.W, PS,
844 ; EG-NEXT: LSHL T0.W, T0.X, PV.Y,
845 ; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x,
846 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
847 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
848 ; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
849 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
850 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
851 %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
852 %a = load i64, i64 addrspace(1)* %in
853 %b = load i64, i64 addrspace(1)* %b_ptr
854 %result = shl i64 %a, %b
855 store i64 %result, i64 addrspace(1)* %out
856 ret void
857 }
859 define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
860 ; SI-LABEL: shl_v2i64:
862 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
863 ; SI-NEXT: s_mov_b32 s3, 0xf000
864 ; SI-NEXT: s_mov_b32 s2, -1
865 ; SI-NEXT: s_mov_b32 s10, s2
866 ; SI-NEXT: s_mov_b32 s11, s3
867 ; SI-NEXT: s_waitcnt lgkmcnt(0)
868 ; SI-NEXT: s_mov_b32 s8, s6
869 ; SI-NEXT: s_mov_b32 s9, s7
870 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
871 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
872 ; SI-NEXT: s_mov_b32 s0, s4
873 ; SI-NEXT: s_mov_b32 s1, s5
874 ; SI-NEXT: s_waitcnt vmcnt(0)
875 ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6
876 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
877 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
880 ; VI-LABEL: shl_v2i64:
882 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
883 ; VI-NEXT: s_mov_b32 s3, 0xf000
884 ; VI-NEXT: s_mov_b32 s2, -1
885 ; VI-NEXT: s_waitcnt lgkmcnt(0)
886 ; VI-NEXT: s_mov_b32 s0, s4
887 ; VI-NEXT: s_mov_b32 s1, s5
888 ; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
889 ; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10
890 ; VI-NEXT: s_waitcnt lgkmcnt(0)
891 ; VI-NEXT: s_lshl_b64 s[6:7], s[10:11], s6
892 ; VI-NEXT: s_lshl_b64 s[4:5], s[8:9], s4
893 ; VI-NEXT: v_mov_b32_e32 v0, s4
894 ; VI-NEXT: v_mov_b32_e32 v1, s5
895 ; VI-NEXT: v_mov_b32_e32 v2, s6
896 ; VI-NEXT: v_mov_b32_e32 v3, s7
897 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
900 ; EG-LABEL: shl_v2i64:
902 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
904 ; EG-NEXT: ALU 22, @11, KC0[CB0:0-32], KC1[]
905 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
908 ; EG-NEXT: Fetch clause starting at 6:
909 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
910 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
911 ; EG-NEXT: ALU clause starting at 10:
912 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
913 ; EG-NEXT: ALU clause starting at 11:
914 ; EG-NEXT: AND_INT T1.Y, T1.Z, literal.x,
915 ; EG-NEXT: LSHR T2.Z, T0.W, 1,
916 ; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1,
917 ; EG-NEXT: NOT_INT * T1.W, T1.Z,
918 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
919 ; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Z, PV.W, PS,
920 ; EG-NEXT: LSHL * T1.W, T0.Z, PV.Y,
921 ; EG-NEXT: AND_INT T2.X, T1.Z, literal.x,
922 ; EG-NEXT: AND_INT T1.Y, T1.X, literal.y,
923 ; EG-NEXT: LSHR T0.Z, T0.Y, 1,
924 ; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
925 ; EG-NEXT: NOT_INT * T3.W, T1.X,
926 ; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
927 ; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
928 ; EG-NEXT: LSHL T0.Z, T0.X, PV.Y,
929 ; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
930 ; EG-NEXT: CNDE_INT * T3.W, PV.X, T0.W, T1.W,
931 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
932 ; EG-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z,
933 ; EG-NEXT: CNDE_INT * T3.Z, T2.X, T1.W, 0.0,
934 ; EG-NEXT: CNDE_INT T3.X, T2.W, T0.Z, 0.0,
935 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
936 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
937 %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
938 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
939 %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
940 %result = shl <2 x i64> %a, %b
941 store <2 x i64> %result, <2 x i64> addrspace(1)* %out
942 ret void
943 }
945 define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
946 ; SI-LABEL: shl_v4i64:
948 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
949 ; SI-NEXT: s_mov_b32 s3, 0xf000
950 ; SI-NEXT: s_mov_b32 s2, -1
951 ; SI-NEXT: s_mov_b32 s10, s2
952 ; SI-NEXT: s_mov_b32 s11, s3
953 ; SI-NEXT: s_waitcnt lgkmcnt(0)
954 ; SI-NEXT: s_mov_b32 s8, s6
955 ; SI-NEXT: s_mov_b32 s9, s7
956 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
957 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
958 ; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
959 ; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
960 ; SI-NEXT: s_mov_b32 s0, s4
961 ; SI-NEXT: s_mov_b32 s1, s5
962 ; SI-NEXT: s_waitcnt vmcnt(1)
963 ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
964 ; SI-NEXT: s_waitcnt vmcnt(0)
965 ; SI-NEXT: v_lshl_b64 v[6:7], v[6:7], v13
966 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], v11
967 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
968 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
969 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
972 ; VI-LABEL: shl_v4i64:
974 ; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
975 ; VI-NEXT: s_mov_b32 s19, 0xf000
976 ; VI-NEXT: s_mov_b32 s18, -1
977 ; VI-NEXT: s_waitcnt lgkmcnt(0)
978 ; VI-NEXT: s_mov_b32 s16, s8
979 ; VI-NEXT: s_mov_b32 s17, s9
980 ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
981 ; VI-NEXT: s_load_dwordx8 s[8:15], s[10:11], 0x20
982 ; VI-NEXT: s_waitcnt lgkmcnt(0)
983 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s14
984 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s12
985 ; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s10
986 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
987 ; VI-NEXT: v_mov_b32_e32 v0, s4
988 ; VI-NEXT: v_mov_b32_e32 v1, s5
989 ; VI-NEXT: v_mov_b32_e32 v2, s6
990 ; VI-NEXT: v_mov_b32_e32 v3, s7
991 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
993 ; VI-NEXT: v_mov_b32_e32 v0, s0
994 ; VI-NEXT: v_mov_b32_e32 v1, s1
995 ; VI-NEXT: v_mov_b32_e32 v2, s2
996 ; VI-NEXT: v_mov_b32_e32 v3, s3
997 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
1000 ; EG-LABEL: shl_v4i64:
1002 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
1004 ; EG-NEXT: ALU 47, @15, KC0[CB0:0-32], KC1[]
1005 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
1006 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
1008 ; EG-NEXT: Fetch clause starting at 6:
1009 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
1010 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1
1011 ; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 32, #1
1012 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
1013 ; EG-NEXT: ALU clause starting at 14:
1014 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1015 ; EG-NEXT: ALU clause starting at 15:
1016 ; EG-NEXT: AND_INT T4.Z, T1.Z, literal.x,
1017 ; EG-NEXT: LSHR T1.W, T0.W, 1,
1018 ; EG-NEXT: NOT_INT * T3.W, T1.Z,
1019 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1020 ; EG-NEXT: BIT_ALIGN_INT T4.X, T0.W, T0.Z, 1,
1021 ; EG-NEXT: AND_INT T1.Y, T3.Z, literal.x, BS:VEC_201
1022 ; EG-NEXT: LSHR T5.Z, T2.W, 1, BS:VEC_120/SCL_212
1023 ; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, T2.Z, 1, BS:VEC_102/SCL_221
1024 ; EG-NEXT: NOT_INT * T2.W, T3.Z,
1025 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1026 ; EG-NEXT: BIT_ALIGN_INT T3.Y, PV.Z, PV.W, PS,
1027 ; EG-NEXT: LSHL T2.Z, T2.Z, PV.Y,
1028 ; EG-NEXT: BIT_ALIGN_INT T0.W, T1.W, PV.X, T3.W,
1029 ; EG-NEXT: LSHL * T1.W, T0.Z, T4.Z,
1030 ; EG-NEXT: AND_INT T4.X, T1.Z, literal.x,
1031 ; EG-NEXT: AND_INT T1.Y, T1.X, literal.y,
1032 ; EG-NEXT: LSHR T0.Z, T0.Y, 1,
1033 ; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
1034 ; EG-NEXT: NOT_INT * T3.W, T1.X,
1035 ; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
1036 ; EG-NEXT: AND_INT T5.X, T3.Z, literal.x,
1037 ; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
1038 ; EG-NEXT: LSHL T0.Z, T0.X, PV.Y,
1039 ; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
1040 ; EG-NEXT: CNDE_INT * T4.W, PV.X, T0.W, T1.W,
1041 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1042 ; EG-NEXT: AND_INT T0.X, T3.X, literal.x,
1043 ; EG-NEXT: CNDE_INT T4.Y, PV.W, PV.Y, PV.Z,
1044 ; EG-NEXT: LSHR T1.Z, T2.Y, 1,
1045 ; EG-NEXT: BIT_ALIGN_INT T0.W, T2.Y, T2.X, 1,
1046 ; EG-NEXT: NOT_INT * T3.W, T3.X,
1047 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1048 ; EG-NEXT: BIT_ALIGN_INT T1.X, PV.Z, PV.W, PS,
1049 ; EG-NEXT: LSHL T0.Y, T2.X, PV.X,
1050 ; EG-NEXT: CNDE_INT T4.Z, T4.X, T1.W, 0.0, BS:VEC_120/SCL_212
1051 ; EG-NEXT: AND_INT * T0.W, T3.X, literal.x, BS:VEC_201
1052 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1053 ; EG-NEXT: CNDE_INT * T1.W, T5.X, T3.Y, T2.Z,
1054 ; EG-NEXT: CNDE_INT T4.X, T2.W, T0.Z, 0.0,
1055 ; EG-NEXT: CNDE_INT T1.Y, T0.W, T1.X, T0.Y, BS:VEC_120/SCL_212
1056 ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
1057 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1058 ; EG-NEXT: LSHR T0.X, PV.W, literal.x,
1059 ; EG-NEXT: CNDE_INT T1.Z, T5.X, T2.Z, 0.0,
1060 ; EG-NEXT: CNDE_INT * T1.X, T0.W, T0.Y, 0.0,
1061 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1062 ; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
1063 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1064 %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
1065 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
1066 %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
1067 %result = shl <4 x i64> %a, %b
1068 store <4 x i64> %result, <4 x i64> addrspace(1)* %out
1069 ret void
1070 }
1072 ; Make sure load width gets reduced to i32 load.
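; Shifting an i64 left by 32 clears the low dword and moves the low dword of %a into the high dword, so only the low 32 bits of %a need to be loaded.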
1073 define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
1074 ; SI-LABEL: s_shl_32_i64:
1076 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1077 ; SI-NEXT: s_load_dword s0, s[0:1], 0x13
1078 ; SI-NEXT: s_mov_b32 s7, 0xf000
1079 ; SI-NEXT: s_mov_b32 s6, -1
1080 ; SI-NEXT: v_mov_b32_e32 v0, 0
1081 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1082 ; SI-NEXT: v_mov_b32_e32 v1, s0
1083 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1086 ; VI-LABEL: s_shl_32_i64:
1088 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1089 ; VI-NEXT: s_load_dword s0, s[0:1], 0x4c
1090 ; VI-NEXT: s_mov_b32 s7, 0xf000
1091 ; VI-NEXT: s_mov_b32 s6, -1
1092 ; VI-NEXT: v_mov_b32_e32 v0, 0
1093 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1094 ; VI-NEXT: v_mov_b32_e32 v1, s0
1095 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1098 ; EG-LABEL: s_shl_32_i64:
1100 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
1101 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1104 ; EG-NEXT: ALU clause starting at 4:
1105 ; EG-NEXT: MOV * T0.Y, KC0[4].W,
1106 ; EG-NEXT: MOV T0.X, 0.0,
1107 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1108 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1109 %result = shl i64 %a, 32
1110 store i64 %result, i64 addrspace(1)* %out
1111 ret void
1112 }
1114 define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
1115 ; SI-LABEL: v_shl_32_i64:
1117 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
1118 ; SI-NEXT: s_ashr_i32 s3, s2, 31
1119 ; SI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
1120 ; SI-NEXT: v_mov_b32_e32 v0, s0
1121 ; SI-NEXT: s_mov_b32 s7, 0xf000
1122 ; SI-NEXT: s_mov_b32 s6, 0
1123 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1124 ; SI-NEXT: s_mov_b64 s[4:5], s[10:11]
1125 ; SI-NEXT: v_mov_b32_e32 v1, s1
1126 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
1127 ; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
1128 ; SI-NEXT: v_mov_b32_e32 v2, 0
1129 ; SI-NEXT: s_waitcnt vmcnt(0)
1130 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
1133 ; VI-LABEL: v_shl_32_i64:
1135 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1136 ; VI-NEXT: s_ashr_i32 s3, s2, 31
1137 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
1138 ; VI-NEXT: v_mov_b32_e32 v0, 0
1139 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1140 ; VI-NEXT: s_add_u32 s2, s6, s0
1141 ; VI-NEXT: s_addc_u32 s3, s7, s1
1142 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
1143 ; VI-NEXT: s_add_u32 s0, s4, s0
1144 ; VI-NEXT: s_addc_u32 s1, s5, s1
1145 ; VI-NEXT: v_mov_b32_e32 v3, s1
1146 ; VI-NEXT: v_mov_b32_e32 v2, s0
1147 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1148 ; VI-NEXT: v_mov_b32_e32 v1, s2
1149 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1152 ; EG-LABEL: v_shl_32_i64:
1154 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
1156 ; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
1157 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
1160 ; EG-NEXT: Fetch clause starting at 6:
1161 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1162 ; EG-NEXT: ALU clause starting at 8:
1163 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1164 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1165 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
1166 ; EG-NEXT: ALU clause starting at 11:
1167 ; EG-NEXT: MOV T1.X, 0.0,
1168 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1169 ; EG-NEXT: LSHR T2.X, PV.W, literal.x,
1170 ; EG-NEXT: MOV * T1.Y, T0.X,
1171 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1172 %tid = call i32 @llvm.amdgcn.workgroup.id.x() #0
1173 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
1174 %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
1175 %a = load i64, i64 addrspace(1)* %gep.in
1176 %result = shl i64 %a, 32
1177 store i64 %result, i64 addrspace(1)* %gep.out
1178 ret void
1179 }
1181 define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
1182 ; SI-LABEL: s_shl_constant_i64:
1184 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1185 ; SI-NEXT: s_mov_b32 s2, -1
1186 ; SI-NEXT: s_mov_b32 s9, 0xffff
1187 ; SI-NEXT: s_mov_b32 s8, s2
1188 ; SI-NEXT: s_mov_b32 s3, 0xf000
1189 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1190 ; SI-NEXT: s_mov_b32 s0, s4
1191 ; SI-NEXT: s_mov_b32 s1, s5
1192 ; SI-NEXT: s_lshl_b64 s[4:5], s[8:9], s6
1193 ; SI-NEXT: v_mov_b32_e32 v0, s4
1194 ; SI-NEXT: v_mov_b32_e32 v1, s5
1195 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1198 ; VI-LABEL: s_shl_constant_i64:
1200 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1201 ; VI-NEXT: s_mov_b32 s2, -1
1202 ; VI-NEXT: s_mov_b32 s9, 0xffff
1203 ; VI-NEXT: s_mov_b32 s8, s2
1204 ; VI-NEXT: s_mov_b32 s3, 0xf000
1205 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1206 ; VI-NEXT: s_mov_b32 s0, s4
1207 ; VI-NEXT: s_mov_b32 s1, s5
1208 ; VI-NEXT: s_lshl_b64 s[4:5], s[8:9], s6
1209 ; VI-NEXT: v_mov_b32_e32 v0, s4
1210 ; VI-NEXT: v_mov_b32_e32 v1, s5
1211 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1214 ; EG-LABEL: s_shl_constant_i64:
1216 ; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
1217 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1220 ; EG-NEXT: ALU clause starting at 4:
1221 ; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
1222 ; EG-NEXT: MOV T0.W, literal.y,
1223 ; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
1224 ; EG-NEXT: 31(4.344025e-44), -1(nan)
1225 ; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
1226 ; EG-NEXT: LSHL T0.W, literal.y, PV.Z,
1227 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z,
1228 ; EG-NEXT: 32767(4.591635e-41), -1(nan)
1229 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1230 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1231 ; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
1232 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1233 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1234 %shl = shl i64 281474976710655, %a
1235 store i64 %shl, i64 addrspace(1)* %out, align 8
1236 ret void
1237 }
1239 define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1240 ; SI-LABEL: v_shl_constant_i64:
1242 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1243 ; SI-NEXT: s_mov_b32 s3, 0xf000
1244 ; SI-NEXT: s_mov_b32 s2, -1
1245 ; SI-NEXT: s_mov_b32 s10, s2
1246 ; SI-NEXT: s_mov_b32 s11, s3
1247 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1248 ; SI-NEXT: s_mov_b32 s8, s6
1249 ; SI-NEXT: s_mov_b32 s9, s7
1250 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1251 ; SI-NEXT: s_mov_b32 s6, 0xab19b207
1252 ; SI-NEXT: s_movk_i32 s7, 0x11e
1253 ; SI-NEXT: s_mov_b32 s0, s4
1254 ; SI-NEXT: s_mov_b32 s1, s5
1255 ; SI-NEXT: s_waitcnt vmcnt(0)
1256 ; SI-NEXT: v_lshl_b64 v[0:1], s[6:7], v0
1257 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1260 ; VI-LABEL: v_shl_constant_i64:
1262 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1263 ; VI-NEXT: s_mov_b32 s7, 0xf000
1264 ; VI-NEXT: s_mov_b32 s6, -1
1265 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1266 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
1267 ; VI-NEXT: s_mov_b32 s4, s0
1268 ; VI-NEXT: s_mov_b32 s5, s1
1269 ; VI-NEXT: s_mov_b32 s0, 0xab19b207
1270 ; VI-NEXT: s_movk_i32 s1, 0x11e
1271 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1272 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
1273 ; VI-NEXT: v_mov_b32_e32 v0, s0
1274 ; VI-NEXT: v_mov_b32_e32 v1, s1
1275 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1278 ; EG-LABEL: v_shl_constant_i64:
1280 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1282 ; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
1283 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1286 ; EG-NEXT: Fetch clause starting at 6:
1287 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1288 ; EG-NEXT: ALU clause starting at 8:
1289 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1290 ; EG-NEXT: ALU clause starting at 9:
1291 ; EG-NEXT: NOT_INT T0.Z, T0.X,
1292 ; EG-NEXT: MOV T0.W, literal.x,
1293 ; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
1294 ; EG-NEXT: 1435293955(1.935796e+13), 31(4.344025e-44)
1295 ; EG-NEXT: LSHL T1.Z, literal.x, PS,
1296 ; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.W, PV.Z,
1297 ; EG-NEXT: AND_INT * T1.W, T0.X, literal.z,
1298 ; EG-NEXT: -1424379385(-5.460358e-13), 143(2.003857e-43)
1299 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1300 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1301 ; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0,
1302 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1303 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1304 %a = load i64, i64 addrspace(1)* %aptr, align 8
1305 %shl = shl i64 1231231234567, %a
1306 store i64 %shl, i64 addrspace(1)* %out, align 8
1307 ret void
1308 }
1310 define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1311 ; SI-LABEL: v_shl_i64_32_bit_constant:
1313 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1314 ; SI-NEXT: s_mov_b32 s3, 0xf000
1315 ; SI-NEXT: s_mov_b32 s2, -1
1316 ; SI-NEXT: s_mov_b32 s10, s2
1317 ; SI-NEXT: s_mov_b32 s11, s3
1318 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1319 ; SI-NEXT: s_mov_b32 s8, s6
1320 ; SI-NEXT: s_mov_b32 s9, s7
1321 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1322 ; SI-NEXT: s_mov_b64 s[6:7], 0x12d687
1323 ; SI-NEXT: s_mov_b32 s0, s4
1324 ; SI-NEXT: s_mov_b32 s1, s5
1325 ; SI-NEXT: s_waitcnt vmcnt(0)
1326 ; SI-NEXT: v_lshl_b64 v[0:1], s[6:7], v0
1327 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1330 ; VI-LABEL: v_shl_i64_32_bit_constant:
1332 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1333 ; VI-NEXT: s_mov_b32 s7, 0xf000
1334 ; VI-NEXT: s_mov_b32 s6, -1
1335 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1336 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
1337 ; VI-NEXT: s_mov_b32 s4, s0
1338 ; VI-NEXT: s_mov_b32 s5, s1
1339 ; VI-NEXT: s_mov_b64 s[0:1], 0x12d687
1340 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1341 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
1342 ; VI-NEXT: v_mov_b32_e32 v0, s0
1343 ; VI-NEXT: v_mov_b32_e32 v1, s1
1344 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1347 ; EG-LABEL: v_shl_i64_32_bit_constant:
1349 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1351 ; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
1352 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1355 ; EG-NEXT: Fetch clause starting at 6:
1356 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1357 ; EG-NEXT: ALU clause starting at 8:
1358 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1359 ; EG-NEXT: ALU clause starting at 9:
1360 ; EG-NEXT: AND_INT T0.W, T0.X, literal.x,
1361 ; EG-NEXT: NOT_INT * T1.W, T0.X,
1362 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1363 ; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1364 ; EG-NEXT: LSHL T0.W, literal.y, PV.W,
1365 ; EG-NEXT: AND_INT * T1.W, T0.X, literal.z,
1366 ; EG-NEXT: 617283(8.649977e-40), 1234567(1.729997e-39)
1367 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1368 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1369 ; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
1370 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1371 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1372 %a = load i64, i64 addrspace(1)* %aptr, align 8
1373 %shl = shl i64 1234567, %a
1374 store i64 %shl, i64 addrspace(1)* %out, align 8
1375 ret void
1376 }
1378 define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
1379 ; SI-LABEL: v_shl_inline_imm_64_i64:
1381 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1382 ; SI-NEXT: s_mov_b32 s3, 0xf000
1383 ; SI-NEXT: s_mov_b32 s2, -1
1384 ; SI-NEXT: s_mov_b32 s10, s2
1385 ; SI-NEXT: s_mov_b32 s11, s3
1386 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1387 ; SI-NEXT: s_mov_b32 s8, s6
1388 ; SI-NEXT: s_mov_b32 s9, s7
1389 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1390 ; SI-NEXT: s_mov_b32 s0, s4
1391 ; SI-NEXT: s_mov_b32 s1, s5
1392 ; SI-NEXT: s_waitcnt vmcnt(0)
1393 ; SI-NEXT: v_lshl_b64 v[0:1], 64, v0
1394 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1397 ; VI-LABEL: v_shl_inline_imm_64_i64:
1399 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1400 ; VI-NEXT: s_mov_b32 s7, 0xf000
1401 ; VI-NEXT: s_mov_b32 s6, -1
1402 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1403 ; VI-NEXT: s_mov_b32 s4, s0
1404 ; VI-NEXT: s_load_dword s0, s[2:3], 0x0
1405 ; VI-NEXT: s_mov_b32 s5, s1
1406 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1407 ; VI-NEXT: s_lshl_b64 s[0:1], 64, s0
1408 ; VI-NEXT: v_mov_b32_e32 v0, s0
1409 ; VI-NEXT: v_mov_b32_e32 v1, s1
1410 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1413 ; EG-LABEL: v_shl_inline_imm_64_i64:
1415 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1417 ; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
1418 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1421 ; EG-NEXT: Fetch clause starting at 6:
1422 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1423 ; EG-NEXT: ALU clause starting at 8:
1424 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1425 ; EG-NEXT: ALU clause starting at 9:
1426 ; EG-NEXT: AND_INT T0.W, T0.X, literal.x,
1427 ; EG-NEXT: NOT_INT * T1.W, T0.X,
1428 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1429 ; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1430 ; EG-NEXT: LSHL T0.W, literal.y, PV.W,
1431 ; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
1432 ; EG-NEXT: 32(4.484155e-44), 64(8.968310e-44)
1433 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1434 ; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
1435 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1436 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1437 %a = load i64, i64 addrspace(1)* %aptr, align 8
1438 %shl = shl i64 64, %a
1439 store i64 %shl, i64 addrspace(1)* %out, align 8
1440 ret void
1441 }
1443 define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1444 ; SI-LABEL: s_shl_inline_imm_64_i64:
1446 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1447 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
1448 ; SI-NEXT: s_mov_b32 s7, 0xf000
1449 ; SI-NEXT: s_mov_b32 s6, -1
1450 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1451 ; SI-NEXT: s_lshl_b64 s[0:1], 64, s0
1452 ; SI-NEXT: v_mov_b32_e32 v0, s0
1453 ; SI-NEXT: v_mov_b32_e32 v1, s1
1454 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1457 ; VI-LABEL: s_shl_inline_imm_64_i64:
1459 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1460 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
1461 ; VI-NEXT: s_mov_b32 s7, 0xf000
1462 ; VI-NEXT: s_mov_b32 s6, -1
1463 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1464 ; VI-NEXT: s_lshl_b64 s[0:1], 64, s0
1465 ; VI-NEXT: v_mov_b32_e32 v0, s0
1466 ; VI-NEXT: v_mov_b32_e32 v1, s1
1467 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1470 ; EG-LABEL: s_shl_inline_imm_64_i64:
1472 ; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
1473 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1476 ; EG-NEXT: ALU clause starting at 4:
1477 ; EG-NEXT: NOT_INT T0.W, KC0[2].W,
1478 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
1479 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1480 ; EG-NEXT: LSHL T0.Z, literal.x, PS,
1481 ; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
1482 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
1483 ; EG-NEXT: 64(8.968310e-44), 32(4.484155e-44)
1484 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1485 ; EG-NEXT: CNDE_INT T0.X, T1.W, T0.Z, 0.0,
1486 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1487 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1488 %shl = shl i64 64, %a
1489 store i64 %shl, i64 addrspace(1)* %out, align 8
1490 ret void
1491 }
1493 define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1494 ; SI-LABEL: s_shl_inline_imm_1_i64:
1496 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1497 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
1498 ; SI-NEXT: s_mov_b32 s7, 0xf000
1499 ; SI-NEXT: s_mov_b32 s6, -1
1500 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1501 ; SI-NEXT: s_lshl_b64 s[0:1], 1, s0
1502 ; SI-NEXT: v_mov_b32_e32 v0, s0
1503 ; SI-NEXT: v_mov_b32_e32 v1, s1
1504 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1507 ; VI-LABEL: s_shl_inline_imm_1_i64:
1509 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1510 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
1511 ; VI-NEXT: s_mov_b32 s7, 0xf000
1512 ; VI-NEXT: s_mov_b32 s6, -1
1513 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1514 ; VI-NEXT: s_lshl_b64 s[0:1], 1, s0
1515 ; VI-NEXT: v_mov_b32_e32 v0, s0
1516 ; VI-NEXT: v_mov_b32_e32 v1, s1
1517 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1520 ; EG-LABEL: s_shl_inline_imm_1_i64:
1522 ; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[]
1523 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1526 ; EG-NEXT: ALU clause starting at 4:
1527 ; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
1528 ; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.y,
1529 ; EG-NEXT: 31(4.344025e-44), 26(3.643376e-44)
1530 ; EG-NEXT: ASHR T1.W, PS, literal.x,
1531 ; EG-NEXT: LSHL * T0.W, 1, PV.W,
1532 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1533 ; EG-NEXT: AND_INT T0.Y, PV.W, PS,
1534 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
1535 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1536 ; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, 0.0,
1537 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1538 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1539 %shl = shl i64 1, %a
1540 store i64 %shl, i64 addrspace(1)* %out, align 8
1541 ret void
1542 }
1544 define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1545 ; SI-LABEL: s_shl_inline_imm_1_0_i64:
1547 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1548 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
1549 ; SI-NEXT: s_mov_b32 s7, 0xf000
1550 ; SI-NEXT: s_mov_b32 s6, -1
1551 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1552 ; SI-NEXT: s_lshl_b64 s[0:1], 1.0, s0
1553 ; SI-NEXT: v_mov_b32_e32 v0, s0
1554 ; SI-NEXT: v_mov_b32_e32 v1, s1
1555 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1558 ; VI-LABEL: s_shl_inline_imm_1_0_i64:
1560 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1561 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
1562 ; VI-NEXT: s_mov_b32 s7, 0xf000
1563 ; VI-NEXT: s_mov_b32 s6, -1
1564 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1565 ; VI-NEXT: s_lshl_b64 s[0:1], 1.0, s0
1566 ; VI-NEXT: v_mov_b32_e32 v0, s0
1567 ; VI-NEXT: v_mov_b32_e32 v1, s1
1568 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1571 ; EG-LABEL: s_shl_inline_imm_1_0_i64:
1573 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1574 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1577 ; EG-NEXT: ALU clause starting at 4:
1578 ; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
1579 ; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1580 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
1581 ; EG-NEXT: 536346624(1.050321e-19), 32(4.484155e-44)
1582 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
1583 ; EG-NEXT: MOV T0.X, 0.0,
1584 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1585 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1586 %shl = shl i64 4607182418800017408, %a
1587 store i64 %shl, i64 addrspace(1)* %out, align 8
1591 define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1592 ; SI-LABEL: s_shl_inline_imm_neg_1_0_i64:
1594 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1595 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
1596 ; SI-NEXT: s_mov_b32 s7, 0xf000
1597 ; SI-NEXT: s_mov_b32 s6, -1
1598 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1599 ; SI-NEXT: s_lshl_b64 s[0:1], -1.0, s0
1600 ; SI-NEXT: v_mov_b32_e32 v0, s0
1601 ; SI-NEXT: v_mov_b32_e32 v1, s1
1602 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1605 ; VI-LABEL: s_shl_inline_imm_neg_1_0_i64:
1607 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1608 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
1609 ; VI-NEXT: s_mov_b32 s7, 0xf000
1610 ; VI-NEXT: s_mov_b32 s6, -1
1611 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1612 ; VI-NEXT: s_lshl_b64 s[0:1], -1.0, s0
1613 ; VI-NEXT: v_mov_b32_e32 v0, s0
1614 ; VI-NEXT: v_mov_b32_e32 v1, s1
1615 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1618 ; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
1620 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1621 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1624 ; EG-NEXT: ALU clause starting at 4:
1625 ; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
1626 ; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1627 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
1628 ; EG-NEXT: 1610088448(3.574057e+19), 32(4.484155e-44)
1629 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
1630 ; EG-NEXT: MOV T0.X, 0.0,
1631 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1632 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1633 %shl = shl i64 13830554455654793216, %a
1634 store i64 %shl, i64 addrspace(1)* %out, align 8
1638 define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1639 ; SI-LABEL: s_shl_inline_imm_0_5_i64:
1641 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1642 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
1643 ; SI-NEXT: s_mov_b32 s7, 0xf000
1644 ; SI-NEXT: s_mov_b32 s6, -1
1645 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1646 ; SI-NEXT: s_lshl_b64 s[0:1], 0.5, s0
1647 ; SI-NEXT: v_mov_b32_e32 v0, s0
1648 ; SI-NEXT: v_mov_b32_e32 v1, s1
1649 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1652 ; VI-LABEL: s_shl_inline_imm_0_5_i64:
1654 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1655 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
1656 ; VI-NEXT: s_mov_b32 s7, 0xf000
1657 ; VI-NEXT: s_mov_b32 s6, -1
1658 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1659 ; VI-NEXT: s_lshl_b64 s[0:1], 0.5, s0
1660 ; VI-NEXT: v_mov_b32_e32 v0, s0
1661 ; VI-NEXT: v_mov_b32_e32 v1, s1
1662 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1665 ; EG-LABEL: s_shl_inline_imm_0_5_i64:
1667 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1668 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1671 ; EG-NEXT: ALU clause starting at 4:
1672 ; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
1673 ; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1674 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
1675 ; EG-NEXT: 535822336(1.016440e-19), 32(4.484155e-44)
1676 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
1677 ; EG-NEXT: MOV T0.X, 0.0,
1678 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1679 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1680 %shl = shl i64 4602678819172646912, %a
1681 store i64 %shl, i64 addrspace(1)* %out, align 8
1685 define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1686 ; SI-LABEL: s_shl_inline_imm_neg_0_5_i64:
1688 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1689 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
1690 ; SI-NEXT: s_mov_b32 s7, 0xf000
1691 ; SI-NEXT: s_mov_b32 s6, -1
1692 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1693 ; SI-NEXT: s_lshl_b64 s[0:1], -0.5, s0
1694 ; SI-NEXT: v_mov_b32_e32 v0, s0
1695 ; SI-NEXT: v_mov_b32_e32 v1, s1
1696 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1699 ; VI-LABEL: s_shl_inline_imm_neg_0_5_i64:
1701 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1702 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
1703 ; VI-NEXT: s_mov_b32 s7, 0xf000
1704 ; VI-NEXT: s_mov_b32 s6, -1
1705 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1706 ; VI-NEXT: s_lshl_b64 s[0:1], -0.5, s0
1707 ; VI-NEXT: v_mov_b32_e32 v0, s0
1708 ; VI-NEXT: v_mov_b32_e32 v1, s1
1709 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1712 ; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
1714 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1715 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1718 ; EG-NEXT: ALU clause starting at 4:
1719 ; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
1720 ; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1721 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
1722 ; EG-NEXT: 1609564160(3.458765e+19), 32(4.484155e-44)
1723 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
1724 ; EG-NEXT: MOV T0.X, 0.0,
1725 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1726 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1727 %shl = shl i64 13826050856027422720, %a
1728 store i64 %shl, i64 addrspace(1)* %out, align 8
1732 define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1733 ; SI-LABEL: s_shl_inline_imm_2_0_i64:
1735 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1736 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
1737 ; SI-NEXT: s_mov_b32 s7, 0xf000
1738 ; SI-NEXT: s_mov_b32 s6, -1
1739 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1740 ; SI-NEXT: s_lshl_b64 s[0:1], 2.0, s0
1741 ; SI-NEXT: v_mov_b32_e32 v0, s0
1742 ; SI-NEXT: v_mov_b32_e32 v1, s1
1743 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1746 ; VI-LABEL: s_shl_inline_imm_2_0_i64:
1748 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1749 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
1750 ; VI-NEXT: s_mov_b32 s7, 0xf000
1751 ; VI-NEXT: s_mov_b32 s6, -1
1752 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1753 ; VI-NEXT: s_lshl_b64 s[0:1], 2.0, s0
1754 ; VI-NEXT: v_mov_b32_e32 v0, s0
1755 ; VI-NEXT: v_mov_b32_e32 v1, s1
1756 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1759 ; EG-LABEL: s_shl_inline_imm_2_0_i64:
1761 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1762 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1765 ; EG-NEXT: ALU clause starting at 4:
1766 ; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
1767 ; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1768 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
1769 ; EG-NEXT: 536870912(1.084202e-19), 32(4.484155e-44)
1770 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
1771 ; EG-NEXT: MOV T0.X, 0.0,
1772 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1773 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1774 %shl = shl i64 4611686018427387904, %a
1775 store i64 %shl, i64 addrspace(1)* %out, align 8
1779 define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1780 ; SI-LABEL: s_shl_inline_imm_neg_2_0_i64:
1782 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1783 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
1784 ; SI-NEXT: s_mov_b32 s7, 0xf000
1785 ; SI-NEXT: s_mov_b32 s6, -1
1786 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1787 ; SI-NEXT: s_lshl_b64 s[0:1], -2.0, s0
1788 ; SI-NEXT: v_mov_b32_e32 v0, s0
1789 ; SI-NEXT: v_mov_b32_e32 v1, s1
1790 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1793 ; VI-LABEL: s_shl_inline_imm_neg_2_0_i64:
1795 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1796 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
1797 ; VI-NEXT: s_mov_b32 s7, 0xf000
1798 ; VI-NEXT: s_mov_b32 s6, -1
1799 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1800 ; VI-NEXT: s_lshl_b64 s[0:1], -2.0, s0
1801 ; VI-NEXT: v_mov_b32_e32 v0, s0
1802 ; VI-NEXT: v_mov_b32_e32 v1, s1
1803 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1806 ; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
1808 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1809 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1812 ; EG-NEXT: ALU clause starting at 4:
1813 ; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
1814 ; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1815 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
1816 ; EG-NEXT: 1610612736(3.689349e+19), 32(4.484155e-44)
1817 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
1818 ; EG-NEXT: MOV T0.X, 0.0,
1819 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1820 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1821 %shl = shl i64 13835058055282163712, %a
1822 store i64 %shl, i64 addrspace(1)* %out, align 8
1826 define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1827 ; SI-LABEL: s_shl_inline_imm_4_0_i64:
1829 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1830 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
1831 ; SI-NEXT: s_mov_b32 s7, 0xf000
1832 ; SI-NEXT: s_mov_b32 s6, -1
1833 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1834 ; SI-NEXT: s_lshl_b64 s[0:1], 4.0, s0
1835 ; SI-NEXT: v_mov_b32_e32 v0, s0
1836 ; SI-NEXT: v_mov_b32_e32 v1, s1
1837 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1840 ; VI-LABEL: s_shl_inline_imm_4_0_i64:
1842 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1843 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
1844 ; VI-NEXT: s_mov_b32 s7, 0xf000
1845 ; VI-NEXT: s_mov_b32 s6, -1
1846 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1847 ; VI-NEXT: s_lshl_b64 s[0:1], 4.0, s0
1848 ; VI-NEXT: v_mov_b32_e32 v0, s0
1849 ; VI-NEXT: v_mov_b32_e32 v1, s1
1850 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1853 ; EG-LABEL: s_shl_inline_imm_4_0_i64:
1855 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1856 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1859 ; EG-NEXT: ALU clause starting at 4:
1860 ; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
1861 ; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1862 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
1863 ; EG-NEXT: 537395200(1.151965e-19), 32(4.484155e-44)
1864 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
1865 ; EG-NEXT: MOV T0.X, 0.0,
1866 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1867 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1868 %shl = shl i64 4616189618054758400, %a
1869 store i64 %shl, i64 addrspace(1)* %out, align 8
1873 define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1874 ; SI-LABEL: s_shl_inline_imm_neg_4_0_i64:
1876 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1877 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
1878 ; SI-NEXT: s_mov_b32 s7, 0xf000
1879 ; SI-NEXT: s_mov_b32 s6, -1
1880 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1881 ; SI-NEXT: s_lshl_b64 s[0:1], -4.0, s0
1882 ; SI-NEXT: v_mov_b32_e32 v0, s0
1883 ; SI-NEXT: v_mov_b32_e32 v1, s1
1884 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1887 ; VI-LABEL: s_shl_inline_imm_neg_4_0_i64:
1889 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1890 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
1891 ; VI-NEXT: s_mov_b32 s7, 0xf000
1892 ; VI-NEXT: s_mov_b32 s6, -1
1893 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1894 ; VI-NEXT: s_lshl_b64 s[0:1], -4.0, s0
1895 ; VI-NEXT: v_mov_b32_e32 v0, s0
1896 ; VI-NEXT: v_mov_b32_e32 v1, s1
1897 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1900 ; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
1902 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1903 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1906 ; EG-NEXT: ALU clause starting at 4:
1907 ; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
1908 ; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1909 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
1910 ; EG-NEXT: 1611137024(3.919933e+19), 32(4.484155e-44)
1911 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
1912 ; EG-NEXT: MOV T0.X, 0.0,
1913 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1914 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1915 %shl = shl i64 13839561654909534208, %a
1916 store i64 %shl, i64 addrspace(1)* %out, align 8
1921 ; Test with the 64-bit integer bitpattern for a 32-bit float in the
1922 ; low 32-bits, which is not a valid 64-bit inline immediate.
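; For reference (an added note, not from the autogenerated checks): 1082130432
; is 0x40800000, the IEEE-754 single-precision bit pattern of 4.0. Widened to
; i64 it is 0x0000000040800000, which is not a 64-bit inline immediate, so the
; SI/VI checks below expect the constant to be materialized with s_mov_b64
; before the s_lshl_b64.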
1923 define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1924 ; SI-LABEL: s_shl_inline_imm_f32_4_0_i64:
1926 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1927 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
1928 ; SI-NEXT: s_mov_b64 s[0:1], 0x40800000
1929 ; SI-NEXT: s_mov_b32 s7, 0xf000
1930 ; SI-NEXT: s_mov_b32 s6, -1
1931 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1932 ; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
1933 ; SI-NEXT: v_mov_b32_e32 v0, s0
1934 ; SI-NEXT: v_mov_b32_e32 v1, s1
1935 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1938 ; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
1940 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1941 ; VI-NEXT: s_load_dword s2, s[0:1], 0x34
1942 ; VI-NEXT: s_mov_b64 s[0:1], 0x40800000
1943 ; VI-NEXT: s_mov_b32 s7, 0xf000
1944 ; VI-NEXT: s_mov_b32 s6, -1
1945 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1946 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
1947 ; VI-NEXT: v_mov_b32_e32 v0, s0
1948 ; VI-NEXT: v_mov_b32_e32 v1, s1
1949 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1952 ; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
1954 ; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[]
1955 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1958 ; EG-NEXT: ALU clause starting at 4:
1959 ; EG-NEXT: NOT_INT T0.W, KC0[2].W,
1960 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
1961 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1962 ; EG-NEXT: LSHL T0.Z, literal.x, PS,
1963 ; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
1964 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z,
1965 ; EG-NEXT: 1082130432(4.000000e+00), 541065216(1.626303e-19)
1966 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
1967 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1968 ; EG-NEXT: CNDE_INT T0.X, T1.W, T0.Z, 0.0,
1969 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1970 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1971 %shl = shl i64 1082130432, %a
1972 store i64 %shl, i64 addrspace(1)* %out, align 8
1976 ; FIXME: Copy of -1 register
1977 define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
1978 ; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1980 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1981 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
1982 ; SI-NEXT: s_mov_b32 s6, -1
1983 ; SI-NEXT: s_mov_b32 s0, -4.0
1984 ; SI-NEXT: s_mov_b32 s1, s6
1985 ; SI-NEXT: s_mov_b32 s7, 0xf000
1986 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1987 ; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
1988 ; SI-NEXT: v_mov_b32_e32 v0, s0
1989 ; SI-NEXT: v_mov_b32_e32 v1, s1
1990 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1993 ; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1995 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1996 ; VI-NEXT: s_load_dword s2, s[0:1], 0x34
1997 ; VI-NEXT: s_mov_b32 s6, -1
1998 ; VI-NEXT: s_mov_b32 s0, -4.0
1999 ; VI-NEXT: s_mov_b32 s1, s6
2000 ; VI-NEXT: s_mov_b32 s7, 0xf000
2001 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2002 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
2003 ; VI-NEXT: v_mov_b32_e32 v0, s0
2004 ; VI-NEXT: v_mov_b32_e32 v1, s1
2005 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2008 ; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
2010 ; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
2011 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2014 ; EG-NEXT: ALU clause starting at 4:
2015 ; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
2016 ; EG-NEXT: MOV T0.W, literal.y,
2017 ; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
2018 ; EG-NEXT: 31(4.344025e-44), -532676608(-5.534023e+19)
2019 ; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
2020 ; EG-NEXT: LSHL T0.W, literal.y, PV.Z,
2021 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z,
2022 ; EG-NEXT: 2147483647(nan), -1065353216(-4.000000e+00)
2023 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
2024 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
2025 ; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
2026 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2027 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2028 %shl = shl i64 -1065353216, %a
2029 store i64 %shl, i64 addrspace(1)* %out, align 8
2033 define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
2034 ; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
2036 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
2037 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
2038 ; SI-NEXT: s_mov_b32 s0, 0
2039 ; SI-NEXT: s_mov_b32 s1, 4.0
2040 ; SI-NEXT: s_mov_b32 s7, 0xf000
2041 ; SI-NEXT: s_mov_b32 s6, -1
2042 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2043 ; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
2044 ; SI-NEXT: v_mov_b32_e32 v0, s0
2045 ; SI-NEXT: v_mov_b32_e32 v1, s1
2046 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2049 ; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
2051 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2052 ; VI-NEXT: s_load_dword s2, s[0:1], 0x34
2053 ; VI-NEXT: s_mov_b32 s0, 0
2054 ; VI-NEXT: s_mov_b32 s1, 4.0
2055 ; VI-NEXT: s_mov_b32 s7, 0xf000
2056 ; VI-NEXT: s_mov_b32 s6, -1
2057 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2058 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
2059 ; VI-NEXT: v_mov_b32_e32 v0, s0
2060 ; VI-NEXT: v_mov_b32_e32 v1, s1
2061 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2064 ; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
2066 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
2067 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2070 ; EG-NEXT: ALU clause starting at 4:
2071 ; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
2072 ; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
2073 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
2074 ; EG-NEXT: 541065216(1.626303e-19), 32(4.484155e-44)
2075 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
2076 ; EG-NEXT: MOV T0.X, 0.0,
2077 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2078 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2079 %shl = shl i64 4647714815446351872, %a
2080 store i64 %shl, i64 addrspace(1)* %out, align 8
2084 define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
2085 ; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2087 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
2088 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
2089 ; SI-NEXT: s_mov_b32 s0, 0
2090 ; SI-NEXT: s_mov_b32 s1, -4.0
2091 ; SI-NEXT: s_mov_b32 s7, 0xf000
2092 ; SI-NEXT: s_mov_b32 s6, -1
2093 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2094 ; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
2095 ; SI-NEXT: v_mov_b32_e32 v0, s0
2096 ; SI-NEXT: v_mov_b32_e32 v1, s1
2097 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2100 ; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2102 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2103 ; VI-NEXT: s_load_dword s2, s[0:1], 0x34
2104 ; VI-NEXT: s_mov_b32 s0, 0
2105 ; VI-NEXT: s_mov_b32 s1, -4.0
2106 ; VI-NEXT: s_mov_b32 s7, 0xf000
2107 ; VI-NEXT: s_mov_b32 s6, -1
2108 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2109 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
2110 ; VI-NEXT: v_mov_b32_e32 v0, s0
2111 ; VI-NEXT: v_mov_b32_e32 v1, s1
2112 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2115 ; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2117 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
2118 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2121 ; EG-NEXT: ALU clause starting at 4:
2122 ; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
2123 ; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
2124 ; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
2125 ; EG-NEXT: 1614807040(5.534023e+19), 32(4.484155e-44)
2126 ; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
2127 ; EG-NEXT: MOV T0.X, 0.0,
2128 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2129 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2130 %shl = shl i64 13871086852301127680, %a
2131 store i64 %shl, i64 addrspace(1)* %out, align 8
2135 define amdgpu_kernel void @test_mul2(i32 %p) {
2136 ; SI-LABEL: test_mul2:
2138 ; SI-NEXT: s_load_dword s0, s[0:1], 0x9
2139 ; SI-NEXT: s_mov_b32 s3, 0xf000
2140 ; SI-NEXT: s_mov_b32 s2, -1
2141 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2142 ; SI-NEXT: s_lshl_b32 s0, s0, 1
2143 ; SI-NEXT: v_mov_b32_e32 v0, s0
2144 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2145 ; SI-NEXT: s_waitcnt vmcnt(0)
2148 ; VI-LABEL: test_mul2:
2150 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
2151 ; VI-NEXT: s_mov_b32 s3, 0xf000
2152 ; VI-NEXT: s_mov_b32 s2, -1
2153 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2154 ; VI-NEXT: s_lshl_b32 s0, s0, 1
2155 ; VI-NEXT: v_mov_b32_e32 v0, s0
2156 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2157 ; VI-NEXT: s_waitcnt vmcnt(0)
2160 ; EG-LABEL: test_mul2:
2162 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
2163 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2166 ; EG-NEXT: ALU clause starting at 4:
2167 ; EG-NEXT: MOV T0.X, literal.x,
2168 ; EG-NEXT: LSHL * T1.X, KC0[2].Y, 1,
2169 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
2170 %i = mul i32 %p, 2
2171 store volatile i32 %i, i32 addrspace(1)* undef
2175 define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
2176 ; SI-LABEL: shl_or_k:
2178 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2179 ; SI-NEXT: s_mov_b32 s6, 0
2180 ; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v2
2181 ; SI-NEXT: s_mov_b32 s7, 0xf000
2182 ; SI-NEXT: s_mov_b32 s4, s6
2183 ; SI-NEXT: s_mov_b32 s5, s6
2184 ; SI-NEXT: v_or_b32_e32 v2, 4, v2
2185 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
2186 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2187 ; SI-NEXT: s_setpc_b64 s[30:31]
2189 ; VI-LABEL: shl_or_k:
2191 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2192 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v2
2193 ; VI-NEXT: v_or_b32_e32 v2, 4, v2
2194 ; VI-NEXT: flat_store_dword v[0:1], v2
2195 ; VI-NEXT: s_waitcnt vmcnt(0)
2196 ; VI-NEXT: s_setpc_b64 s[30:31]
2198 ; EG-LABEL: shl_or_k:
2200 ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
2201 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2204 ; EG-NEXT: ALU clause starting at 4:
2205 ; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
2206 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2207 ; EG-NEXT: OR_INT T0.X, PV.W, literal.x,
2208 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
2209 ; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45)
2210 %tmp0 = or i32 %in, 1
2211 %tmp2 = shl i32 %tmp0, 2
2212 store i32 %tmp2, i32 addrspace(1)* %out
2216 define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
2217 ; SI-LABEL: shl_or_k_two_uses:
2219 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2220 ; SI-NEXT: s_mov_b32 s6, 0
2221 ; SI-NEXT: v_or_b32_e32 v4, 1, v4
2222 ; SI-NEXT: s_mov_b32 s7, 0xf000
2223 ; SI-NEXT: s_mov_b32 s4, s6
2224 ; SI-NEXT: s_mov_b32 s5, s6
2225 ; SI-NEXT: v_lshlrev_b32_e32 v5, 2, v4
2226 ; SI-NEXT: buffer_store_dword v5, v[0:1], s[4:7], 0 addr64
2227 ; SI-NEXT: buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
2228 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2229 ; SI-NEXT: s_setpc_b64 s[30:31]
2231 ; VI-LABEL: shl_or_k_two_uses:
2233 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2234 ; VI-NEXT: v_or_b32_e32 v4, 1, v4
2235 ; VI-NEXT: v_lshlrev_b32_e32 v5, 2, v4
2236 ; VI-NEXT: flat_store_dword v[0:1], v5
2237 ; VI-NEXT: flat_store_dword v[2:3], v4
2238 ; VI-NEXT: s_waitcnt vmcnt(0)
2239 ; VI-NEXT: s_setpc_b64 s[30:31]
2241 ; EG-LABEL: shl_or_k_two_uses:
2243 ; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
2244 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
2245 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2247 ; EG-NEXT: ALU clause starting at 4:
2248 ; EG-NEXT: LSHR T0.X, KC0[2].Z, literal.x,
2249 ; EG-NEXT: OR_INT * T1.X, KC0[2].W, 1,
2250 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2251 ; EG-NEXT: LSHL T2.X, PS, literal.x,
2252 ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
2253 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2254 %tmp0 = or i32 %in, 1
2255 %tmp2 = shl i32 %tmp0, 2
2256 store i32 %tmp2, i32 addrspace(1)* %out0
2257 store i32 %tmp0, i32 addrspace(1)* %out1
2261 attributes #0 = { nounwind readnone }