1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG
6 declare i32 @llvm.amdgcn.workitem.id.x() #0
; ashr <2 x i32>: per-lane arithmetic shift right. SI selects v_ashr_i32_e32,
; VI the operand-swapped v_ashrrev_i32_e32, and r600/EG the ASHR ALU op.
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
8 define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
9 ; SI-LABEL: ashr_v2i32:
11 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
12 ; SI-NEXT: s_mov_b32 s7, 0xf000
13 ; SI-NEXT: s_mov_b32 s6, -1
14 ; SI-NEXT: s_mov_b32 s10, s6
15 ; SI-NEXT: s_mov_b32 s11, s7
16 ; SI-NEXT: s_waitcnt lgkmcnt(0)
17 ; SI-NEXT: s_mov_b32 s8, s2
18 ; SI-NEXT: s_mov_b32 s9, s3
19 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
20 ; SI-NEXT: s_mov_b32 s4, s0
21 ; SI-NEXT: s_mov_b32 s5, s1
22 ; SI-NEXT: s_waitcnt vmcnt(0)
23 ; SI-NEXT: v_ashr_i32_e32 v1, v1, v3
24 ; SI-NEXT: v_ashr_i32_e32 v0, v0, v2
25 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
28 ; VI-LABEL: ashr_v2i32:
30 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
31 ; VI-NEXT: s_mov_b32 s7, 0xf000
32 ; VI-NEXT: s_mov_b32 s6, -1
33 ; VI-NEXT: s_mov_b32 s10, s6
34 ; VI-NEXT: s_mov_b32 s11, s7
35 ; VI-NEXT: s_waitcnt lgkmcnt(0)
36 ; VI-NEXT: s_mov_b32 s8, s2
37 ; VI-NEXT: s_mov_b32 s9, s3
38 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
39 ; VI-NEXT: s_mov_b32 s4, s0
40 ; VI-NEXT: s_mov_b32 s5, s1
41 ; VI-NEXT: s_waitcnt vmcnt(0)
42 ; VI-NEXT: v_ashrrev_i32_e32 v1, v3, v1
43 ; VI-NEXT: v_ashrrev_i32_e32 v0, v2, v0
44 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
47 ; EG-LABEL: ashr_v2i32:
49 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
51 ; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
52 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
55 ; EG-NEXT: Fetch clause starting at 6:
56 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
57 ; EG-NEXT: ALU clause starting at 8:
58 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
59 ; EG-NEXT: ALU clause starting at 9:
60 ; EG-NEXT: ASHR * T0.Y, T0.Y, T0.W,
61 ; EG-NEXT: ASHR T0.X, T0.X, T0.Z,
62 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
63 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: %b is the vector stored immediately after %a in %in, so one
; dwordx4 load fetches both operands.
64 %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
65 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
66 %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
67 %result = ashr <2 x i32> %a, %b
68 store <2 x i32> %result, <2 x i32> addrspace(1)* %out
; ashr <4 x i32>: as ashr_v2i32 but the two operands need two dwordx4 loads
; (offset 0 and 16); still one VALU shift per lane on SI/VI, ASHR per lane on EG.
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
72 define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
73 ; SI-LABEL: ashr_v4i32:
75 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
76 ; SI-NEXT: s_mov_b32 s7, 0xf000
77 ; SI-NEXT: s_mov_b32 s6, -1
78 ; SI-NEXT: s_mov_b32 s10, s6
79 ; SI-NEXT: s_mov_b32 s11, s7
80 ; SI-NEXT: s_waitcnt lgkmcnt(0)
81 ; SI-NEXT: s_mov_b32 s8, s2
82 ; SI-NEXT: s_mov_b32 s9, s3
83 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
84 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
85 ; SI-NEXT: s_mov_b32 s4, s0
86 ; SI-NEXT: s_mov_b32 s5, s1
87 ; SI-NEXT: s_waitcnt vmcnt(0)
88 ; SI-NEXT: v_ashr_i32_e32 v3, v3, v7
89 ; SI-NEXT: v_ashr_i32_e32 v2, v2, v6
90 ; SI-NEXT: v_ashr_i32_e32 v1, v1, v5
91 ; SI-NEXT: v_ashr_i32_e32 v0, v0, v4
92 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
95 ; VI-LABEL: ashr_v4i32:
97 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
98 ; VI-NEXT: s_mov_b32 s7, 0xf000
99 ; VI-NEXT: s_mov_b32 s6, -1
100 ; VI-NEXT: s_mov_b32 s10, s6
101 ; VI-NEXT: s_mov_b32 s11, s7
102 ; VI-NEXT: s_waitcnt lgkmcnt(0)
103 ; VI-NEXT: s_mov_b32 s8, s2
104 ; VI-NEXT: s_mov_b32 s9, s3
105 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
106 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
107 ; VI-NEXT: s_mov_b32 s4, s0
108 ; VI-NEXT: s_mov_b32 s5, s1
109 ; VI-NEXT: s_waitcnt vmcnt(0)
110 ; VI-NEXT: v_ashrrev_i32_e32 v3, v7, v3
111 ; VI-NEXT: v_ashrrev_i32_e32 v2, v6, v2
112 ; VI-NEXT: v_ashrrev_i32_e32 v1, v5, v1
113 ; VI-NEXT: v_ashrrev_i32_e32 v0, v4, v0
114 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
117 ; EG-LABEL: ashr_v4i32:
119 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
121 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
122 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
125 ; EG-NEXT: Fetch clause starting at 6:
126 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
127 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
128 ; EG-NEXT: ALU clause starting at 10:
129 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
130 ; EG-NEXT: ALU clause starting at 11:
131 ; EG-NEXT: ASHR * T0.W, T0.W, T1.W,
132 ; EG-NEXT: ASHR * T0.Z, T0.Z, T1.Z,
133 ; EG-NEXT: ASHR * T0.Y, T0.Y, T1.Y,
134 ; EG-NEXT: ASHR T0.X, T0.X, T1.X,
135 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
136 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: %b is the <4 x i32> stored immediately after %a in %in.
137 %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
138 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
139 %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
140 %result = ashr <4 x i32> %a, %b
141 store <4 x i32> %result, <4 x i32> addrspace(1)* %out
145 ; FIXME: The ashr operation is uniform, but because its operands come from a
146 ; global load we end up with the vector instructions rather than scalar.
; ashr <2 x i16>: SI has no native 16-bit shifts, so each lane is sign-extended
; (v_bfe_i32 / shift by 16), shifted as i32, then repacked with lshl/and/or.
; VI uses SDWA sub-dword operands (v_ashrrev_i32_sdwa with sext() selects).
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
147 define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
148 ; SI-LABEL: ashr_v2i16:
150 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
151 ; SI-NEXT: s_mov_b32 s7, 0xf000
152 ; SI-NEXT: s_mov_b32 s6, -1
153 ; SI-NEXT: s_mov_b32 s10, s6
154 ; SI-NEXT: s_mov_b32 s11, s7
155 ; SI-NEXT: s_waitcnt lgkmcnt(0)
156 ; SI-NEXT: s_mov_b32 s8, s2
157 ; SI-NEXT: s_mov_b32 s9, s3
158 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
159 ; SI-NEXT: s_mov_b32 s4, s0
160 ; SI-NEXT: s_mov_b32 s5, s1
161 ; SI-NEXT: s_waitcnt vmcnt(0)
162 ; SI-NEXT: v_bfe_i32 v2, v0, 0, 16
163 ; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0
164 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
165 ; SI-NEXT: v_ashrrev_i32_e32 v0, v3, v0
166 ; SI-NEXT: v_ashrrev_i32_e32 v1, v1, v2
167 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
168 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
169 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
170 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
173 ; VI-LABEL: ashr_v2i16:
175 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
176 ; VI-NEXT: s_mov_b32 s7, 0xf000
177 ; VI-NEXT: s_mov_b32 s6, -1
178 ; VI-NEXT: s_mov_b32 s10, s6
179 ; VI-NEXT: s_mov_b32 s11, s7
180 ; VI-NEXT: s_waitcnt lgkmcnt(0)
181 ; VI-NEXT: s_mov_b32 s8, s2
182 ; VI-NEXT: s_mov_b32 s9, s3
183 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
184 ; VI-NEXT: s_mov_b32 s4, s0
185 ; VI-NEXT: s_mov_b32 s5, s1
186 ; VI-NEXT: s_waitcnt vmcnt(0)
187 ; VI-NEXT: v_ashrrev_i32_sdwa v2, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
188 ; VI-NEXT: v_ashrrev_i32_sdwa v0, sext(v1), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
189 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
190 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
193 ; EG-LABEL: ashr_v2i16:
195 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
197 ; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
198 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
201 ; EG-NEXT: Fetch clause starting at 6:
202 ; EG-NEXT: VTX_READ_64 T6.XY, T6.X, 0, #1
203 ; EG-NEXT: ALU clause starting at 8:
204 ; EG-NEXT: MOV * T6.X, KC0[2].Z,
205 ; EG-NEXT: ALU clause starting at 9:
206 ; EG-NEXT: LSHR * T0.W, T6.X, literal.x,
207 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
208 ; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
209 ; EG-NEXT: LSHR T0.Z, T6.Y, literal.x,
210 ; EG-NEXT: BFE_INT T0.W, T6.X, 0.0, literal.x,
211 ; EG-NEXT: AND_INT * T1.W, T6.Y, literal.y,
212 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
213 ; EG-NEXT: ASHR T0.W, PV.W, PS,
214 ; EG-NEXT: ASHR * T1.W, PV.Y, PV.Z,
215 ; EG-NEXT: LSHL T1.W, PS, literal.x,
216 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
217 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
218 ; EG-NEXT: OR_INT T6.X, PS, PV.W,
219 ; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
220 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: %b is the <2 x i16> stored immediately after %a in %in.
221 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
222 %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
223 %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
224 %result = ashr <2 x i16> %a, %b
225 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
229 ; FIXME: The ashr operation is uniform, but because its operands come from a
230 ; global load we end up with the vector instructions rather than scalar.
; ashr <4 x i16>: same per-lane sign-extend/shift/repack pattern as ashr_v2i16
; applied to both dwords; VI again uses SDWA. The long EG sequence legalizes
; the sub-dword lanes through scratch temporaries (T4.X-T7.X MOVs).
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
231 define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
232 ; SI-LABEL: ashr_v4i16:
234 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
235 ; SI-NEXT: s_mov_b32 s7, 0xf000
236 ; SI-NEXT: s_mov_b32 s6, -1
237 ; SI-NEXT: s_mov_b32 s10, s6
238 ; SI-NEXT: s_mov_b32 s11, s7
239 ; SI-NEXT: s_waitcnt lgkmcnt(0)
240 ; SI-NEXT: s_mov_b32 s8, s2
241 ; SI-NEXT: s_mov_b32 s9, s3
242 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
243 ; SI-NEXT: s_mov_b32 s4, s0
244 ; SI-NEXT: s_mov_b32 s5, s1
245 ; SI-NEXT: s_waitcnt vmcnt(0)
246 ; SI-NEXT: v_bfe_i32 v4, v0, 0, 16
247 ; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0
248 ; SI-NEXT: v_bfe_i32 v5, v1, 0, 16
249 ; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1
250 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
251 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
252 ; SI-NEXT: v_ashr_i32_e32 v1, v1, v7
253 ; SI-NEXT: v_ashr_i32_e32 v3, v5, v3
254 ; SI-NEXT: v_ashr_i32_e32 v0, v0, v6
255 ; SI-NEXT: v_ashr_i32_e32 v2, v4, v2
256 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
257 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
258 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
259 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
260 ; SI-NEXT: v_or_b32_e32 v1, v3, v1
261 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
262 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
265 ; VI-LABEL: ashr_v4i16:
267 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
268 ; VI-NEXT: s_mov_b32 s7, 0xf000
269 ; VI-NEXT: s_mov_b32 s6, -1
270 ; VI-NEXT: s_mov_b32 s10, s6
271 ; VI-NEXT: s_mov_b32 s11, s7
272 ; VI-NEXT: s_waitcnt lgkmcnt(0)
273 ; VI-NEXT: s_mov_b32 s8, s2
274 ; VI-NEXT: s_mov_b32 s9, s3
275 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
276 ; VI-NEXT: s_mov_b32 s4, s0
277 ; VI-NEXT: s_mov_b32 s5, s1
278 ; VI-NEXT: s_waitcnt vmcnt(0)
279 ; VI-NEXT: v_ashrrev_i32_sdwa v4, sext(v2), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
280 ; VI-NEXT: v_ashrrev_i32_sdwa v0, sext(v2), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
281 ; VI-NEXT: v_ashrrev_i32_sdwa v2, sext(v3), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
282 ; VI-NEXT: v_ashrrev_i32_sdwa v1, sext(v3), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
283 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
284 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
285 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
288 ; EG-LABEL: ashr_v4i16:
290 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
292 ; EG-NEXT: ALU 58, @9, KC0[CB0:0-32], KC1[]
293 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
296 ; EG-NEXT: Fetch clause starting at 6:
297 ; EG-NEXT: VTX_READ_128 T9.XYZW, T9.X, 0, #1
298 ; EG-NEXT: ALU clause starting at 8:
299 ; EG-NEXT: MOV * T9.X, KC0[2].Z,
300 ; EG-NEXT: ALU clause starting at 9:
301 ; EG-NEXT: MOV T4.X, T9.X,
302 ; EG-NEXT: MOV * T5.X, T9.Y,
303 ; EG-NEXT: MOV T0.Y, PV.X,
304 ; EG-NEXT: MOV * T0.Z, PS,
305 ; EG-NEXT: MOV T2.X, T9.Z,
306 ; EG-NEXT: MOV * T3.X, T9.W,
307 ; EG-NEXT: MOV * T0.W, T6.X,
308 ; EG-NEXT: MOV T1.Y, T2.X,
309 ; EG-NEXT: BFE_INT * T1.W, T0.Y, 0.0, literal.x,
310 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
311 ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
312 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
313 ; EG-NEXT: ASHR * T1.W, T1.W, PV.W,
314 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
315 ; EG-NEXT: AND_INT * T0.W, T0.W, literal.y,
316 ; EG-NEXT: 65535(9.183409e-41), -65536(nan)
317 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
318 ; EG-NEXT: MOV * T1.Z, T3.X,
319 ; EG-NEXT: MOV * T6.X, T0.W,
320 ; EG-NEXT: MOV T0.W, PV.X,
321 ; EG-NEXT: LSHR * T1.W, T0.Y, literal.x,
322 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
323 ; EG-NEXT: BFE_INT T1.W, PS, 0.0, literal.x,
324 ; EG-NEXT: LSHR * T2.W, T1.Y, literal.x,
325 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
326 ; EG-NEXT: ASHR T1.W, PV.W, PS,
327 ; EG-NEXT: AND_INT * T0.W, T0.W, literal.x,
328 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
329 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
330 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
331 ; EG-NEXT: OR_INT * T0.W, T0.W, PV.W,
332 ; EG-NEXT: MOV T6.X, PV.W,
333 ; EG-NEXT: MOV T0.Y, T7.X,
334 ; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x,
335 ; EG-NEXT: AND_INT * T1.W, T1.Z, literal.y,
336 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
337 ; EG-NEXT: ASHR T0.W, PV.W, PS,
338 ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
339 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
340 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
341 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
342 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
343 ; EG-NEXT: MOV * T7.X, PV.W,
344 ; EG-NEXT: MOV T0.Y, PV.X,
345 ; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
346 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
347 ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
348 ; EG-NEXT: LSHR * T1.W, T1.Z, literal.x,
349 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
350 ; EG-NEXT: ASHR T0.W, PV.W, PS,
351 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
352 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
353 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
354 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
355 ; EG-NEXT: LSHR T9.X, KC0[2].Y, literal.x,
356 ; EG-NEXT: OR_INT * T10.Y, T1.W, PV.W,
357 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
358 ; EG-NEXT: MOV T7.X, PV.Y,
359 ; EG-NEXT: MOV * T10.X, T6.X,
; IR under test: %b is the <4 x i16> stored immediately after %a in %in.
360 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
361 %a = load <4 x i16>, <4 x i16> addrspace(1)* %in
362 %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
363 %result = ashr <4 x i16> %a, %b
364 store <4 x i16> %result, <4 x i16> addrspace(1)* %out
; ashr of (sext i32 to i64) by a constant 8: stays on the scalar unit —
; s_ashr_i32 materializes the sign-extended high half, then s_ashr_i64 shifts.
; EG folds the shift into BIT_ALIGN_INT with the sign word.
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
368 define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
369 ; SI-LABEL: s_ashr_i64:
370 ; SI: ; %bb.0: ; %entry
371 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
372 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
373 ; SI-NEXT: s_mov_b32 s3, 0xf000
374 ; SI-NEXT: s_mov_b32 s2, -1
375 ; SI-NEXT: s_waitcnt lgkmcnt(0)
376 ; SI-NEXT: s_ashr_i32 s5, s4, 31
377 ; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 8
378 ; SI-NEXT: v_mov_b32_e32 v0, s4
379 ; SI-NEXT: v_mov_b32_e32 v1, s5
380 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
383 ; VI-LABEL: s_ashr_i64:
384 ; VI: ; %bb.0: ; %entry
385 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
386 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
387 ; VI-NEXT: s_mov_b32 s3, 0xf000
388 ; VI-NEXT: s_mov_b32 s2, -1
389 ; VI-NEXT: s_waitcnt lgkmcnt(0)
390 ; VI-NEXT: s_ashr_i32 s5, s4, 31
391 ; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 8
392 ; VI-NEXT: v_mov_b32_e32 v0, s4
393 ; VI-NEXT: v_mov_b32_e32 v1, s5
394 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
397 ; EG-LABEL: s_ashr_i64:
398 ; EG: ; %bb.0: ; %entry
399 ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
400 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
403 ; EG-NEXT: ALU clause starting at 4:
404 ; EG-NEXT: ASHR * T0.Y, KC0[2].Z, literal.x,
405 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
406 ; EG-NEXT: BIT_ALIGN_INT T0.X, PV.Y, KC0[2].Z, literal.x,
407 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
408 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
; IR under test (entry block): a uniform kernel-argument shift.
410 %in.ext = sext i32 %in to i64
411 %ashr = ashr i64 %in.ext, 8
412 store i64 %ashr, i64 addrspace(1)* %out
; Variable 64-bit ashr: SI keeps the 3-operand v_ashr_i64, VI the reversed
; v_ashrrev_i64. EG has no 64-bit shift and expands via BIT_ALIGN_INT plus
; CNDE_INT selects on the shift-amount bit 5 (the "shift >= 32" case).
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
416 define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
417 ; SI-LABEL: ashr_i64_2:
418 ; SI: ; %bb.0: ; %entry
419 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
420 ; SI-NEXT: s_mov_b32 s7, 0xf000
421 ; SI-NEXT: s_mov_b32 s6, -1
422 ; SI-NEXT: s_mov_b32 s10, s6
423 ; SI-NEXT: s_mov_b32 s11, s7
424 ; SI-NEXT: s_waitcnt lgkmcnt(0)
425 ; SI-NEXT: s_mov_b32 s8, s2
426 ; SI-NEXT: s_mov_b32 s9, s3
427 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
428 ; SI-NEXT: s_mov_b32 s4, s0
429 ; SI-NEXT: s_mov_b32 s5, s1
430 ; SI-NEXT: s_waitcnt vmcnt(0)
431 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v2
432 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
435 ; VI-LABEL: ashr_i64_2:
436 ; VI: ; %bb.0: ; %entry
437 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
438 ; VI-NEXT: s_mov_b32 s7, 0xf000
439 ; VI-NEXT: s_mov_b32 s6, -1
440 ; VI-NEXT: s_mov_b32 s10, s6
441 ; VI-NEXT: s_mov_b32 s11, s7
442 ; VI-NEXT: s_waitcnt lgkmcnt(0)
443 ; VI-NEXT: s_mov_b32 s8, s2
444 ; VI-NEXT: s_mov_b32 s9, s3
445 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
446 ; VI-NEXT: s_mov_b32 s4, s0
447 ; VI-NEXT: s_mov_b32 s5, s1
448 ; VI-NEXT: s_waitcnt vmcnt(0)
449 ; VI-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1]
450 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
453 ; EG-LABEL: ashr_i64_2:
454 ; EG: ; %bb.0: ; %entry
455 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
457 ; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
458 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
461 ; EG-NEXT: Fetch clause starting at 6:
462 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
463 ; EG-NEXT: ALU clause starting at 8:
464 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
465 ; EG-NEXT: ALU clause starting at 9:
466 ; EG-NEXT: AND_INT * T0.W, T0.Z, literal.x,
467 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
468 ; EG-NEXT: ASHR T1.Z, T0.Y, PV.W,
469 ; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z,
470 ; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x,
471 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
472 ; EG-NEXT: CNDE_INT T0.X, PS, PV.W, PV.Z,
473 ; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
474 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
475 ; EG-NEXT: 31(4.344025e-44), 2(2.802597e-45)
476 ; EG-NEXT: CNDE_INT * T0.Y, T1.W, T1.Z, PV.W,
; IR under test: %b is the i64 stored immediately after %a in %in.
478 %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
479 %a = load i64, i64 addrspace(1)* %in
480 %b = load i64, i64 addrspace(1)* %b_ptr
481 %result = ashr i64 %a, %b
482 store i64 %result, i64 addrspace(1)* %out
; ashr <2 x i64>: one v_ashr_i64 / v_ashrrev_i64 per element on SI/VI.
; EG expands each element with the BIT_ALIGN_INT + CNDE_INT sequence as in
; ashr_i64_2, interleaved across both elements.
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
486 define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
487 ; SI-LABEL: ashr_v2i64:
489 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
490 ; SI-NEXT: s_mov_b32 s7, 0xf000
491 ; SI-NEXT: s_mov_b32 s6, -1
492 ; SI-NEXT: s_mov_b32 s10, s6
493 ; SI-NEXT: s_mov_b32 s11, s7
494 ; SI-NEXT: s_waitcnt lgkmcnt(0)
495 ; SI-NEXT: s_mov_b32 s8, s2
496 ; SI-NEXT: s_mov_b32 s9, s3
497 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
498 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
499 ; SI-NEXT: s_mov_b32 s4, s0
500 ; SI-NEXT: s_mov_b32 s5, s1
501 ; SI-NEXT: s_waitcnt vmcnt(0)
502 ; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v6
503 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v4
504 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
507 ; VI-LABEL: ashr_v2i64:
509 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
510 ; VI-NEXT: s_mov_b32 s7, 0xf000
511 ; VI-NEXT: s_mov_b32 s6, -1
512 ; VI-NEXT: s_mov_b32 s10, s6
513 ; VI-NEXT: s_mov_b32 s11, s7
514 ; VI-NEXT: s_waitcnt lgkmcnt(0)
515 ; VI-NEXT: s_mov_b32 s8, s2
516 ; VI-NEXT: s_mov_b32 s9, s3
517 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
518 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
519 ; VI-NEXT: s_mov_b32 s4, s0
520 ; VI-NEXT: s_mov_b32 s5, s1
521 ; VI-NEXT: s_waitcnt vmcnt(0)
522 ; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3]
523 ; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1]
524 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
527 ; EG-LABEL: ashr_v2i64:
529 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
531 ; EG-NEXT: ALU 19, @11, KC0[CB0:0-32], KC1[]
532 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
535 ; EG-NEXT: Fetch clause starting at 6:
536 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
537 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
538 ; EG-NEXT: ALU clause starting at 10:
539 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
540 ; EG-NEXT: ALU clause starting at 11:
541 ; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x,
542 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
543 ; EG-NEXT: ASHR T1.Y, T0.W, PV.W,
544 ; EG-NEXT: AND_INT T2.Z, T1.Z, literal.x,
545 ; EG-NEXT: BIT_ALIGN_INT T1.W, T0.W, T0.Z, T1.Z,
546 ; EG-NEXT: AND_INT * T2.W, T1.X, literal.y,
547 ; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
548 ; EG-NEXT: ASHR T2.Y, T0.Y, PS,
549 ; EG-NEXT: CNDE_INT T0.Z, PV.Z, PV.W, PV.Y,
550 ; EG-NEXT: BIT_ALIGN_INT T1.W, T0.Y, T0.X, T1.X,
551 ; EG-NEXT: AND_INT * T2.W, T1.X, literal.x,
552 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
553 ; EG-NEXT: CNDE_INT T0.X, PS, PV.W, PV.Y,
554 ; EG-NEXT: ASHR T0.W, T0.W, literal.x,
555 ; EG-NEXT: ASHR * T1.W, T0.Y, literal.x,
556 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
557 ; EG-NEXT: CNDE_INT * T0.W, T2.Z, T1.Y, PV.W,
558 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
559 ; EG-NEXT: CNDE_INT * T0.Y, T2.W, T2.Y, T1.W,
560 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: %b is the <2 x i64> stored immediately after %a in %in.
561 %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
562 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
563 %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
564 %result = ashr <2 x i64> %a, %b
565 store <2 x i64> %result, <2 x i64> addrspace(1)* %out
569 ; FIXME: Broken on r600
; ashr <4 x i64>: four 64-bit shifts; SI/VI need four loads (operands span
; 64 bytes) and four v_ashr(rev)_i64 ops. The EG expansion repeats the
; BIT_ALIGN_INT/CNDE_INT pattern per element (see FIXME above: broken on r600).
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
570 define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
571 ; SI-LABEL: ashr_v4i64:
573 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
574 ; SI-NEXT: s_mov_b32 s3, 0xf000
575 ; SI-NEXT: s_mov_b32 s2, -1
576 ; SI-NEXT: s_mov_b32 s10, s2
577 ; SI-NEXT: s_mov_b32 s11, s3
578 ; SI-NEXT: s_waitcnt lgkmcnt(0)
579 ; SI-NEXT: s_mov_b32 s8, s6
580 ; SI-NEXT: s_mov_b32 s9, s7
581 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
582 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
583 ; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
584 ; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
585 ; SI-NEXT: s_mov_b32 s0, s4
586 ; SI-NEXT: s_mov_b32 s1, s5
587 ; SI-NEXT: s_waitcnt vmcnt(2)
588 ; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v6
589 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v4
590 ; SI-NEXT: s_waitcnt vmcnt(0)
591 ; SI-NEXT: v_ashr_i64 v[9:10], v[9:10], v13
592 ; SI-NEXT: v_ashr_i64 v[7:8], v[7:8], v11
593 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
594 ; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
597 ; VI-LABEL: ashr_v4i64:
599 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
600 ; VI-NEXT: s_mov_b32 s3, 0xf000
601 ; VI-NEXT: s_mov_b32 s2, -1
602 ; VI-NEXT: s_mov_b32 s10, s2
603 ; VI-NEXT: s_mov_b32 s11, s3
604 ; VI-NEXT: s_waitcnt lgkmcnt(0)
605 ; VI-NEXT: s_mov_b32 s8, s6
606 ; VI-NEXT: s_mov_b32 s9, s7
607 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
608 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
609 ; VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
610 ; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
611 ; VI-NEXT: s_mov_b32 s0, s4
612 ; VI-NEXT: s_mov_b32 s1, s5
613 ; VI-NEXT: s_waitcnt vmcnt(2)
614 ; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3]
615 ; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1]
616 ; VI-NEXT: s_waitcnt vmcnt(0)
617 ; VI-NEXT: v_ashrrev_i64 v[9:10], v13, v[9:10]
618 ; VI-NEXT: v_ashrrev_i64 v[7:8], v11, v[7:8]
619 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
620 ; VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
623 ; EG-LABEL: ashr_v4i64:
625 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
627 ; EG-NEXT: ALU 39, @15, KC0[CB0:0-32], KC1[]
628 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
629 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
631 ; EG-NEXT: Fetch clause starting at 6:
632 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1
633 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 48, #1
634 ; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 0, #1
635 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
636 ; EG-NEXT: ALU clause starting at 14:
637 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
638 ; EG-NEXT: ALU clause starting at 15:
639 ; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x,
640 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
641 ; EG-NEXT: ASHR T1.Y, T0.W, literal.x,
642 ; EG-NEXT: ASHR T4.Z, T3.W, PV.W, BS:VEC_120/SCL_212
643 ; EG-NEXT: AND_INT T1.W, T1.Z, literal.y,
644 ; EG-NEXT: AND_INT * T2.W, T2.Z, literal.x,
645 ; EG-NEXT: 31(4.344025e-44), 32(4.484155e-44)
646 ; EG-NEXT: BIT_ALIGN_INT T4.X, T3.W, T3.Z, T1.Z,
647 ; EG-NEXT: ASHR T2.Y, T0.W, PS, BS:VEC_120/SCL_212
648 ; EG-NEXT: AND_INT * T1.Z, T2.Z, literal.x,
649 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
650 ; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, T2.Z,
651 ; EG-NEXT: AND_INT * T2.W, T2.X, literal.x,
652 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
653 ; EG-NEXT: AND_INT T5.X, T1.X, literal.x,
654 ; EG-NEXT: ASHR T4.Y, T0.Y, PS,
655 ; EG-NEXT: CNDE_INT T0.Z, T1.Z, PV.W, T2.Y,
656 ; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, T2.X,
657 ; EG-NEXT: AND_INT * T2.W, T2.X, literal.y,
658 ; EG-NEXT: 31(4.344025e-44), 32(4.484155e-44)
659 ; EG-NEXT: CNDE_INT T0.X, PS, PV.W, PV.Y,
660 ; EG-NEXT: ASHR T5.Y, T3.Y, PV.X,
661 ; EG-NEXT: CNDE_INT T2.Z, T1.W, T4.X, T4.Z,
662 ; EG-NEXT: BIT_ALIGN_INT T0.W, T3.Y, T3.X, T1.X, BS:VEC_102/SCL_221
663 ; EG-NEXT: AND_INT * T4.W, T1.X, literal.x,
664 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
665 ; EG-NEXT: CNDE_INT T2.X, PS, PV.W, PV.Y,
666 ; EG-NEXT: ASHR T6.Y, T3.W, literal.x,
667 ; EG-NEXT: ASHR T3.Z, T0.Y, literal.x, BS:VEC_201
668 ; EG-NEXT: ADD_INT T3.W, KC0[2].Y, literal.y,
669 ; EG-NEXT: CNDE_INT * T0.W, T1.Z, T2.Y, T1.Y,
670 ; EG-NEXT: 31(4.344025e-44), 16(2.242078e-44)
671 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
672 ; EG-NEXT: CNDE_INT T0.Y, T2.W, T4.Y, PV.Z,
673 ; EG-NEXT: ASHR T3.W, T3.Y, literal.y,
674 ; EG-NEXT: CNDE_INT * T2.W, T1.W, T4.Z, PV.Y,
675 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
676 ; EG-NEXT: LSHR T3.X, KC0[2].Y, literal.x,
677 ; EG-NEXT: CNDE_INT * T2.Y, T4.W, T5.Y, PV.W,
678 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: %b is the <4 x i64> stored immediately after %a in %in.
679 %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
680 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
681 %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
682 %result = ashr <4 x i64> %a, %b
683 store <4 x i64> %result, <4 x i64> addrspace(1)* %out
; ashr i64 by exactly 32: the low result word is the old high word and the
; high result word is its sign (s_ashr_i32 s7, s6, 31) — no 64-bit shift is
; emitted; the result feeds straight into the add. The [8 x i32] padding args
; force distinct constant-buffer offsets for %a and %b.
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
687 define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
688 ; SI-LABEL: s_ashr_32_i64:
690 ; SI-NEXT: s_load_dword s6, s[0:1], 0x14
691 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x1d
692 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
693 ; SI-NEXT: s_mov_b32 s3, 0xf000
694 ; SI-NEXT: s_mov_b32 s2, -1
695 ; SI-NEXT: s_waitcnt lgkmcnt(0)
696 ; SI-NEXT: s_ashr_i32 s7, s6, 31
697 ; SI-NEXT: s_add_u32 s4, s6, s4
698 ; SI-NEXT: s_addc_u32 s5, s7, s5
699 ; SI-NEXT: v_mov_b32_e32 v0, s4
700 ; SI-NEXT: v_mov_b32_e32 v1, s5
701 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
704 ; VI-LABEL: s_ashr_32_i64:
706 ; VI-NEXT: s_load_dword s6, s[0:1], 0x50
707 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
708 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
709 ; VI-NEXT: s_mov_b32 s3, 0xf000
710 ; VI-NEXT: s_mov_b32 s2, -1
711 ; VI-NEXT: s_waitcnt lgkmcnt(0)
712 ; VI-NEXT: s_ashr_i32 s7, s6, 31
713 ; VI-NEXT: s_add_u32 s4, s6, s4
714 ; VI-NEXT: s_addc_u32 s5, s7, s5
715 ; VI-NEXT: v_mov_b32_e32 v0, s4
716 ; VI-NEXT: v_mov_b32_e32 v1, s5
717 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
720 ; EG-LABEL: s_ashr_32_i64:
722 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
723 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
726 ; EG-NEXT: ALU clause starting at 4:
727 ; EG-NEXT: ASHR * T0.W, KC0[5].X, literal.x,
728 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
729 ; EG-NEXT: ADD_INT * T0.W, PV.W, KC0[7].Z,
730 ; EG-NEXT: ADDC_UINT * T1.W, KC0[5].X, KC0[7].Y,
731 ; EG-NEXT: ADD_INT * T0.Y, T0.W, PV.W,
732 ; EG-NEXT: ADD_INT * T0.X, KC0[5].X, KC0[7].Y,
733 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
734 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: the add keeps the shifted value live so it is not folded away.
735 %result = ashr i64 %a, 32
736 %add = add i64 %result, %b
737 store i64 %add, i64 addrspace(1)* %out
; Per-thread ashr i64 by 32: only the high dword is loaded (offset:4 / +4) and
; sign-extended with v_ashrrev_i32 ..., 31 — the low half of the source is
; never read. Addressing is indexed by workitem id (lshl 3 = * sizeof(i64)).
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
741 define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
742 ; SI-LABEL: v_ashr_32_i64:
744 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
745 ; SI-NEXT: s_mov_b32 s7, 0xf000
746 ; SI-NEXT: s_mov_b32 s6, 0
747 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
748 ; SI-NEXT: v_mov_b32_e32 v1, 0
749 ; SI-NEXT: s_waitcnt lgkmcnt(0)
750 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
751 ; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
752 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
753 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
754 ; SI-NEXT: s_waitcnt vmcnt(0)
755 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
756 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
759 ; VI-LABEL: v_ashr_32_i64:
761 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
762 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
763 ; VI-NEXT: s_waitcnt lgkmcnt(0)
764 ; VI-NEXT: v_mov_b32_e32 v0, s3
765 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
766 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
767 ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
768 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
769 ; VI-NEXT: flat_load_dword v0, v[0:1]
770 ; VI-NEXT: v_mov_b32_e32 v1, s1
771 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
772 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
773 ; VI-NEXT: s_waitcnt vmcnt(0)
774 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
775 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
778 ; EG-LABEL: v_ashr_32_i64:
780 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
782 ; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
783 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
786 ; EG-NEXT: Fetch clause starting at 6:
787 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1
788 ; EG-NEXT: ALU clause starting at 8:
789 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
790 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
791 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
792 ; EG-NEXT: ALU clause starting at 11:
793 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
794 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
795 ; EG-NEXT: ASHR * T0.Y, T0.X, literal.y,
796 ; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
; IR under test: divergent address (per workitem) keeps the shift on the VALU.
797 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
798 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
799 %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
800 %a = load i64, i64 addrspace(1)* %gep.in
801 %result = ashr i64 %a, 32
802 store i64 %result, i64 addrspace(1)* %gep.out
; ashr i64 by 63 broadcasts the sign bit: a single s_ashr_i32 s6, s6, 31 is
; reused for both result halves (compare s_ashr_32_i64, which needs s7).
; The [8 x i32] padding args force distinct constant-buffer offsets.
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
806 define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
807 ; SI-LABEL: s_ashr_63_i64:
809 ; SI-NEXT: s_load_dword s6, s[0:1], 0x14
810 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x1d
811 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
812 ; SI-NEXT: s_mov_b32 s3, 0xf000
813 ; SI-NEXT: s_mov_b32 s2, -1
814 ; SI-NEXT: s_waitcnt lgkmcnt(0)
815 ; SI-NEXT: s_ashr_i32 s6, s6, 31
816 ; SI-NEXT: s_add_u32 s4, s6, s4
817 ; SI-NEXT: s_addc_u32 s5, s6, s5
818 ; SI-NEXT: v_mov_b32_e32 v0, s4
819 ; SI-NEXT: v_mov_b32_e32 v1, s5
820 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
823 ; VI-LABEL: s_ashr_63_i64:
825 ; VI-NEXT: s_load_dword s6, s[0:1], 0x50
826 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
827 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
828 ; VI-NEXT: s_mov_b32 s3, 0xf000
829 ; VI-NEXT: s_mov_b32 s2, -1
830 ; VI-NEXT: s_waitcnt lgkmcnt(0)
831 ; VI-NEXT: s_ashr_i32 s6, s6, 31
832 ; VI-NEXT: s_add_u32 s4, s6, s4
833 ; VI-NEXT: s_addc_u32 s5, s6, s5
834 ; VI-NEXT: v_mov_b32_e32 v0, s4
835 ; VI-NEXT: v_mov_b32_e32 v1, s5
836 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
839 ; EG-LABEL: s_ashr_63_i64:
841 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
842 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
845 ; EG-NEXT: ALU clause starting at 4:
846 ; EG-NEXT: ASHR * T0.W, KC0[5].X, literal.x,
847 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
848 ; EG-NEXT: ADD_INT T1.W, PV.W, KC0[7].Z,
849 ; EG-NEXT: ADDC_UINT * T2.W, PV.W, KC0[7].Y,
850 ; EG-NEXT: ADD_INT * T0.Y, PV.W, PS,
851 ; EG-NEXT: ADD_INT T0.X, T0.W, KC0[7].Y,
852 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
853 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: the add keeps the shifted value live so it is not folded away.
854 %result = ashr i64 %a, 63
855 %add = add i64 %result, %b
856 store i64 %add, i64 addrspace(1)* %out
; Per-thread ashr i64 by 63: loads only the high dword, computes the sign with
; v_ashrrev_i32 ..., 31, and copies it into the other half (v_mov of v2 into
; v3) so both result words are the replicated sign bit.
; CHECK lines are autogenerated (update_llc_test_checks.py) — do not hand-edit.
860 define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
861 ; SI-LABEL: v_ashr_63_i64:
863 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
864 ; SI-NEXT: s_mov_b32 s7, 0xf000
865 ; SI-NEXT: s_mov_b32 s6, 0
866 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
867 ; SI-NEXT: v_mov_b32_e32 v1, 0
868 ; SI-NEXT: s_waitcnt lgkmcnt(0)
869 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
870 ; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
871 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
872 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
873 ; SI-NEXT: s_waitcnt vmcnt(0)
874 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v2
875 ; SI-NEXT: v_mov_b32_e32 v3, v2
876 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
879 ; VI-LABEL: v_ashr_63_i64:
881 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
882 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
883 ; VI-NEXT: s_waitcnt lgkmcnt(0)
884 ; VI-NEXT: v_mov_b32_e32 v0, s3
885 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
886 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
887 ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
888 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
889 ; VI-NEXT: flat_load_dword v3, v[0:1]
890 ; VI-NEXT: v_mov_b32_e32 v1, s1
891 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
892 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
893 ; VI-NEXT: s_waitcnt vmcnt(0)
894 ; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v3
895 ; VI-NEXT: v_mov_b32_e32 v3, v2
896 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
899 ; EG-LABEL: v_ashr_63_i64:
901 ; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
903 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
904 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
907 ; EG-NEXT: Fetch clause starting at 6:
908 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1
909 ; EG-NEXT: ALU clause starting at 8:
910 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
911 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
912 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
913 ; EG-NEXT: ALU clause starting at 11:
914 ; EG-NEXT: ASHR T0.X, T0.X, literal.x,
915 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
916 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
917 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
918 ; EG-NEXT: MOV * T0.Y, PV.X,
919 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: divergent address (per workitem) keeps the shift on the VALU.
920 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
921 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
922 %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
923 %a = load i64, i64 addrspace(1)* %gep.in
924 %result = ashr i64 %a, 63
925 store i64 %result, i64 addrspace(1)* %gep.out
929 attributes #0 = { nounwind readnone }