1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
4 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
5 ; RUN: llc < %s -mtriple=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600
6 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
7 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
9 declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
10 declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
11 declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
; Scalar case with a variable shift amount: @llvm.fshl.i32 is applied to the
; i32 kernel arguments %x, %y and %z, and the result is stored to %in.
; The autogenerated assertions below expect the amdgcn targets to emit two
; v_alignbit_b32 instructions fed by an s_lshr_b32 by 1 and an s_not_b32 of
; the shift amount, and R600 to emit the analogous BIT_ALIGN_INT / NOT_INT
; sequence. Regenerate the assertions with utils/update_llc_test_checks.py
; rather than editing them by hand.
13 define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) {
15 ; SI: ; %bb.0: ; %entry
16 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
17 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
18 ; SI-NEXT: s_mov_b32 s7, 0xf000
19 ; SI-NEXT: s_mov_b32 s6, -1
20 ; SI-NEXT: s_waitcnt lgkmcnt(0)
21 ; SI-NEXT: v_mov_b32_e32 v0, s1
22 ; SI-NEXT: s_lshr_b32 s1, s0, 1
23 ; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1
24 ; SI-NEXT: s_not_b32 s0, s2
25 ; SI-NEXT: v_mov_b32_e32 v1, s0
26 ; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1
27 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
31 ; VI: ; %bb.0: ; %entry
32 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
33 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
34 ; VI-NEXT: s_waitcnt lgkmcnt(0)
35 ; VI-NEXT: v_mov_b32_e32 v0, s1
36 ; VI-NEXT: s_not_b32 s2, s2
37 ; VI-NEXT: s_lshr_b32 s1, s0, 1
38 ; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1
39 ; VI-NEXT: v_mov_b32_e32 v1, s2
40 ; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1
41 ; VI-NEXT: v_mov_b32_e32 v0, s4
42 ; VI-NEXT: v_mov_b32_e32 v1, s5
43 ; VI-NEXT: flat_store_dword v[0:1], v2
46 ; GFX9-LABEL: fshl_i32:
47 ; GFX9: ; %bb.0: ; %entry
48 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
49 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
50 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
51 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
52 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
53 ; GFX9-NEXT: s_not_b32 s2, s2
54 ; GFX9-NEXT: s_lshr_b32 s1, s0, 1
55 ; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1
56 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
57 ; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2
58 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
61 ; R600-LABEL: fshl_i32:
62 ; R600: ; %bb.0: ; %entry
63 ; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
64 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
67 ; R600-NEXT: ALU clause starting at 4:
68 ; R600-NEXT: LSHR T0.Z, KC0[2].Z, 1,
69 ; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1,
70 ; R600-NEXT: NOT_INT * T1.W, KC0[3].X,
71 ; R600-NEXT: BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS,
72 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
73 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
75 ; GFX10-LABEL: fshl_i32:
76 ; GFX10: ; %bb.0: ; %entry
77 ; GFX10-NEXT: s_clause 0x1
78 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
79 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
80 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
81 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1
83 ; GFX10-NEXT: s_lshr_b32 s0, s0, 1
84 ; GFX10-NEXT: s_not_b32 s1, s2
85 ; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1
86 ; GFX10-NEXT: global_store_dword v1, v0, s[6:7]
87 ; GFX10-NEXT: s_endpgm
89 ; GFX11-LABEL: fshl_i32:
90 ; GFX11: ; %bb.0: ; %entry
91 ; GFX11-NEXT: s_clause 0x1
92 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
93 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
94 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
95 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1
97 ; GFX11-NEXT: s_lshr_b32 s0, s0, 1
98 ; GFX11-NEXT: s_not_b32 s1, s2
99 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
100 ; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1
101 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
102 ; GFX11-NEXT: s_endpgm
104 %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
105 store i32 %0, ptr addrspace(1) %in
; Scalar case with a constant shift amount of 7: @llvm.fshl.i32 is applied to
; the i32 kernel arguments %x and %y, and the result is stored to %in.
; The autogenerated assertions below expect one align-bit instruction per
; target (v_alignbit_b32 on amdgcn, BIT_ALIGN_INT on R600) whose last operand
; is the constant 25, i.e. 32 - 7.
109 define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
110 ; SI-LABEL: fshl_i32_imm:
111 ; SI: ; %bb.0: ; %entry
112 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
113 ; SI-NEXT: s_mov_b32 s7, 0xf000
114 ; SI-NEXT: s_mov_b32 s6, -1
115 ; SI-NEXT: s_waitcnt lgkmcnt(0)
116 ; SI-NEXT: v_mov_b32_e32 v0, s3
117 ; SI-NEXT: s_mov_b32 s4, s0
118 ; SI-NEXT: s_mov_b32 s5, s1
119 ; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25
120 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
123 ; VI-LABEL: fshl_i32_imm:
124 ; VI: ; %bb.0: ; %entry
125 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
126 ; VI-NEXT: s_waitcnt lgkmcnt(0)
127 ; VI-NEXT: v_mov_b32_e32 v0, s3
128 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25
129 ; VI-NEXT: v_mov_b32_e32 v0, s0
130 ; VI-NEXT: v_mov_b32_e32 v1, s1
131 ; VI-NEXT: flat_store_dword v[0:1], v2
134 ; GFX9-LABEL: fshl_i32_imm:
135 ; GFX9: ; %bb.0: ; %entry
136 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
137 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
138 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
139 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
140 ; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25
141 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
142 ; GFX9-NEXT: s_endpgm
144 ; R600-LABEL: fshl_i32_imm:
145 ; R600: ; %bb.0: ; %entry
146 ; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
147 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
150 ; R600-NEXT: ALU clause starting at 4:
151 ; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
152 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
153 ; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
154 ; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
156 ; GFX10-LABEL: fshl_i32_imm:
157 ; GFX10: ; %bb.0: ; %entry
158 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
159 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
160 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
161 ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25
162 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
163 ; GFX10-NEXT: s_endpgm
165 ; GFX11-LABEL: fshl_i32_imm:
166 ; GFX11: ; %bb.0: ; %entry
167 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
168 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
169 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
170 ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25
171 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
172 ; GFX11-NEXT: s_endpgm
174 %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
175 store i32 %0, ptr addrspace(1) %in
; <2 x i32> case with variable per-element shift amounts: @llvm.fshl.v2i32 is
; applied to the kernel arguments %x, %y and %z, and the vector result is
; stored to %in.
; The autogenerated assertions below expect, for each of the two elements, an
; align-bit by 1, a logical shift right by 1, a NOT of the shift amount, and a
; second align-bit that consumes those three values.
179 define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
180 ; SI-LABEL: fshl_v2i32:
181 ; SI: ; %bb.0: ; %entry
182 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
183 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
184 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
185 ; SI-NEXT: s_mov_b32 s11, 0xf000
186 ; SI-NEXT: s_mov_b32 s10, -1
187 ; SI-NEXT: s_waitcnt lgkmcnt(0)
188 ; SI-NEXT: v_mov_b32_e32 v0, s3
189 ; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1
190 ; SI-NEXT: s_not_b32 s3, s5
191 ; SI-NEXT: s_lshr_b32 s1, s1, 1
192 ; SI-NEXT: v_mov_b32_e32 v1, s3
193 ; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1
194 ; SI-NEXT: v_mov_b32_e32 v0, s2
195 ; SI-NEXT: s_not_b32 s1, s4
196 ; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1
197 ; SI-NEXT: s_lshr_b32 s0, s0, 1
198 ; SI-NEXT: v_mov_b32_e32 v2, s1
199 ; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2
200 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
203 ; VI-LABEL: fshl_v2i32:
204 ; VI: ; %bb.0: ; %entry
205 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
206 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
207 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
208 ; VI-NEXT: s_waitcnt lgkmcnt(0)
209 ; VI-NEXT: v_mov_b32_e32 v0, s3
210 ; VI-NEXT: s_not_b32 s7, s7
211 ; VI-NEXT: s_lshr_b32 s3, s1, 1
212 ; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1
213 ; VI-NEXT: v_mov_b32_e32 v1, s7
214 ; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1
215 ; VI-NEXT: v_mov_b32_e32 v0, s2
216 ; VI-NEXT: s_not_b32 s1, s6
217 ; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1
218 ; VI-NEXT: s_lshr_b32 s0, s0, 1
219 ; VI-NEXT: v_mov_b32_e32 v2, s1
220 ; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
221 ; VI-NEXT: v_mov_b32_e32 v2, s4
222 ; VI-NEXT: v_mov_b32_e32 v3, s5
223 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
226 ; GFX9-LABEL: fshl_v2i32:
227 ; GFX9: ; %bb.0: ; %entry
228 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
229 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
230 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c
231 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
232 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
233 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
234 ; GFX9-NEXT: s_lshr_b32 s3, s1, 1
235 ; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1
236 ; GFX9-NEXT: s_not_b32 s1, s9
237 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
238 ; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1
239 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
240 ; GFX9-NEXT: s_not_b32 s1, s8
241 ; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1
242 ; GFX9-NEXT: s_lshr_b32 s0, s0, 1
243 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
244 ; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3
245 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
246 ; GFX9-NEXT: s_endpgm
248 ; R600-LABEL: fshl_v2i32:
249 ; R600: ; %bb.0: ; %entry
250 ; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
251 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
254 ; R600-NEXT: ALU clause starting at 4:
255 ; R600-NEXT: LSHR T0.Z, KC0[3].X, 1,
256 ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1,
257 ; R600-NEXT: NOT_INT * T1.W, KC0[4].X,
258 ; R600-NEXT: BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W,
259 ; R600-NEXT: LSHR T0.Z, KC0[2].W, 1,
260 ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1,
261 ; R600-NEXT: NOT_INT * T1.W, KC0[3].W,
262 ; R600-NEXT: BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W,
263 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
264 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
266 ; GFX10-LABEL: fshl_v2i32:
267 ; GFX10: ; %bb.0: ; %entry
268 ; GFX10-NEXT: s_clause 0x2
269 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
270 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
271 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
272 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
273 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
274 ; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1
275 ; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1
276 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1
277 ; GFX10-NEXT: s_not_b32 s2, s7
278 ; GFX10-NEXT: s_lshr_b32 s0, s0, 1
279 ; GFX10-NEXT: s_not_b32 s3, s6
280 ; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2
281 ; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3
282 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
283 ; GFX10-NEXT: s_endpgm
285 ; GFX11-LABEL: fshl_v2i32:
286 ; GFX11: ; %bb.0: ; %entry
287 ; GFX11-NEXT: s_clause 0x2
288 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
289 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
290 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
291 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
292 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
293 ; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1
294 ; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1
295 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1
296 ; GFX11-NEXT: s_not_b32 s2, s7
297 ; GFX11-NEXT: s_lshr_b32 s0, s0, 1
298 ; GFX11-NEXT: s_not_b32 s3, s6
299 ; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2
300 ; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3
301 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
302 ; GFX11-NEXT: s_endpgm
304 %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
305 store <2 x i32> %0, ptr addrspace(1) %in
; <2 x i32> case with constant shift amounts <7, 9>: @llvm.fshl.v2i32 is
; applied to the kernel arguments %x and %y, and the vector result is stored
; to %in.
; The autogenerated assertions below expect one align-bit instruction per
; element, with constant last operands 25 and 23 (32 - 7 and 32 - 9).
309 define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
310 ; SI-LABEL: fshl_v2i32_imm:
311 ; SI: ; %bb.0: ; %entry
312 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
313 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
314 ; SI-NEXT: s_mov_b32 s7, 0xf000
315 ; SI-NEXT: s_mov_b32 s6, -1
316 ; SI-NEXT: s_waitcnt lgkmcnt(0)
317 ; SI-NEXT: v_mov_b32_e32 v0, s3
318 ; SI-NEXT: v_mov_b32_e32 v2, s2
319 ; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23
320 ; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25
321 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
324 ; VI-LABEL: fshl_v2i32_imm:
325 ; VI: ; %bb.0: ; %entry
326 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
327 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
328 ; VI-NEXT: s_waitcnt lgkmcnt(0)
329 ; VI-NEXT: v_mov_b32_e32 v0, s3
330 ; VI-NEXT: v_mov_b32_e32 v2, s2
331 ; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23
332 ; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25
333 ; VI-NEXT: v_mov_b32_e32 v2, s4
334 ; VI-NEXT: v_mov_b32_e32 v3, s5
335 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
338 ; GFX9-LABEL: fshl_v2i32_imm:
339 ; GFX9: ; %bb.0: ; %entry
340 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
341 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
342 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
343 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
344 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
345 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
346 ; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23
347 ; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25
348 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
349 ; GFX9-NEXT: s_endpgm
351 ; R600-LABEL: fshl_v2i32_imm:
352 ; R600: ; %bb.0: ; %entry
353 ; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
354 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
357 ; R600-NEXT: ALU clause starting at 4:
358 ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
359 ; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
360 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
361 ; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
362 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
363 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
365 ; GFX10-LABEL: fshl_v2i32_imm:
366 ; GFX10: ; %bb.0: ; %entry
367 ; GFX10-NEXT: s_clause 0x1
368 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
369 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
370 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
371 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
372 ; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23
373 ; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25
374 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
375 ; GFX10-NEXT: s_endpgm
377 ; GFX11-LABEL: fshl_v2i32_imm:
378 ; GFX11: ; %bb.0: ; %entry
379 ; GFX11-NEXT: s_clause 0x1
380 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
381 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
382 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
383 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
384 ; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23
385 ; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25
386 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
387 ; GFX11-NEXT: s_endpgm
389 %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
390 store <2 x i32> %0, ptr addrspace(1) %in
; <4 x i32> case with variable per-element shift amounts: @llvm.fshl.v4i32 is
; applied to the kernel arguments %x, %y and %z, and the vector result is
; stored to %in.
; The autogenerated assertions below expect, for each of the four elements, an
; align-bit by 1, a logical shift right by 1, a NOT of the shift amount, and a
; second align-bit that consumes those three values.
394 define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
395 ; SI-LABEL: fshl_v4i32:
396 ; SI: ; %bb.0: ; %entry
397 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
398 ; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15
399 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
400 ; SI-NEXT: s_mov_b32 s3, 0xf000
401 ; SI-NEXT: s_mov_b32 s2, -1
402 ; SI-NEXT: s_waitcnt lgkmcnt(0)
403 ; SI-NEXT: s_not_b32 s5, s19
404 ; SI-NEXT: v_mov_b32_e32 v0, s15
405 ; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1
406 ; SI-NEXT: s_lshr_b32 s4, s11, 1
407 ; SI-NEXT: v_mov_b32_e32 v1, s5
408 ; SI-NEXT: v_alignbit_b32 v3, s4, v0, v1
409 ; SI-NEXT: v_mov_b32_e32 v0, s14
410 ; SI-NEXT: s_not_b32 s5, s18
411 ; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1
412 ; SI-NEXT: s_lshr_b32 s4, s10, 1
413 ; SI-NEXT: v_mov_b32_e32 v1, s5
414 ; SI-NEXT: v_alignbit_b32 v2, s4, v0, v1
415 ; SI-NEXT: v_mov_b32_e32 v0, s13
416 ; SI-NEXT: s_not_b32 s5, s17
417 ; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1
418 ; SI-NEXT: s_lshr_b32 s4, s9, 1
419 ; SI-NEXT: v_mov_b32_e32 v1, s5
420 ; SI-NEXT: v_alignbit_b32 v1, s4, v0, v1
421 ; SI-NEXT: v_mov_b32_e32 v0, s12
422 ; SI-NEXT: s_not_b32 s5, s16
423 ; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1
424 ; SI-NEXT: s_lshr_b32 s4, s8, 1
425 ; SI-NEXT: v_mov_b32_e32 v4, s5
426 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4
427 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
430 ; VI-LABEL: fshl_v4i32:
431 ; VI: ; %bb.0: ; %entry
432 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
433 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
434 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
435 ; VI-NEXT: s_waitcnt lgkmcnt(0)
436 ; VI-NEXT: v_mov_b32_e32 v0, s15
437 ; VI-NEXT: s_not_b32 s3, s3
438 ; VI-NEXT: s_lshr_b32 s6, s11, 1
439 ; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1
440 ; VI-NEXT: v_mov_b32_e32 v1, s3
441 ; VI-NEXT: v_alignbit_b32 v3, s6, v0, v1
442 ; VI-NEXT: v_mov_b32_e32 v0, s14
443 ; VI-NEXT: s_not_b32 s2, s2
444 ; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1
445 ; VI-NEXT: s_lshr_b32 s3, s10, 1
446 ; VI-NEXT: v_mov_b32_e32 v1, s2
447 ; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1
448 ; VI-NEXT: v_mov_b32_e32 v0, s13
449 ; VI-NEXT: s_not_b32 s1, s1
450 ; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1
451 ; VI-NEXT: s_lshr_b32 s2, s9, 1
452 ; VI-NEXT: v_mov_b32_e32 v1, s1
453 ; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1
454 ; VI-NEXT: v_mov_b32_e32 v0, s12
455 ; VI-NEXT: s_not_b32 s0, s0
456 ; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1
457 ; VI-NEXT: s_lshr_b32 s1, s8, 1
458 ; VI-NEXT: v_mov_b32_e32 v4, s0
459 ; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4
460 ; VI-NEXT: v_mov_b32_e32 v4, s4
461 ; VI-NEXT: v_mov_b32_e32 v5, s5
462 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
465 ; GFX9-LABEL: fshl_v4i32:
466 ; GFX9: ; %bb.0: ; %entry
467 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
468 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
469 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
470 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
471 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX9-NEXT: s_not_b32 s3, s3
473 ; GFX9-NEXT: v_mov_b32_e32 v0, s15
474 ; GFX9-NEXT: s_lshr_b32 s4, s11, 1
475 ; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1
476 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
477 ; GFX9-NEXT: v_alignbit_b32 v3, s4, v0, v1
478 ; GFX9-NEXT: v_mov_b32_e32 v0, s14
479 ; GFX9-NEXT: s_not_b32 s2, s2
480 ; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1
481 ; GFX9-NEXT: s_lshr_b32 s3, s10, 1
482 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
483 ; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1
484 ; GFX9-NEXT: v_mov_b32_e32 v0, s13
485 ; GFX9-NEXT: s_not_b32 s1, s1
486 ; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1
487 ; GFX9-NEXT: s_lshr_b32 s2, s9, 1
488 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
489 ; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1
490 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
491 ; GFX9-NEXT: s_not_b32 s0, s0
492 ; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1
493 ; GFX9-NEXT: s_lshr_b32 s1, s8, 1
494 ; GFX9-NEXT: v_mov_b32_e32 v5, s0
495 ; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5
496 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
497 ; GFX9-NEXT: s_endpgm
499 ; R600-LABEL: fshl_v4i32:
500 ; R600: ; %bb.0: ; %entry
501 ; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
502 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
505 ; R600-NEXT: ALU clause starting at 4:
506 ; R600-NEXT: LSHR T0.Z, KC0[4].X, 1,
507 ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
508 ; R600-NEXT: NOT_INT * T1.W, KC0[6].X,
509 ; R600-NEXT: LSHR T0.Y, KC0[3].W, 1,
510 ; R600-NEXT: BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1,
511 ; R600-NEXT: BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W,
512 ; R600-NEXT: NOT_INT * T1.W, KC0[5].W,
513 ; R600-NEXT: LSHR T1.Y, KC0[3].Z, 1,
514 ; R600-NEXT: BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W,
515 ; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1,
516 ; R600-NEXT: NOT_INT * T2.W, KC0[5].Z,
517 ; R600-NEXT: BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W,
518 ; R600-NEXT: LSHR T1.Z, KC0[3].Y, 1,
519 ; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1,
520 ; R600-NEXT: NOT_INT * T2.W, KC0[5].Y,
521 ; R600-NEXT: BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W,
522 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
523 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
525 ; GFX10-LABEL: fshl_v4i32:
526 ; GFX10: ; %bb.0: ; %entry
527 ; GFX10-NEXT: s_clause 0x2
528 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
529 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
530 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
531 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
533 ; GFX10-NEXT: v_alignbit_b32 v0, s11, s15, 1
534 ; GFX10-NEXT: v_alignbit_b32 v1, s10, s14, 1
535 ; GFX10-NEXT: v_alignbit_b32 v5, s9, s13, 1
536 ; GFX10-NEXT: v_alignbit_b32 v6, s8, s12, 1
537 ; GFX10-NEXT: s_lshr_b32 s4, s11, 1
538 ; GFX10-NEXT: s_not_b32 s3, s3
539 ; GFX10-NEXT: s_lshr_b32 s5, s10, 1
540 ; GFX10-NEXT: s_not_b32 s2, s2
541 ; GFX10-NEXT: s_lshr_b32 s9, s9, 1
542 ; GFX10-NEXT: s_not_b32 s1, s1
543 ; GFX10-NEXT: s_lshr_b32 s8, s8, 1
544 ; GFX10-NEXT: s_not_b32 s0, s0
545 ; GFX10-NEXT: v_alignbit_b32 v3, s4, v0, s3
546 ; GFX10-NEXT: v_alignbit_b32 v2, s5, v1, s2
547 ; GFX10-NEXT: v_alignbit_b32 v1, s9, v5, s1
548 ; GFX10-NEXT: v_alignbit_b32 v0, s8, v6, s0
549 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
550 ; GFX10-NEXT: s_endpgm
552 ; GFX11-LABEL: fshl_v4i32:
553 ; GFX11: ; %bb.0: ; %entry
554 ; GFX11-NEXT: s_clause 0x2
555 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
556 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
557 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
558 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
559 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
560 ; GFX11-NEXT: v_alignbit_b32 v0, s11, s15, 1
561 ; GFX11-NEXT: v_alignbit_b32 v1, s10, s14, 1
562 ; GFX11-NEXT: v_alignbit_b32 v5, s9, s13, 1
563 ; GFX11-NEXT: v_alignbit_b32 v6, s8, s12, 1
564 ; GFX11-NEXT: s_lshr_b32 s6, s11, 1
565 ; GFX11-NEXT: s_not_b32 s3, s3
566 ; GFX11-NEXT: s_lshr_b32 s7, s10, 1
567 ; GFX11-NEXT: s_not_b32 s2, s2
568 ; GFX11-NEXT: s_lshr_b32 s9, s9, 1
569 ; GFX11-NEXT: s_not_b32 s1, s1
570 ; GFX11-NEXT: s_lshr_b32 s8, s8, 1
571 ; GFX11-NEXT: s_not_b32 s0, s0
572 ; GFX11-NEXT: v_alignbit_b32 v3, s6, v0, s3
573 ; GFX11-NEXT: v_alignbit_b32 v2, s7, v1, s2
574 ; GFX11-NEXT: v_alignbit_b32 v1, s9, v5, s1
575 ; GFX11-NEXT: v_alignbit_b32 v0, s8, v6, s0
576 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
577 ; GFX11-NEXT: s_endpgm
579 %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
580 store <4 x i32> %0, ptr addrspace(1) %in
; <4 x i32> case with constant shift amounts <1, 7, 9, 33>: @llvm.fshl.v4i32
; is applied to the kernel arguments %x and %y, and the vector result is
; stored to %in.
; The autogenerated assertions below expect one align-bit instruction per
; element with constant last operands 31, 25, 23 and 31. The amount 33, which
; exceeds the 32-bit element width, yields the same operand as the amount 1.
584 define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) {
585 ; SI-LABEL: fshl_v4i32_imm:
586 ; SI: ; %bb.0: ; %entry
587 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
588 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
589 ; SI-NEXT: s_mov_b32 s3, 0xf000
590 ; SI-NEXT: s_mov_b32 s2, -1
591 ; SI-NEXT: s_waitcnt lgkmcnt(0)
592 ; SI-NEXT: v_mov_b32_e32 v0, s15
593 ; SI-NEXT: v_mov_b32_e32 v1, s14
594 ; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31
595 ; SI-NEXT: v_mov_b32_e32 v0, s13
596 ; SI-NEXT: v_alignbit_b32 v2, s10, v1, 23
597 ; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25
598 ; SI-NEXT: v_mov_b32_e32 v0, s12
599 ; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31
600 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
603 ; VI-LABEL: fshl_v4i32_imm:
604 ; VI: ; %bb.0: ; %entry
605 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
606 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
607 ; VI-NEXT: s_waitcnt lgkmcnt(0)
608 ; VI-NEXT: v_mov_b32_e32 v0, s15
609 ; VI-NEXT: v_mov_b32_e32 v1, s14
610 ; VI-NEXT: v_mov_b32_e32 v4, s13
611 ; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31
612 ; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23
613 ; VI-NEXT: v_alignbit_b32 v1, s9, v4, 25
614 ; VI-NEXT: v_mov_b32_e32 v0, s12
615 ; VI-NEXT: v_mov_b32_e32 v5, s1
616 ; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31
617 ; VI-NEXT: v_mov_b32_e32 v4, s0
618 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
621 ; GFX9-LABEL: fshl_v4i32_imm:
622 ; GFX9: ; %bb.0: ; %entry
623 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
624 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
625 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
626 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
627 ; GFX9-NEXT: v_mov_b32_e32 v0, s15
628 ; GFX9-NEXT: v_mov_b32_e32 v1, s14
629 ; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31
630 ; GFX9-NEXT: v_mov_b32_e32 v0, s13
631 ; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23
632 ; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25
633 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
634 ; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31
635 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
636 ; GFX9-NEXT: s_endpgm
638 ; R600-LABEL: fshl_v4i32_imm:
639 ; R600: ; %bb.0: ; %entry
640 ; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
641 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
644 ; R600-NEXT: ALU clause starting at 4:
645 ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
646 ; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
647 ; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
648 ; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
649 ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
650 ; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
651 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
652 ; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
653 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
654 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
656 ; GFX10-LABEL: fshl_v4i32_imm:
657 ; GFX10: ; %bb.0: ; %entry
658 ; GFX10-NEXT: s_clause 0x1
659 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
660 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
661 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
662 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
663 ; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 31
664 ; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 23
665 ; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 25
666 ; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 31
667 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
668 ; GFX10-NEXT: s_endpgm
670 ; GFX11-LABEL: fshl_v4i32_imm:
671 ; GFX11: ; %bb.0: ; %entry
672 ; GFX11-NEXT: s_clause 0x1
673 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
674 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
675 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
676 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
677 ; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 31
678 ; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 23
679 ; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 25
680 ; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 31
681 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
682 ; GFX11-NEXT: s_endpgm
684 %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
685 store <4 x i32> %0, ptr addrspace(1) %in
689 ; (a ^ b) | a --> a | b
; The visible IR ORs %a with an XOR of %shl and %b, passes that OR and the XOR
; to @llvm.fshl.i32 with a constant amount of 7, compares the result with 0,
; and stores either %a or %b to %in depending on the comparison.
; The autogenerated assertions below contain no align-bit instruction: on
; every target the compare/select is fed by a shift left by 7 and an OR.
; NOTE(review): the definition of %shl is not visible in this view; the
; expected shift left by 7 suggests it is a shift of %a - confirm.
690 define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
691 ; SI-LABEL: orxor2or1:
693 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
694 ; SI-NEXT: s_mov_b32 s7, 0xf000
695 ; SI-NEXT: s_mov_b32 s6, -1
696 ; SI-NEXT: s_waitcnt lgkmcnt(0)
697 ; SI-NEXT: s_mov_b32 s4, s0
698 ; SI-NEXT: s_lshl_b32 s0, s2, 7
699 ; SI-NEXT: s_or_b32 s0, s3, s0
700 ; SI-NEXT: s_cmp_eq_u32 s0, 0
701 ; SI-NEXT: s_cselect_b32 s0, s2, s3
702 ; SI-NEXT: s_mov_b32 s5, s1
703 ; SI-NEXT: v_mov_b32_e32 v0, s0
704 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
707 ; VI-LABEL: orxor2or1:
709 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
710 ; VI-NEXT: s_waitcnt lgkmcnt(0)
711 ; VI-NEXT: s_lshl_b32 s4, s2, 7
712 ; VI-NEXT: s_or_b32 s4, s3, s4
713 ; VI-NEXT: s_cmp_eq_u32 s4, 0
714 ; VI-NEXT: s_cselect_b32 s2, s2, s3
715 ; VI-NEXT: v_mov_b32_e32 v0, s0
716 ; VI-NEXT: v_mov_b32_e32 v1, s1
717 ; VI-NEXT: v_mov_b32_e32 v2, s2
718 ; VI-NEXT: flat_store_dword v[0:1], v2
721 ; GFX9-LABEL: orxor2or1:
723 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
724 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
725 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
726 ; GFX9-NEXT: s_lshl_b32 s4, s2, 7
727 ; GFX9-NEXT: s_or_b32 s4, s3, s4
728 ; GFX9-NEXT: s_cmp_eq_u32 s4, 0
729 ; GFX9-NEXT: s_cselect_b32 s2, s2, s3
730 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
731 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
732 ; GFX9-NEXT: s_endpgm
734 ; R600-LABEL: orxor2or1:
736 ; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
737 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
740 ; R600-NEXT: ALU clause starting at 4:
741 ; R600-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
742 ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
743 ; R600-NEXT: OR_INT * T0.W, KC0[2].W, PV.W,
744 ; R600-NEXT: CNDE_INT T0.X, PV.W, KC0[2].Z, KC0[2].W,
745 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
746 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
748 ; GFX10-LABEL: orxor2or1:
750 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
751 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
752 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
753 ; GFX10-NEXT: s_lshl_b32 s4, s2, 7
754 ; GFX10-NEXT: s_or_b32 s4, s3, s4
755 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0
756 ; GFX10-NEXT: s_cselect_b32 s2, s2, s3
757 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
758 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
759 ; GFX10-NEXT: s_endpgm
761 ; GFX11-LABEL: orxor2or1:
763 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
764 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
765 ; GFX11-NEXT: s_lshl_b32 s4, s2, 7
766 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
767 ; GFX11-NEXT: s_or_b32 s4, s3, s4
768 ; GFX11-NEXT: s_cmp_eq_u32 s4, 0
769 ; GFX11-NEXT: s_cselect_b32 s2, s2, s3
770 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
771 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
772 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
773 ; GFX11-NEXT: s_endpgm
775 %xor = xor i32 %shl, %b
776 %or = or i32 %a, %xor
777 %fshl = call i32 @llvm.fshl.i32(i32 %or, i32 %xor, i32 7)
778 %cond = icmp eq i32 %fshl, 0
779 %r = select i1 %cond, i32 %a, i32 %b
780 store i32 %r, ptr addrspace(1) %in