1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI
3 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI
4 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
5 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600
6 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
7 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11
; Declarations of the llvm.fshr (funnel shift right) intrinsic overloads:
; i32 and i16 in scalar, <2 x>, <3 x> and <4 x> forms, plus i64 and i24 in
; scalar and <2 x> forms. The i64 and i24 overloads are not called in the
; portion of the file reviewed here; presumably later tests use them
; (TODO confirm before removing any of these).
9 declare i32 @llvm.fshr.i32(i32, i32, i32)
10 declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
11 declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
12 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
13 declare i16 @llvm.fshr.i16(i16, i16, i16)
14 declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
15 declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
16 declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
17 declare i64 @llvm.fshr.i64(i64, i64, i64)
18 declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
19 declare i24 @llvm.fshr.i24(i24, i24, i24)
20 declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
; Kernel test: 32-bit llvm.fshr where both data operands (%x, %y) and the
; shift amount (%z) are kernel arguments; the result is stored through %in.
; The amdgcn expectations below contain exactly one v_alignbit_b32, and the
; r600 expectations one BIT_ALIGN_INT, taking the three operands in the same
; order as the intrinsic call.
; The check lines are autogenerated (see the first line of the file);
; regenerate them with the update script rather than editing by hand.
22 define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) {
24 ; SI: ; %bb.0: ; %entry
25 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
26 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
27 ; SI-NEXT: s_mov_b32 s3, 0xf000
28 ; SI-NEXT: s_mov_b32 s2, -1
29 ; SI-NEXT: s_waitcnt lgkmcnt(0)
30 ; SI-NEXT: v_mov_b32_e32 v0, s7
31 ; SI-NEXT: v_mov_b32_e32 v1, s8
32 ; SI-NEXT: s_mov_b32 s0, s4
33 ; SI-NEXT: s_mov_b32 s1, s5
34 ; SI-NEXT: v_alignbit_b32 v0, s6, v0, v1
35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
39 ; VI: ; %bb.0: ; %entry
40 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
41 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
42 ; VI-NEXT: s_waitcnt lgkmcnt(0)
43 ; VI-NEXT: v_mov_b32_e32 v0, s7
44 ; VI-NEXT: v_mov_b32_e32 v1, s0
45 ; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1
46 ; VI-NEXT: v_mov_b32_e32 v0, s4
47 ; VI-NEXT: v_mov_b32_e32 v1, s5
48 ; VI-NEXT: flat_store_dword v[0:1], v2
51 ; GFX9-LABEL: fshr_i32:
52 ; GFX9: ; %bb.0: ; %entry
53 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
54 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
55 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
56 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
57 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
58 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
59 ; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, v2
60 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
63 ; R600-LABEL: fshr_i32:
64 ; R600: ; %bb.0: ; %entry
65 ; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
66 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
69 ; R600-NEXT: ALU clause starting at 4:
70 ; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
71 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
72 ; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
74 ; GFX10-LABEL: fshr_i32:
75 ; GFX10: ; %bb.0: ; %entry
76 ; GFX10-NEXT: s_clause 0x1
77 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
78 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
79 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
80 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
81 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
82 ; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, v0
83 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
84 ; GFX10-NEXT: s_endpgm
86 ; GFX11-LABEL: fshr_i32:
87 ; GFX11: ; %bb.0: ; %entry
88 ; GFX11-NEXT: s_clause 0x1
89 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34
90 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
91 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
92 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
93 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
94 ; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, v0
95 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
97 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
98 ; GFX11-NEXT: s_endpgm
100 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
101 store i32 %0, ptr addrspace(1) %in
; Kernel test: 32-bit llvm.fshr with a constant shift amount of 7.
; In the amdgcn expectations the constant is the inline third source operand
; of v_alignbit_b32 (no register is loaded for it); in the r600 expectations
; it is supplied to BIT_ALIGN_INT as a literal.
105 define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
106 ; SI-LABEL: fshr_i32_imm:
107 ; SI: ; %bb.0: ; %entry
108 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
109 ; SI-NEXT: s_mov_b32 s7, 0xf000
110 ; SI-NEXT: s_mov_b32 s6, -1
111 ; SI-NEXT: s_waitcnt lgkmcnt(0)
112 ; SI-NEXT: v_mov_b32_e32 v0, s3
113 ; SI-NEXT: s_mov_b32 s4, s0
114 ; SI-NEXT: s_mov_b32 s5, s1
115 ; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7
116 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
119 ; VI-LABEL: fshr_i32_imm:
120 ; VI: ; %bb.0: ; %entry
121 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
122 ; VI-NEXT: s_waitcnt lgkmcnt(0)
123 ; VI-NEXT: v_mov_b32_e32 v0, s3
124 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7
125 ; VI-NEXT: v_mov_b32_e32 v0, s0
126 ; VI-NEXT: v_mov_b32_e32 v1, s1
127 ; VI-NEXT: flat_store_dword v[0:1], v2
130 ; GFX9-LABEL: fshr_i32_imm:
131 ; GFX9: ; %bb.0: ; %entry
132 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
133 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
134 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
135 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
136 ; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7
137 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
138 ; GFX9-NEXT: s_endpgm
140 ; R600-LABEL: fshr_i32_imm:
141 ; R600: ; %bb.0: ; %entry
142 ; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
143 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
146 ; R600-NEXT: ALU clause starting at 4:
147 ; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
148 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
149 ; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
150 ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
152 ; GFX10-LABEL: fshr_i32_imm:
153 ; GFX10: ; %bb.0: ; %entry
154 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
155 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
156 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
157 ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7
158 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
159 ; GFX10-NEXT: s_endpgm
161 ; GFX11-LABEL: fshr_i32_imm:
162 ; GFX11: ; %bb.0: ; %entry
163 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
164 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
165 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
166 ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7
167 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
168 ; GFX11-NEXT: s_nop 0
169 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
170 ; GFX11-NEXT: s_endpgm
172 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
173 store i32 %0, ptr addrspace(1) %in
; Kernel test: <2 x i32> llvm.fshr with all three vector operands passed as
; kernel arguments; the two-element result is stored through %in.
; The expectations show the vector operation split per element: two
; v_alignbit_b32 instructions on amdgcn and two BIT_ALIGN_INT on r600,
; followed by a single two-dword store.
177 define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
178 ; SI-LABEL: fshr_v2i32:
179 ; SI: ; %bb.0: ; %entry
180 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
181 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
182 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
183 ; SI-NEXT: s_mov_b32 s3, 0xf000
184 ; SI-NEXT: s_mov_b32 s2, -1
185 ; SI-NEXT: s_waitcnt lgkmcnt(0)
186 ; SI-NEXT: v_mov_b32_e32 v0, s7
187 ; SI-NEXT: v_mov_b32_e32 v1, s9
188 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1
189 ; SI-NEXT: v_mov_b32_e32 v0, s6
190 ; SI-NEXT: v_mov_b32_e32 v2, s8
191 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, v2
192 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
195 ; VI-LABEL: fshr_v2i32:
196 ; VI: ; %bb.0: ; %entry
197 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
198 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
199 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
200 ; VI-NEXT: s_waitcnt lgkmcnt(0)
201 ; VI-NEXT: v_mov_b32_e32 v0, s7
202 ; VI-NEXT: v_mov_b32_e32 v1, s3
203 ; VI-NEXT: v_mov_b32_e32 v2, s6
204 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
205 ; VI-NEXT: v_mov_b32_e32 v0, s2
206 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0
207 ; VI-NEXT: v_mov_b32_e32 v3, s1
208 ; VI-NEXT: v_mov_b32_e32 v2, s0
209 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
212 ; GFX9-LABEL: fshr_v2i32:
213 ; GFX9: ; %bb.0: ; %entry
214 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
215 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
216 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
217 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
218 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
219 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
220 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
221 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
222 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
223 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
224 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3
225 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
226 ; GFX9-NEXT: s_endpgm
228 ; R600-LABEL: fshr_v2i32:
229 ; R600: ; %bb.0: ; %entry
230 ; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
231 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
234 ; R600-NEXT: ALU clause starting at 4:
235 ; R600-NEXT: MOV * T0.W, KC0[4].X,
236 ; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
237 ; R600-NEXT: MOV * T0.W, KC0[3].W,
238 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
239 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
240 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
242 ; GFX10-LABEL: fshr_v2i32:
243 ; GFX10: ; %bb.0: ; %entry
244 ; GFX10-NEXT: s_clause 0x2
245 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
246 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
247 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
248 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
249 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
250 ; GFX10-NEXT: v_mov_b32_e32 v0, s3
251 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
252 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0
253 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2
254 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9]
255 ; GFX10-NEXT: s_endpgm
257 ; GFX11-LABEL: fshr_v2i32:
258 ; GFX11: ; %bb.0: ; %entry
259 ; GFX11-NEXT: s_clause 0x2
260 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
261 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
262 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
263 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
264 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
265 ; GFX11-NEXT: v_mov_b32_e32 v2, s2
266 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
267 ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0
268 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2
269 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
270 ; GFX11-NEXT: s_nop 0
271 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
272 ; GFX11-NEXT: s_endpgm
274 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
275 store <2 x i32> %0, ptr addrspace(1) %in
; Kernel test: <2 x i32> llvm.fshr with the constant shift vector <7, 9>.
; In the expectations, the instruction producing element 0 (v0 / T0.X) uses
; the amount 7 and the one producing element 1 (v1 / T0.Y) uses 9, both as
; immediates or literals rather than loaded values.
279 define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
280 ; SI-LABEL: fshr_v2i32_imm:
281 ; SI: ; %bb.0: ; %entry
282 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
283 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
284 ; SI-NEXT: s_mov_b32 s3, 0xf000
285 ; SI-NEXT: s_mov_b32 s2, -1
286 ; SI-NEXT: s_waitcnt lgkmcnt(0)
287 ; SI-NEXT: v_mov_b32_e32 v0, s7
288 ; SI-NEXT: v_mov_b32_e32 v2, s6
289 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 9
290 ; SI-NEXT: v_alignbit_b32 v0, s4, v2, 7
291 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
294 ; VI-LABEL: fshr_v2i32_imm:
295 ; VI: ; %bb.0: ; %entry
296 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
297 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
298 ; VI-NEXT: s_waitcnt lgkmcnt(0)
299 ; VI-NEXT: v_mov_b32_e32 v0, s7
300 ; VI-NEXT: v_mov_b32_e32 v2, s6
301 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9
302 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7
303 ; VI-NEXT: v_mov_b32_e32 v3, s1
304 ; VI-NEXT: v_mov_b32_e32 v2, s0
305 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
308 ; GFX9-LABEL: fshr_v2i32_imm:
309 ; GFX9: ; %bb.0: ; %entry
310 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
311 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
312 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
313 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
314 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
315 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
316 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9
317 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7
318 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
319 ; GFX9-NEXT: s_endpgm
321 ; R600-LABEL: fshr_v2i32_imm:
322 ; R600: ; %bb.0: ; %entry
323 ; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
324 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
327 ; R600-NEXT: ALU clause starting at 4:
328 ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
329 ; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00)
330 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
331 ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
332 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
333 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
335 ; GFX10-LABEL: fshr_v2i32_imm:
336 ; GFX10: ; %bb.0: ; %entry
337 ; GFX10-NEXT: s_clause 0x1
338 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
339 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
340 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
341 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
342 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 9
343 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 7
344 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
345 ; GFX10-NEXT: s_endpgm
347 ; GFX11-LABEL: fshr_v2i32_imm:
348 ; GFX11: ; %bb.0: ; %entry
349 ; GFX11-NEXT: s_clause 0x1
350 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
351 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
352 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
353 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
354 ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9
355 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 7
356 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
357 ; GFX11-NEXT: s_nop 0
358 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
359 ; GFX11-NEXT: s_endpgm
361 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
362 store <2 x i32> %0, ptr addrspace(1) %in
; Kernel test: <4 x i32> llvm.fshr with all three vector operands passed as
; kernel arguments; the four-element result is stored through %in.
; The expectations contain one v_alignbit_b32 (amdgcn) or BIT_ALIGN_INT
; (r600) per element, four in total, followed by one four-dword store.
366 define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
367 ; SI-LABEL: fshr_v4i32:
368 ; SI: ; %bb.0: ; %entry
369 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd
370 ; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15
371 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
372 ; SI-NEXT: s_mov_b32 s3, 0xf000
373 ; SI-NEXT: s_mov_b32 s2, -1
374 ; SI-NEXT: s_waitcnt lgkmcnt(0)
375 ; SI-NEXT: v_mov_b32_e32 v0, s11
376 ; SI-NEXT: v_mov_b32_e32 v1, s15
377 ; SI-NEXT: v_alignbit_b32 v3, s7, v0, v1
378 ; SI-NEXT: v_mov_b32_e32 v0, s10
379 ; SI-NEXT: v_mov_b32_e32 v1, s14
380 ; SI-NEXT: v_alignbit_b32 v2, s6, v0, v1
381 ; SI-NEXT: v_mov_b32_e32 v0, s9
382 ; SI-NEXT: v_mov_b32_e32 v1, s13
383 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1
384 ; SI-NEXT: v_mov_b32_e32 v0, s8
385 ; SI-NEXT: v_mov_b32_e32 v4, s12
386 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4
387 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
390 ; VI-LABEL: fshr_v4i32:
391 ; VI: ; %bb.0: ; %entry
392 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
393 ; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
394 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
395 ; VI-NEXT: s_waitcnt lgkmcnt(0)
396 ; VI-NEXT: v_mov_b32_e32 v0, s11
397 ; VI-NEXT: v_mov_b32_e32 v1, s15
398 ; VI-NEXT: v_mov_b32_e32 v2, s10
399 ; VI-NEXT: v_alignbit_b32 v3, s7, v0, v1
400 ; VI-NEXT: v_mov_b32_e32 v0, s14
401 ; VI-NEXT: v_alignbit_b32 v2, s6, v2, v0
402 ; VI-NEXT: v_mov_b32_e32 v0, s9
403 ; VI-NEXT: v_mov_b32_e32 v1, s13
404 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
405 ; VI-NEXT: v_mov_b32_e32 v0, s8
406 ; VI-NEXT: v_mov_b32_e32 v4, s12
407 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4
408 ; VI-NEXT: v_mov_b32_e32 v5, s1
409 ; VI-NEXT: v_mov_b32_e32 v4, s0
410 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
413 ; GFX9-LABEL: fshr_v4i32:
414 ; GFX9: ; %bb.0: ; %entry
415 ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
416 ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
417 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
418 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
419 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
420 ; GFX9-NEXT: v_mov_b32_e32 v0, s11
421 ; GFX9-NEXT: v_mov_b32_e32 v1, s15
422 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1
423 ; GFX9-NEXT: v_mov_b32_e32 v0, s10
424 ; GFX9-NEXT: v_mov_b32_e32 v1, s14
425 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1
426 ; GFX9-NEXT: v_mov_b32_e32 v0, s9
427 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
428 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
429 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
430 ; GFX9-NEXT: v_mov_b32_e32 v5, s12
431 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5
432 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
433 ; GFX9-NEXT: s_endpgm
435 ; R600-LABEL: fshr_v4i32:
436 ; R600: ; %bb.0: ; %entry
437 ; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
438 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
441 ; R600-NEXT: ALU clause starting at 4:
442 ; R600-NEXT: MOV * T0.W, KC0[6].X,
443 ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
444 ; R600-NEXT: MOV * T1.W, KC0[5].W,
445 ; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
446 ; R600-NEXT: MOV * T1.W, KC0[5].Z,
447 ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
448 ; R600-NEXT: MOV * T1.W, KC0[5].Y,
449 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
450 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
451 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
453 ; GFX10-LABEL: fshr_v4i32:
454 ; GFX10: ; %bb.0: ; %entry
455 ; GFX10-NEXT: s_clause 0x2
456 ; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
457 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
458 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
459 ; GFX10-NEXT: v_mov_b32_e32 v6, 0
460 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
461 ; GFX10-NEXT: v_mov_b32_e32 v0, s15
462 ; GFX10-NEXT: v_mov_b32_e32 v1, s14
463 ; GFX10-NEXT: v_mov_b32_e32 v4, s13
464 ; GFX10-NEXT: v_mov_b32_e32 v5, s12
465 ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, v0
466 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, v1
467 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, v4
468 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v5
469 ; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3]
470 ; GFX10-NEXT: s_endpgm
472 ; GFX11-LABEL: fshr_v4i32:
473 ; GFX11: ; %bb.0: ; %entry
474 ; GFX11-NEXT: s_clause 0x2
475 ; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54
476 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
477 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
478 ; GFX11-NEXT: v_mov_b32_e32 v6, 0
479 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
480 ; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14
481 ; GFX11-NEXT: v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v5, s12
482 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
483 ; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, v0
484 ; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, v1
485 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
486 ; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, v4
487 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, v5
488 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
489 ; GFX11-NEXT: s_nop 0
490 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
491 ; GFX11-NEXT: s_endpgm
493 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
494 store <4 x i32> %0, ptr addrspace(1) %in
; Kernel test: <4 x i32> llvm.fshr with the constant shift vector
; <1, 7, 9, 33>.
; The last element deliberately uses an amount (33) larger than the 32-bit
; element width. In the expectations the instruction producing that element
; (v3 / T0.W) carries the amount 1, the same value as element 0, i.e. the
; constant shows up reduced modulo 32. Elements 1 and 2 use 7 and 9.
498 define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) {
499 ; SI-LABEL: fshr_v4i32_imm:
500 ; SI: ; %bb.0: ; %entry
501 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd
502 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
503 ; SI-NEXT: s_mov_b32 s3, 0xf000
504 ; SI-NEXT: s_mov_b32 s2, -1
505 ; SI-NEXT: s_waitcnt lgkmcnt(0)
506 ; SI-NEXT: v_mov_b32_e32 v0, s11
507 ; SI-NEXT: v_mov_b32_e32 v1, s10
508 ; SI-NEXT: v_alignbit_b32 v3, s7, v0, 1
509 ; SI-NEXT: v_mov_b32_e32 v0, s9
510 ; SI-NEXT: v_alignbit_b32 v2, s6, v1, 9
511 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 7
512 ; SI-NEXT: v_mov_b32_e32 v0, s8
513 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1
514 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
517 ; VI-LABEL: fshr_v4i32_imm:
518 ; VI: ; %bb.0: ; %entry
519 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
520 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
521 ; VI-NEXT: s_waitcnt lgkmcnt(0)
522 ; VI-NEXT: v_mov_b32_e32 v0, s11
523 ; VI-NEXT: v_mov_b32_e32 v1, s10
524 ; VI-NEXT: v_mov_b32_e32 v4, s9
525 ; VI-NEXT: v_alignbit_b32 v3, s7, v0, 1
526 ; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9
527 ; VI-NEXT: v_alignbit_b32 v1, s5, v4, 7
528 ; VI-NEXT: v_mov_b32_e32 v0, s8
529 ; VI-NEXT: v_mov_b32_e32 v5, s1
530 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
531 ; VI-NEXT: v_mov_b32_e32 v4, s0
532 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
535 ; GFX9-LABEL: fshr_v4i32_imm:
536 ; GFX9: ; %bb.0: ; %entry
537 ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
538 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
539 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
540 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
541 ; GFX9-NEXT: v_mov_b32_e32 v0, s11
542 ; GFX9-NEXT: v_mov_b32_e32 v1, s10
543 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1
544 ; GFX9-NEXT: v_mov_b32_e32 v0, s9
545 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9
546 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7
547 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
548 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
549 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
550 ; GFX9-NEXT: s_endpgm
552 ; R600-LABEL: fshr_v4i32_imm:
553 ; R600: ; %bb.0: ; %entry
554 ; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
555 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
558 ; R600-NEXT: ALU clause starting at 4:
559 ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
560 ; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
561 ; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00)
562 ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
563 ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
564 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
565 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
566 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
568 ; GFX10-LABEL: fshr_v4i32_imm:
569 ; GFX10: ; %bb.0: ; %entry
570 ; GFX10-NEXT: s_clause 0x1
571 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
572 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
573 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
574 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
575 ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1
576 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9
577 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7
578 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1
579 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
580 ; GFX10-NEXT: s_endpgm
582 ; GFX11-LABEL: fshr_v4i32_imm:
583 ; GFX11: ; %bb.0: ; %entry
584 ; GFX11-NEXT: s_clause 0x1
585 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
586 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
587 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
588 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
589 ; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1
590 ; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 9
591 ; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 7
592 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 1
593 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
594 ; GFX11-NEXT: s_nop 0
595 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
596 ; GFX11-NEXT: s_endpgm
598 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
599 store <4 x i32> %0, ptr addrspace(1) %in
; Non-kernel function variant of the scalar i32 test: the three operands are
; ordinary function arguments rather than kernel arguments.
; The tahiti, tonga and gfx900 runs share the GFX89 prefix here, so their
; output must be identical. Every amdgcn expectation is a single
; v_alignbit_b32 v0, v0, v1, v2 followed by s_setpc_b64. Only the label line
; of the r600 expectations is visible in the portion reviewed.
603 define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
604 ; GFX89-LABEL: v_fshr_i32:
606 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
607 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v1, v2
608 ; GFX89-NEXT: s_setpc_b64 s[30:31]
610 ; R600-LABEL: v_fshr_i32:
615 ; GFX10-LABEL: v_fshr_i32:
617 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
618 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2
619 ; GFX10-NEXT: s_setpc_b64 s[30:31]
621 ; GFX11-LABEL: v_fshr_i32:
623 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
624 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2
625 ; GFX11-NEXT: s_setpc_b64 s[30:31]
626 %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
; Non-kernel function variant for <2 x i32>: operands are function arguments.
; The amdgcn expectations are two v_alignbit_b32 instructions, one per
; element, writing v0 and v1. The tahiti, tonga and gfx900 runs share the
; GFX89 prefix.
630 define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
631 ; GFX89-LABEL: v_fshr_v2i32:
633 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
634 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v2, v4
635 ; GFX89-NEXT: v_alignbit_b32 v1, v1, v3, v5
636 ; GFX89-NEXT: s_setpc_b64 s[30:31]
638 ; R600-LABEL: v_fshr_v2i32:
643 ; GFX10-LABEL: v_fshr_v2i32:
645 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
646 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
647 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
648 ; GFX10-NEXT: s_setpc_b64 s[30:31]
650 ; GFX11-LABEL: v_fshr_v2i32:
652 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
653 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4
654 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5
655 ; GFX11-NEXT: s_setpc_b64 s[30:31]
656 %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
; Non-kernel function variant for the odd-sized <3 x i32> vector.
; The amdgcn expectations are exactly three v_alignbit_b32 instructions
; (writing v0, v1 and v2), so no instruction is emitted for a padding
; fourth element.
660 define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
661 ; GFX89-LABEL: v_fshr_v3i32:
663 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
664 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v3, v6
665 ; GFX89-NEXT: v_alignbit_b32 v1, v1, v4, v7
666 ; GFX89-NEXT: v_alignbit_b32 v2, v2, v5, v8
667 ; GFX89-NEXT: s_setpc_b64 s[30:31]
669 ; R600-LABEL: v_fshr_v3i32:
674 ; GFX10-LABEL: v_fshr_v3i32:
676 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
677 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6
678 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7
679 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8
680 ; GFX10-NEXT: s_setpc_b64 s[30:31]
682 ; GFX11-LABEL: v_fshr_v3i32:
684 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
685 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6
686 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7
687 ; GFX11-NEXT: v_alignbit_b32 v2, v2, v5, v8
688 ; GFX11-NEXT: s_setpc_b64 s[30:31]
689 %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
; Non-kernel function variant for <4 x i32>: operands are function arguments.
; The amdgcn expectations are four v_alignbit_b32 instructions, one per
; element, writing v0 through v3.
693 define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
694 ; GFX89-LABEL: v_fshr_v4i32:
696 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v4, v8
698 ; GFX89-NEXT: v_alignbit_b32 v1, v1, v5, v9
699 ; GFX89-NEXT: v_alignbit_b32 v2, v2, v6, v10
700 ; GFX89-NEXT: v_alignbit_b32 v3, v3, v7, v11
701 ; GFX89-NEXT: s_setpc_b64 s[30:31]
703 ; R600-LABEL: v_fshr_v4i32:
708 ; GFX10-LABEL: v_fshr_v4i32:
710 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
711 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8
712 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9
713 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10
714 ; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11
715 ; GFX10-NEXT: s_setpc_b64 s[30:31]
717 ; GFX11-LABEL: v_fshr_v4i32:
719 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
720 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8
721 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9
722 ; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10
723 ; GFX11-NEXT: v_alignbit_b32 v3, v3, v7, v11
724 ; GFX11-NEXT: s_setpc_b64 s[30:31]
725 %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
; Non-kernel function test for the 16-bit scalar llvm.fshr.
; The expected output differs by target, so tahiti, tonga and gfx900 are
; checked under their individual prefixes instead of the shared one.
; - tahiti: still a single v_alignbit_b32, preceded by an OR of the shift
;   amount with 16 and a left shift of v1 by 16.
; - tonga, gfx900, gfx1010, gfx1100: no alignbit. The sequence is a 16-bit
;   left shift of v0 by 1, a further left shift by the inverted amount
;   (xor with -1), a 16-bit right shift of v1 by the amount, and an OR of
;   the two halves.
729 define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
730 ; SI-LABEL: v_fshr_i16:
732 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
733 ; SI-NEXT: v_or_b32_e32 v2, 16, v2
734 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
735 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
736 ; SI-NEXT: s_setpc_b64 s[30:31]
738 ; VI-LABEL: v_fshr_i16:
740 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
741 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
742 ; VI-NEXT: v_xor_b32_e32 v3, -1, v2
743 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
744 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
745 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
746 ; VI-NEXT: s_setpc_b64 s[30:31]
748 ; GFX9-LABEL: v_fshr_i16:
750 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
751 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
752 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
753 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
754 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
755 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
756 ; GFX9-NEXT: s_setpc_b64 s[30:31]
758 ; R600-LABEL: v_fshr_i16:
763 ; GFX10-LABEL: v_fshr_i16:
765 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
766 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
767 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
768 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
769 ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
770 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
771 ; GFX10-NEXT: s_setpc_b64 s[30:31]
773 ; GFX11-LABEL: v_fshr_i16:
775 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
776 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
777 ; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2
778 ; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1
779 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
780 ; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0
781 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
782 ; GFX11-NEXT: s_setpc_b64 s[30:31]
783 %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
; Non-kernel function test for <2 x i16> llvm.fshr. Each target family has a
; distinct expected sequence:
; - tahiti: each element is handled as in the scalar i16 case (OR amount
;   with 16, shift left by 16, v_alignbit_b32), then the two results are
;   masked with 0xffff and recombined.
; - tonga: 16-bit shifts; the high element is handled with SDWA operand
;   selects (WORD_1), and the halves are merged with v_or_b32_sdwa.
; - gfx900, gfx1010, gfx1100: packed 16-bit shifts (v_pk_lshlrev_b16 and
;   v_pk_lshrrev_b16), with the amount and its inverse masked by 0xf000f,
;   followed by one OR.
787 define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
788 ; SI-LABEL: v_fshr_v2i16:
790 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
791 ; SI-NEXT: v_or_b32_e32 v5, 16, v5
792 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
793 ; SI-NEXT: v_or_b32_e32 v4, 16, v4
794 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
795 ; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5
796 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
797 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
798 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
799 ; SI-NEXT: v_or_b32_e32 v0, v0, v3
800 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
801 ; SI-NEXT: s_setpc_b64 s[30:31]
803 ; VI-LABEL: v_fshr_v2i16:
805 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
806 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
807 ; VI-NEXT: v_mov_b32_e32 v5, 1
808 ; VI-NEXT: v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
809 ; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
810 ; VI-NEXT: v_xor_b32_e32 v3, -1, v3
811 ; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5
812 ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
813 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
814 ; VI-NEXT: v_xor_b32_e32 v4, -1, v2
815 ; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0
816 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
817 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
818 ; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
819 ; VI-NEXT: s_setpc_b64 s[30:31]
821 ; GFX9-LABEL: v_fshr_v2i16:
823 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
824 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
825 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
826 ; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v3
827 ; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2
828 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0
829 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1
830 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
831 ; GFX9-NEXT: s_setpc_b64 s[30:31]
833 ; R600-LABEL: v_fshr_v2i16:
838 ; GFX10-LABEL: v_fshr_v2i16:
840 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
841 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
842 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
843 ; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2
844 ; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3
845 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1
846 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0
847 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
848 ; GFX10-NEXT: s_setpc_b64 s[30:31]
850 ; GFX11-LABEL: v_fshr_v2i16:
852 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
853 ; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2
854 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
855 ; GFX11-NEXT: v_and_b32_e32 v2, 0xf000f, v2
856 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
857 ; GFX11-NEXT: v_and_b32_e32 v3, 0xf000f, v3
858 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, v2, v1
859 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
860 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v3, v0
861 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
862 ; GFX11-NEXT: s_setpc_b64 s[30:31]
863 %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
; v_fshr_v3i16: non-kernel function that hands %src0, %src1 and %src2, in that
; order, to the llvm.fshr.v3i16 intrinsic (the only IR statement visible here).
; The prefixed comment lines that follow are the expected llc output for each
; target configured in the file header; that header states they are
; autogenerated by utils/update_llc_test_checks.py, so regenerate them instead
; of editing them by hand.
; What the expectations show: the SI sequence does each element with
; v_alignbit_b32, feeding it the second source shifted left by 16 and the
; amount OR'ed with 16. The VI, GFX9, GFX10 and GFX11 sequences instead build
; each element from 16-bit shifts (first source shifted left by 1 and then by
; the inverted amount, second source shifted right by the amount) OR'ed
; together, and then repack the halves.
867 define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
868 ; SI-LABEL: v_fshr_v3i16:
870 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
871 ; SI-NEXT: v_or_b32_e32 v7, 16, v7
872 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
873 ; SI-NEXT: v_alignbit_b32 v1, v1, v4, v7
874 ; SI-NEXT: v_or_b32_e32 v4, 16, v6
875 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
876 ; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4
877 ; SI-NEXT: v_or_b32_e32 v3, 16, v8
878 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
879 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
880 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
881 ; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3
882 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
883 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3
884 ; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
885 ; SI-NEXT: s_setpc_b64 s[30:31]
887 ; VI-LABEL: v_fshr_v3i16:
889 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
890 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
891 ; VI-NEXT: v_mov_b32_e32 v8, 1
892 ; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
893 ; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
894 ; VI-NEXT: v_xor_b32_e32 v6, -1, v6
895 ; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8
896 ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
897 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
898 ; VI-NEXT: v_xor_b32_e32 v7, -1, v5
899 ; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1
900 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
901 ; VI-NEXT: v_or_b32_e32 v1, v1, v3
902 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
903 ; VI-NEXT: v_xor_b32_e32 v3, -1, v4
904 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
905 ; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2
906 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
907 ; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
908 ; VI-NEXT: s_setpc_b64 s[30:31]
910 ; GFX9-LABEL: v_fshr_v3i16:
912 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
913 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
914 ; GFX9-NEXT: v_mov_b32_e32 v8, 1
915 ; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
916 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
917 ; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
918 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8
919 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
920 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
921 ; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5
922 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1
923 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
924 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
925 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
926 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
927 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
928 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2
929 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
930 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
931 ; GFX9-NEXT: v_perm_b32 v0, v6, v0, s4
932 ; GFX9-NEXT: s_setpc_b64 s[30:31]
934 ; R600-LABEL: v_fshr_v3i16:
939 ; GFX10-LABEL: v_fshr_v3i16:
941 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
942 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
943 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4
944 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2
945 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
946 ; GFX10-NEXT: v_xor_b32_e32 v10, -1, v4
947 ; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6
948 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7
949 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
950 ; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8
951 ; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0
952 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
953 ; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6
954 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5
955 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
956 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
957 ; GFX10-NEXT: v_or_b32_e32 v5, v6, v7
958 ; GFX10-NEXT: v_lshlrev_b16 v1, v4, v1
959 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
960 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
961 ; GFX10-NEXT: s_setpc_b64 s[30:31]
963 ; GFX11-LABEL: v_fshr_v3i16:
965 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
966 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0
967 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4
968 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
969 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
970 ; GFX11-NEXT: v_xor_b32_e32 v10, -1, v4
971 ; GFX11-NEXT: v_lshlrev_b16 v6, 1, v6
972 ; GFX11-NEXT: v_xor_b32_e32 v9, -1, v7
973 ; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1
974 ; GFX11-NEXT: v_lshrrev_b16 v7, v7, v8
975 ; GFX11-NEXT: v_lshlrev_b16 v0, v10, v0
976 ; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2
977 ; GFX11-NEXT: v_lshlrev_b16 v6, v9, v6
978 ; GFX11-NEXT: v_xor_b32_e32 v4, -1, v5
979 ; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3
980 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
981 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
982 ; GFX11-NEXT: v_or_b32_e32 v5, v6, v7
983 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
984 ; GFX11-NEXT: v_lshlrev_b16 v1, v4, v1
985 ; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
986 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
987 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
988 ; GFX11-NEXT: s_setpc_b64 s[30:31]
989 %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
; v_fshr_v4i16: non-kernel function that hands %src0, %src1 and %src2, in that
; order, to the llvm.fshr.v4i16 intrinsic (the only IR statement visible here).
; The prefixed comment lines that follow are the expected llc output for each
; target configured in the file header; that header states they are
; autogenerated by utils/update_llc_test_checks.py, so regenerate them instead
; of editing them by hand.
; What the expectations show: the SI sequence issues one v_alignbit_b32 per
; element (second source shifted left by 16, amount OR'ed with 16) and then
; repacks the four results. The VI, GFX9, GFX10 and GFX11 sequences compute
; every 16-bit lane with a shift-left-by-1, a shift-left by the inverted
; amount, a shift-right by the amount and an OR, then merge the lane pairs
; (SDWA OR on VI, v_perm_b32 with selector 0x5040100 on the GFX9+ targets).
993 define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
994 ; SI-LABEL: v_fshr_v4i16:
996 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
997 ; SI-NEXT: v_or_b32_e32 v9, 16, v9
998 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
999 ; SI-NEXT: v_alignbit_b32 v1, v1, v5, v9
1000 ; SI-NEXT: v_or_b32_e32 v5, 16, v8
1001 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
1002 ; SI-NEXT: v_alignbit_b32 v0, v0, v4, v5
1003 ; SI-NEXT: v_or_b32_e32 v4, 16, v11
1004 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7
1005 ; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4
1006 ; SI-NEXT: v_or_b32_e32 v5, 16, v10
1007 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
1008 ; SI-NEXT: v_alignbit_b32 v2, v2, v6, v5
1009 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3
1010 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1011 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1012 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1013 ; SI-NEXT: v_or_b32_e32 v2, v2, v4
1014 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
1015 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
1016 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
1017 ; SI-NEXT: s_setpc_b64 s[30:31]
1019 ; VI-LABEL: v_fshr_v4i16:
1021 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1022 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
1023 ; VI-NEXT: v_mov_b32_e32 v8, 1
1024 ; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1025 ; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1026 ; VI-NEXT: v_xor_b32_e32 v6, -1, v6
1027 ; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9
1028 ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1029 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
1030 ; VI-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1031 ; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1032 ; VI-NEXT: v_xor_b32_e32 v7, -1, v7
1033 ; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8
1034 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
1035 ; VI-NEXT: v_xor_b32_e32 v8, -1, v5
1036 ; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1
1037 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
1038 ; VI-NEXT: v_or_b32_e32 v1, v1, v3
1039 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
1040 ; VI-NEXT: v_xor_b32_e32 v3, -1, v4
1041 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
1042 ; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2
1043 ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1044 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
1045 ; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1046 ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1047 ; VI-NEXT: s_setpc_b64 s[30:31]
1049 ; GFX9-LABEL: v_fshr_v4i16:
1051 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1052 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5
1053 ; GFX9-NEXT: v_mov_b32_e32 v8, 1
1054 ; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1055 ; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1056 ; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
1057 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9
1058 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
1059 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
1060 ; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1061 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1062 ; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7
1063 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8
1064 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
1065 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5
1066 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1
1067 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
1068 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
1069 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
1070 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
1071 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
1072 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2
1073 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v9
1074 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
1075 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
1076 ; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4
1077 ; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4
1078 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1080 ; R600-LABEL: v_fshr_v4i16:
1085 ; GFX10-LABEL: v_fshr_v4i16:
1087 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1088 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3
1089 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
1090 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
1091 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v0
1092 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4
1093 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2
1094 ; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6
1095 ; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8
1096 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7
1097 ; GFX10-NEXT: v_lshlrev_b16 v9, 1, v9
1098 ; GFX10-NEXT: v_xor_b32_e32 v12, -1, v10
1099 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
1100 ; GFX10-NEXT: v_xor_b32_e32 v13, -1, v5
1101 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
1102 ; GFX10-NEXT: v_xor_b32_e32 v14, -1, v4
1103 ; GFX10-NEXT: v_lshlrev_b16 v7, v7, v8
1104 ; GFX10-NEXT: v_lshrrev_b16 v8, v10, v11
1105 ; GFX10-NEXT: v_lshlrev_b16 v9, v12, v9
1106 ; GFX10-NEXT: v_lshlrev_b16 v1, v13, v1
1107 ; GFX10-NEXT: v_lshlrev_b16 v0, v14, v0
1108 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
1109 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
1110 ; GFX10-NEXT: v_or_b32_e32 v4, v7, v6
1111 ; GFX10-NEXT: v_or_b32_e32 v5, v9, v8
1112 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
1113 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
1114 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
1115 ; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
1116 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1118 ; GFX11-LABEL: v_fshr_v4i16:
1120 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1121 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
1122 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v5
1123 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1
1124 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0
1125 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v4
1126 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2
1127 ; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6
1128 ; GFX11-NEXT: v_lshlrev_b16 v8, 1, v8
1129 ; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7
1130 ; GFX11-NEXT: v_lshlrev_b16 v9, 1, v9
1131 ; GFX11-NEXT: v_xor_b32_e32 v12, -1, v10
1132 ; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1
1133 ; GFX11-NEXT: v_xor_b32_e32 v13, -1, v5
1134 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
1135 ; GFX11-NEXT: v_xor_b32_e32 v14, -1, v4
1136 ; GFX11-NEXT: v_lshlrev_b16 v7, v7, v8
1137 ; GFX11-NEXT: v_lshrrev_b16 v8, v10, v11
1138 ; GFX11-NEXT: v_lshlrev_b16 v9, v12, v9
1139 ; GFX11-NEXT: v_lshlrev_b16 v1, v13, v1
1140 ; GFX11-NEXT: v_lshlrev_b16 v0, v14, v0
1141 ; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2
1142 ; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3
1143 ; GFX11-NEXT: v_or_b32_e32 v4, v7, v6
1144 ; GFX11-NEXT: v_or_b32_e32 v5, v9, v8
1145 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1146 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
1147 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
1148 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1149 ; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
1150 ; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
1151 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1152 %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
; v_fshr_i64: non-kernel function that hands %src0, %src1 and %src2, in that
; order, to the llvm.fshr.i64 intrinsic (the only IR statement visible here).
; The prefixed comment lines that follow are the expected llc output for each
; target configured in the file header; that header states they are
; autogenerated by utils/update_llc_test_checks.py, so regenerate them instead
; of editing them by hand.
; What the expectations show, on every amdgcn target listed: the first source
; (v[0:1]) is shifted left by 1 and then by the bitwise NOT of the amount (v4),
; the second source (v[2:3]) is shifted right by the amount, and the two
; 64-bit results are OR'ed half by half. Only instruction order, the scratch
; register for the inverted amount and the GFX11 s_delay_alu hints differ.
1156 define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
1157 ; SI-LABEL: v_fshr_i64:
1159 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1160 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
1161 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
1162 ; SI-NEXT: v_not_b32_e32 v4, v4
1163 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
1164 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
1165 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
1166 ; SI-NEXT: s_setpc_b64 s[30:31]
1168 ; VI-LABEL: v_fshr_i64:
1170 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1171 ; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1172 ; VI-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
1173 ; VI-NEXT: v_not_b32_e32 v4, v4
1174 ; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
1175 ; VI-NEXT: v_or_b32_e32 v1, v1, v3
1176 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
1177 ; VI-NEXT: s_setpc_b64 s[30:31]
1179 ; GFX9-LABEL: v_fshr_i64:
1181 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1182 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1183 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
1184 ; GFX9-NEXT: v_not_b32_e32 v4, v4
1185 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
1186 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
1187 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
1188 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1190 ; R600-LABEL: v_fshr_i64:
1195 ; GFX10-LABEL: v_fshr_i64:
1197 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1198 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1199 ; GFX10-NEXT: v_not_b32_e32 v5, v4
1200 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
1201 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
1202 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
1203 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
1204 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1206 ; GFX11-LABEL: v_fshr_i64:
1208 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1209 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1210 ; GFX11-NEXT: v_not_b32_e32 v5, v4
1211 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
1212 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1213 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
1214 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
1215 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1216 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
1217 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1218 %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
; v_fshr_v2i64: non-kernel function that hands %src0, %src1 and %src2, in that
; order, to the llvm.fshr.v2i64 intrinsic (the only IR statement visible here).
; The prefixed comment lines that follow are the expected llc output for each
; target configured in the file header; that header states they are
; autogenerated by utils/update_llc_test_checks.py, so regenerate them instead
; of editing them by hand.
; What the expectations show: each of the two 64-bit elements is handled
; independently with the scalar i64 pattern, i.e. first source shifted left by
; 1 and then by the inverted amount (amounts are read from v8 and v10), second
; source shifted right by the amount, results OR'ed per 32-bit half.
1222 define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
1223 ; SI-LABEL: v_fshr_v2i64:
1225 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1226 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
1227 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v8
1228 ; SI-NEXT: v_not_b32_e32 v8, v8
1229 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
1230 ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
1231 ; SI-NEXT: v_or_b32_e32 v1, v1, v5
1232 ; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v10
1233 ; SI-NEXT: v_not_b32_e32 v7, v10
1234 ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
1235 ; SI-NEXT: v_or_b32_e32 v0, v0, v4
1236 ; SI-NEXT: v_or_b32_e32 v3, v3, v6
1237 ; SI-NEXT: v_or_b32_e32 v2, v2, v5
1238 ; SI-NEXT: s_setpc_b64 s[30:31]
1240 ; VI-LABEL: v_fshr_v2i64:
1242 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1243 ; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1244 ; VI-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
1245 ; VI-NEXT: v_not_b32_e32 v8, v8
1246 ; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
1247 ; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
1248 ; VI-NEXT: v_or_b32_e32 v1, v1, v5
1249 ; VI-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7]
1250 ; VI-NEXT: v_not_b32_e32 v7, v10
1251 ; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
1252 ; VI-NEXT: v_or_b32_e32 v0, v0, v4
1253 ; VI-NEXT: v_or_b32_e32 v3, v3, v6
1254 ; VI-NEXT: v_or_b32_e32 v2, v2, v5
1255 ; VI-NEXT: s_setpc_b64 s[30:31]
1257 ; GFX9-LABEL: v_fshr_v2i64:
1259 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1260 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1261 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
1262 ; GFX9-NEXT: v_not_b32_e32 v8, v8
1263 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
1264 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
1265 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
1266 ; GFX9-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7]
1267 ; GFX9-NEXT: v_not_b32_e32 v7, v10
1268 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
1269 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
1270 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v6
1271 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v5
1272 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1274 ; R600-LABEL: v_fshr_v2i64:
1279 ; GFX10-LABEL: v_fshr_v2i64:
1281 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1282 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1283 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
1284 ; GFX10-NEXT: v_not_b32_e32 v9, v8
1285 ; GFX10-NEXT: v_not_b32_e32 v11, v10
1286 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
1287 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
1288 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
1289 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
1290 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
1291 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
1292 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
1293 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
1294 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1296 ; GFX11-LABEL: v_fshr_v2i64:
1298 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1299 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1300 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
1301 ; GFX11-NEXT: v_not_b32_e32 v9, v8
1302 ; GFX11-NEXT: v_not_b32_e32 v11, v10
1303 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
1304 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
1305 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1306 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
1307 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
1308 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1309 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v4
1310 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v5
1311 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
1312 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v6
1313 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v7
1314 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1315 %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
; v_fshr_i24: non-kernel function that hands %src0, %src1 and %src2, in that
; order, to the llvm.fshr.i24 intrinsic (the only IR statement visible here).
; The prefixed comment lines that follow are the expected llc output for each
; target configured in the file header; that header states they are
; autogenerated by utils/update_llc_test_checks.py, so regenerate them instead
; of editing them by hand.
; What the expectations show, on every amdgcn target listed: the amount (v2)
; is masked to 24 bits, multiplied high by 0xaaaaaab, the product is multiplied
; by 24 and subtracted from the amount, 8 is added, and the result drives a
; single v_alignbit_b32 whose second source is %src1 shifted left by 8.
; NOTE(review): the mul_hi / mul-by-24 / subtract sequence appears to be an
; unsigned remainder by 24 (0xaaaaaab is about 2^32 / 24) - confirm against the
; intrinsic's documented shift-amount semantics.
1319 define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
1320 ; SI-LABEL: v_fshr_i24:
1322 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1323 ; SI-NEXT: v_and_b32_e32 v3, 0xffffff, v2
1324 ; SI-NEXT: s_mov_b32 s4, 0xaaaaaab
1325 ; SI-NEXT: v_mul_hi_u32 v3, v3, s4
1326 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1327 ; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v3
1328 ; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
1329 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2
1330 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
1331 ; SI-NEXT: s_setpc_b64 s[30:31]
1333 ; VI-LABEL: v_fshr_i24:
1335 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1336 ; VI-NEXT: v_and_b32_e32 v3, 0xffffff, v2
1337 ; VI-NEXT: s_mov_b32 s4, 0xaaaaaab
1338 ; VI-NEXT: v_mul_hi_u32 v3, v3, s4
1339 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1340 ; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v3
1341 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
1342 ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2
1343 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2
1344 ; VI-NEXT: s_setpc_b64 s[30:31]
1346 ; GFX9-LABEL: v_fshr_i24:
1348 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1349 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v2
1350 ; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab
1351 ; GFX9-NEXT: v_mul_hi_u32 v3, v3, s4
1352 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1353 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v3
1354 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3
1355 ; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
1356 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2
1357 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1359 ; R600-LABEL: v_fshr_i24:
1364 ; GFX10-LABEL: v_fshr_i24:
1366 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1367 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v2
1368 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1369 ; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3
1370 ; GFX10-NEXT: v_mul_u32_u24_e32 v3, 24, v3
1371 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3
1372 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2
1373 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2
1374 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1376 ; GFX11-LABEL: v_fshr_i24:
1378 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1379 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v2
1380 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1381 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1382 ; GFX11-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3
1383 ; GFX11-NEXT: v_mul_u32_u24_e32 v3, 24, v3
1384 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1385 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3
1386 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 8, v2
1387 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1388 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2
1389 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1390 %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
; v_fshr_v2i24: non-kernel function that hands %src0, %src1 and %src2, in that
; order, to the llvm.fshr.v2i24 intrinsic (the only IR statement visible here).
; The prefixed comment lines that follow are the expected llc output for each
; target configured in the file header; that header states they are
; autogenerated by utils/update_llc_test_checks.py, so regenerate them instead
; of editing them by hand.
; What the expectations show: each of the two elements gets the scalar i24
; treatment independently. Its amount (v4 or v5) is masked to 24 bits,
; multiplied high by 0xaaaaaab, that product times 24 is subtracted from the
; amount, 8 is added, and one v_alignbit_b32 per element combines the first
; source with the second source shifted left by 8.
; NOTE(review): the mul_hi / mul-by-24 / subtract sequence appears to be an
; unsigned remainder by 24 (0xaaaaaab is about 2^32 / 24) - confirm against the
; intrinsic's documented shift-amount semantics.
1394 define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
1395 ; SI-LABEL: v_fshr_v2i24:
1397 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1398 ; SI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
1399 ; SI-NEXT: s_mov_b32 s4, 0xaaaaaab
1400 ; SI-NEXT: v_mul_hi_u32 v6, v6, s4
1401 ; SI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
1402 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1403 ; SI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
1404 ; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
1405 ; SI-NEXT: v_mul_hi_u32 v6, v7, s4
1406 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4
1407 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
1408 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
1409 ; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v6
1410 ; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v3
1411 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3
1412 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3
1413 ; SI-NEXT: s_setpc_b64 s[30:31]
1415 ; VI-LABEL: v_fshr_v2i24:
1417 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1418 ; VI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
1419 ; VI-NEXT: s_mov_b32 s4, 0xaaaaaab
1420 ; VI-NEXT: v_mul_hi_u32 v6, v6, s4
1421 ; VI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
1422 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1423 ; VI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
1424 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
1425 ; VI-NEXT: v_mul_hi_u32 v6, v7, s4
1426 ; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4
1427 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4
1428 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
1429 ; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v6
1430 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v3
1431 ; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3
1432 ; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3
1433 ; VI-NEXT: s_setpc_b64 s[30:31]
1435 ; GFX9-LABEL: v_fshr_v2i24:
1437 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1438 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v4
1439 ; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab
1440 ; GFX9-NEXT: v_mul_hi_u32 v6, v6, s4
1441 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v5
1442 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1443 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, 24, v6
1444 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
1445 ; GFX9-NEXT: v_mul_hi_u32 v6, v7, s4
1446 ; GFX9-NEXT: v_add_u32_e32 v4, 8, v4
1447 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
1448 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3
1449 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v6
1450 ; GFX9-NEXT: v_sub_u32_e32 v3, v5, v3
1451 ; GFX9-NEXT: v_add_u32_e32 v3, 8, v3
1452 ; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3
1453 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1455 ; R600-LABEL: v_fshr_v2i24:
1460 ; GFX10-LABEL: v_fshr_v2i24:
1462 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1463 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4
1464 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5
1465 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1466 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
1467 ; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
1468 ; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
1469 ; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6
1470 ; GFX10-NEXT: v_mul_u32_u24_e32 v7, 24, v7
1471 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6
1472 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7
1473 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 8, v4
1474 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 8, v5
1475 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
1476 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
1477 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1479 ; GFX11-LABEL: v_fshr_v2i24:
1481 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1482 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v4
1483 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v5
1484 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1485 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
1486 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1487 ; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
1488 ; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
1489 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1490 ; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6
1491 ; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7
1492 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1493 ; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6
1494 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7
1495 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1496 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4
1497 ; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5
1498 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1499 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4
1500 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5
1501 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1502 %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)