1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI
3 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI
4 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
5 ; RUN: llc < %s -mtriple=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600
6 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
7 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11
; Declarations of the llvm.fshr (funnel shift right) intrinsic overloads used by
; the tests in this file: 32-bit, 16-bit, 64-bit and 24-bit element types in
; scalar and vector forms. Each overload takes three operands of the same type.
9 declare i32 @llvm.fshr.i32(i32, i32, i32)
10 declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
11 declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
12 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
13 declare i16 @llvm.fshr.i16(i16, i16, i16)
14 declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
15 declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
16 declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
17 declare i64 @llvm.fshr.i64(i64, i64, i64)
18 declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
19 declare i24 @llvm.fshr.i24(i24, i24, i24)
20 declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
; Kernel test: scalar i32 funnel shift right, llvm.fshr.i32(%x, %y, %z), with a
; variable shift amount; the result is stored through %in.
; The GCN check blocks below expect a single v_alignbit_b32 and the R600 block
; expects a single BIT_ALIGN_INT, with %x, %y, %z as its three source operands.
22 define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) {
24 ; SI: ; %bb.0: ; %entry
25 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
26 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
27 ; SI-NEXT: s_mov_b32 s7, 0xf000
28 ; SI-NEXT: s_mov_b32 s6, -1
29 ; SI-NEXT: s_waitcnt lgkmcnt(0)
30 ; SI-NEXT: v_mov_b32_e32 v0, s1
31 ; SI-NEXT: v_mov_b32_e32 v1, s2
32 ; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1
33 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
37 ; VI: ; %bb.0: ; %entry
38 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
39 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
40 ; VI-NEXT: s_waitcnt lgkmcnt(0)
41 ; VI-NEXT: v_mov_b32_e32 v0, s1
42 ; VI-NEXT: v_mov_b32_e32 v1, s2
43 ; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1
44 ; VI-NEXT: v_mov_b32_e32 v0, s4
45 ; VI-NEXT: v_mov_b32_e32 v1, s5
46 ; VI-NEXT: flat_store_dword v[0:1], v2
49 ; GFX9-LABEL: fshr_i32:
50 ; GFX9: ; %bb.0: ; %entry
51 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
52 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
53 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
54 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
55 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
56 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
57 ; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2
58 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
61 ; R600-LABEL: fshr_i32:
62 ; R600: ; %bb.0: ; %entry
63 ; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
64 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
67 ; R600-NEXT: ALU clause starting at 4:
68 ; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
69 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
70 ; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
72 ; GFX10-LABEL: fshr_i32:
73 ; GFX10: ; %bb.0: ; %entry
74 ; GFX10-NEXT: s_clause 0x1
75 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
76 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
77 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
78 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
79 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
80 ; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0
81 ; GFX10-NEXT: global_store_dword v1, v0, s[6:7]
82 ; GFX10-NEXT: s_endpgm
84 ; GFX11-LABEL: fshr_i32:
85 ; GFX11: ; %bb.0: ; %entry
86 ; GFX11-NEXT: s_clause 0x1
87 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
88 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
89 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
90 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
91 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
92 ; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0
93 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
94 ; GFX11-NEXT: s_endpgm
96 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
97 store i32 %0, ptr addrspace(1) %in
; Kernel test: scalar i32 funnel shift right of %x and %y by the constant 7;
; the result is stored through %in.
; In the GCN check blocks the constant appears directly as the last operand of
; v_alignbit_b32; in the R600 block it is supplied to BIT_ALIGN_INT as a literal.
101 define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
102 ; SI-LABEL: fshr_i32_imm:
103 ; SI: ; %bb.0: ; %entry
104 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
105 ; SI-NEXT: s_mov_b32 s7, 0xf000
106 ; SI-NEXT: s_mov_b32 s6, -1
107 ; SI-NEXT: s_waitcnt lgkmcnt(0)
108 ; SI-NEXT: v_mov_b32_e32 v0, s3
109 ; SI-NEXT: s_mov_b32 s4, s0
110 ; SI-NEXT: s_mov_b32 s5, s1
111 ; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7
112 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
115 ; VI-LABEL: fshr_i32_imm:
116 ; VI: ; %bb.0: ; %entry
117 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
118 ; VI-NEXT: s_waitcnt lgkmcnt(0)
119 ; VI-NEXT: v_mov_b32_e32 v0, s3
120 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7
121 ; VI-NEXT: v_mov_b32_e32 v0, s0
122 ; VI-NEXT: v_mov_b32_e32 v1, s1
123 ; VI-NEXT: flat_store_dword v[0:1], v2
126 ; GFX9-LABEL: fshr_i32_imm:
127 ; GFX9: ; %bb.0: ; %entry
128 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
129 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
130 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
131 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
132 ; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7
133 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
134 ; GFX9-NEXT: s_endpgm
136 ; R600-LABEL: fshr_i32_imm:
137 ; R600: ; %bb.0: ; %entry
138 ; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
139 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
142 ; R600-NEXT: ALU clause starting at 4:
143 ; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
144 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
145 ; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
146 ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
148 ; GFX10-LABEL: fshr_i32_imm:
149 ; GFX10: ; %bb.0: ; %entry
150 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
151 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
152 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
153 ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7
154 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
155 ; GFX10-NEXT: s_endpgm
157 ; GFX11-LABEL: fshr_i32_imm:
158 ; GFX11: ; %bb.0: ; %entry
159 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
160 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
161 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
162 ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7
163 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
164 ; GFX11-NEXT: s_endpgm
166 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
167 store i32 %0, ptr addrspace(1) %in
; Kernel test: <2 x i32> funnel shift right with a variable per-lane amount
; (%z); the vector result is stored through %in.
; The check blocks expect one v_alignbit_b32 (BIT_ALIGN_INT on R600) per lane,
; followed by a single two-dword store.
171 define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
172 ; SI-LABEL: fshr_v2i32:
173 ; SI: ; %bb.0: ; %entry
174 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
175 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf
176 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
177 ; SI-NEXT: s_mov_b32 s7, 0xf000
178 ; SI-NEXT: s_mov_b32 s6, -1
179 ; SI-NEXT: s_waitcnt lgkmcnt(0)
180 ; SI-NEXT: v_mov_b32_e32 v0, s3
181 ; SI-NEXT: v_mov_b32_e32 v1, s9
182 ; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1
183 ; SI-NEXT: v_mov_b32_e32 v0, s2
184 ; SI-NEXT: v_mov_b32_e32 v2, s8
185 ; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2
186 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
189 ; VI-LABEL: fshr_v2i32:
190 ; VI: ; %bb.0: ; %entry
191 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
192 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
193 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
194 ; VI-NEXT: s_waitcnt lgkmcnt(0)
195 ; VI-NEXT: v_mov_b32_e32 v0, s3
196 ; VI-NEXT: v_mov_b32_e32 v1, s7
197 ; VI-NEXT: v_mov_b32_e32 v2, s2
198 ; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1
199 ; VI-NEXT: v_mov_b32_e32 v0, s6
200 ; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0
201 ; VI-NEXT: v_mov_b32_e32 v2, s4
202 ; VI-NEXT: v_mov_b32_e32 v3, s5
203 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
206 ; GFX9-LABEL: fshr_v2i32:
207 ; GFX9: ; %bb.0: ; %entry
208 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
209 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
210 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
211 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
212 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
213 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
214 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
215 ; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1
216 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
217 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
218 ; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3
219 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
220 ; GFX9-NEXT: s_endpgm
222 ; R600-LABEL: fshr_v2i32:
223 ; R600: ; %bb.0: ; %entry
224 ; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
225 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
228 ; R600-NEXT: ALU clause starting at 4:
229 ; R600-NEXT: MOV * T0.W, KC0[4].X,
230 ; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
231 ; R600-NEXT: MOV * T0.W, KC0[3].W,
232 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
233 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
234 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
236 ; GFX10-LABEL: fshr_v2i32:
237 ; GFX10: ; %bb.0: ; %entry
238 ; GFX10-NEXT: s_clause 0x2
239 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
240 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
241 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
242 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
243 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
244 ; GFX10-NEXT: v_mov_b32_e32 v0, s7
245 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
246 ; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0
247 ; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2
248 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9]
249 ; GFX10-NEXT: s_endpgm
251 ; GFX11-LABEL: fshr_v2i32:
252 ; GFX11: ; %bb.0: ; %entry
253 ; GFX11-NEXT: s_clause 0x2
254 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
255 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
256 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
257 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
258 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
259 ; GFX11-NEXT: v_mov_b32_e32 v2, s6
260 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
261 ; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, v0
262 ; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, v2
263 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
264 ; GFX11-NEXT: s_endpgm
266 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
267 store <2 x i32> %0, ptr addrspace(1) %in
; Kernel test: <2 x i32> funnel shift right by the constant vector <7, 9>;
; the vector result is stored through %in.
; In the check blocks the lane-0 result (v0 / T0.X) uses amount 7 and the
; lane-1 result (v1 / T0.Y) uses amount 9.
271 define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
272 ; SI-LABEL: fshr_v2i32_imm:
273 ; SI: ; %bb.0: ; %entry
274 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
275 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
276 ; SI-NEXT: s_mov_b32 s7, 0xf000
277 ; SI-NEXT: s_mov_b32 s6, -1
278 ; SI-NEXT: s_waitcnt lgkmcnt(0)
279 ; SI-NEXT: v_mov_b32_e32 v0, s3
280 ; SI-NEXT: v_mov_b32_e32 v2, s2
281 ; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9
282 ; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7
283 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
286 ; VI-LABEL: fshr_v2i32_imm:
287 ; VI: ; %bb.0: ; %entry
288 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
289 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
290 ; VI-NEXT: s_waitcnt lgkmcnt(0)
291 ; VI-NEXT: v_mov_b32_e32 v0, s3
292 ; VI-NEXT: v_mov_b32_e32 v2, s2
293 ; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9
294 ; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7
295 ; VI-NEXT: v_mov_b32_e32 v2, s4
296 ; VI-NEXT: v_mov_b32_e32 v3, s5
297 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
300 ; GFX9-LABEL: fshr_v2i32_imm:
301 ; GFX9: ; %bb.0: ; %entry
302 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
303 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
304 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
305 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
306 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
307 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
308 ; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9
309 ; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7
310 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
311 ; GFX9-NEXT: s_endpgm
313 ; R600-LABEL: fshr_v2i32_imm:
314 ; R600: ; %bb.0: ; %entry
315 ; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
316 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
319 ; R600-NEXT: ALU clause starting at 4:
320 ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
321 ; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00)
322 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
323 ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
324 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
325 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
327 ; GFX10-LABEL: fshr_v2i32_imm:
328 ; GFX10: ; %bb.0: ; %entry
329 ; GFX10-NEXT: s_clause 0x1
330 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
331 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
332 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
333 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
334 ; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9
335 ; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7
336 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
337 ; GFX10-NEXT: s_endpgm
339 ; GFX11-LABEL: fshr_v2i32_imm:
340 ; GFX11: ; %bb.0: ; %entry
341 ; GFX11-NEXT: s_clause 0x1
342 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
343 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
344 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
345 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
346 ; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9
347 ; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7
348 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
349 ; GFX11-NEXT: s_endpgm
351 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
352 store <2 x i32> %0, ptr addrspace(1) %in
; Kernel test: <4 x i32> funnel shift right with a variable per-lane amount
; (%z); the vector result is stored through %in.
; The check blocks expect four v_alignbit_b32 (BIT_ALIGN_INT on R600), one per
; lane, followed by a single four-dword store.
356 define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
357 ; SI-LABEL: fshr_v4i32:
358 ; SI: ; %bb.0: ; %entry
359 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
360 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15
361 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
362 ; SI-NEXT: s_mov_b32 s7, 0xf000
363 ; SI-NEXT: s_mov_b32 s6, -1
364 ; SI-NEXT: s_waitcnt lgkmcnt(0)
365 ; SI-NEXT: v_mov_b32_e32 v0, s15
366 ; SI-NEXT: v_mov_b32_e32 v1, s3
367 ; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1
368 ; SI-NEXT: v_mov_b32_e32 v0, s14
369 ; SI-NEXT: v_mov_b32_e32 v1, s2
370 ; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1
371 ; SI-NEXT: v_mov_b32_e32 v0, s13
372 ; SI-NEXT: v_mov_b32_e32 v1, s1
373 ; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1
374 ; SI-NEXT: v_mov_b32_e32 v0, s12
375 ; SI-NEXT: v_mov_b32_e32 v4, s0
376 ; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4
377 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
380 ; VI-LABEL: fshr_v4i32:
381 ; VI: ; %bb.0: ; %entry
382 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
383 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
384 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
385 ; VI-NEXT: s_waitcnt lgkmcnt(0)
386 ; VI-NEXT: v_mov_b32_e32 v0, s15
387 ; VI-NEXT: v_mov_b32_e32 v1, s3
388 ; VI-NEXT: v_mov_b32_e32 v2, s14
389 ; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1
390 ; VI-NEXT: v_mov_b32_e32 v0, s2
391 ; VI-NEXT: v_alignbit_b32 v2, s10, v2, v0
392 ; VI-NEXT: v_mov_b32_e32 v0, s13
393 ; VI-NEXT: v_mov_b32_e32 v1, s1
394 ; VI-NEXT: v_alignbit_b32 v1, s9, v0, v1
395 ; VI-NEXT: v_mov_b32_e32 v0, s12
396 ; VI-NEXT: v_mov_b32_e32 v4, s0
397 ; VI-NEXT: v_alignbit_b32 v0, s8, v0, v4
398 ; VI-NEXT: v_mov_b32_e32 v4, s4
399 ; VI-NEXT: v_mov_b32_e32 v5, s5
400 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
403 ; GFX9-LABEL: fshr_v4i32:
404 ; GFX9: ; %bb.0: ; %entry
405 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
406 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
407 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
408 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
409 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
410 ; GFX9-NEXT: v_mov_b32_e32 v0, s15
411 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
412 ; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1
413 ; GFX9-NEXT: v_mov_b32_e32 v0, s14
414 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
415 ; GFX9-NEXT: v_alignbit_b32 v2, s10, v0, v1
416 ; GFX9-NEXT: v_mov_b32_e32 v0, s13
417 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
418 ; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, v1
419 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
420 ; GFX9-NEXT: v_mov_b32_e32 v5, s0
421 ; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, v5
422 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
423 ; GFX9-NEXT: s_endpgm
425 ; R600-LABEL: fshr_v4i32:
426 ; R600: ; %bb.0: ; %entry
427 ; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
428 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
431 ; R600-NEXT: ALU clause starting at 4:
432 ; R600-NEXT: MOV * T0.W, KC0[6].X,
433 ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
434 ; R600-NEXT: MOV * T1.W, KC0[5].W,
435 ; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
436 ; R600-NEXT: MOV * T1.W, KC0[5].Z,
437 ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
438 ; R600-NEXT: MOV * T1.W, KC0[5].Y,
439 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
440 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
441 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
443 ; GFX10-LABEL: fshr_v4i32:
444 ; GFX10: ; %bb.0: ; %entry
445 ; GFX10-NEXT: s_clause 0x2
446 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
447 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
448 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
449 ; GFX10-NEXT: v_mov_b32_e32 v6, 0
450 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
451 ; GFX10-NEXT: v_mov_b32_e32 v0, s3
452 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
453 ; GFX10-NEXT: v_mov_b32_e32 v4, s1
454 ; GFX10-NEXT: v_mov_b32_e32 v5, s0
455 ; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, v0
456 ; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, v1
457 ; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, v4
458 ; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, v5
459 ; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7]
460 ; GFX10-NEXT: s_endpgm
462 ; GFX11-LABEL: fshr_v4i32:
463 ; GFX11: ; %bb.0: ; %entry
464 ; GFX11-NEXT: s_clause 0x2
465 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
466 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
467 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
468 ; GFX11-NEXT: v_mov_b32_e32 v6, 0
469 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
470 ; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
471 ; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
472 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
473 ; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, v0
474 ; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, v1
475 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
476 ; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, v4
477 ; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, v5
478 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[4:5]
479 ; GFX11-NEXT: s_endpgm
481 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
482 store <4 x i32> %0, ptr addrspace(1) %in
; Kernel test: <4 x i32> funnel shift right by the constant vector
; <1, 7, 9, 33>; the vector result is stored through %in.
; In the check blocks lanes 0..2 use amounts 1, 7 and 9, and the lane-3 result
; (v3 / T0.W) uses amount 1 rather than 33, i.e. the out-of-range constant is
; expected to be reduced modulo the 32-bit element width.
486 define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) {
487 ; SI-LABEL: fshr_v4i32_imm:
488 ; SI: ; %bb.0: ; %entry
489 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
490 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
491 ; SI-NEXT: s_mov_b32 s3, 0xf000
492 ; SI-NEXT: s_mov_b32 s2, -1
493 ; SI-NEXT: s_waitcnt lgkmcnt(0)
494 ; SI-NEXT: v_mov_b32_e32 v0, s15
495 ; SI-NEXT: v_mov_b32_e32 v1, s14
496 ; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1
497 ; SI-NEXT: v_mov_b32_e32 v0, s13
498 ; SI-NEXT: v_alignbit_b32 v2, s10, v1, 9
499 ; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7
500 ; SI-NEXT: v_mov_b32_e32 v0, s12
501 ; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1
502 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
505 ; VI-LABEL: fshr_v4i32_imm:
506 ; VI: ; %bb.0: ; %entry
507 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
508 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
509 ; VI-NEXT: s_waitcnt lgkmcnt(0)
510 ; VI-NEXT: v_mov_b32_e32 v0, s15
511 ; VI-NEXT: v_mov_b32_e32 v1, s14
512 ; VI-NEXT: v_mov_b32_e32 v4, s13
513 ; VI-NEXT: v_alignbit_b32 v3, s11, v0, 1
514 ; VI-NEXT: v_alignbit_b32 v2, s10, v1, 9
515 ; VI-NEXT: v_alignbit_b32 v1, s9, v4, 7
516 ; VI-NEXT: v_mov_b32_e32 v0, s12
517 ; VI-NEXT: v_mov_b32_e32 v5, s1
518 ; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1
519 ; VI-NEXT: v_mov_b32_e32 v4, s0
520 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
523 ; GFX9-LABEL: fshr_v4i32_imm:
524 ; GFX9: ; %bb.0: ; %entry
525 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
526 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
527 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
528 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
529 ; GFX9-NEXT: v_mov_b32_e32 v0, s15
530 ; GFX9-NEXT: v_mov_b32_e32 v1, s14
531 ; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 1
532 ; GFX9-NEXT: v_mov_b32_e32 v0, s13
533 ; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 9
534 ; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 7
535 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
536 ; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1
537 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
538 ; GFX9-NEXT: s_endpgm
540 ; R600-LABEL: fshr_v4i32_imm:
541 ; R600: ; %bb.0: ; %entry
542 ; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
543 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
546 ; R600-NEXT: ALU clause starting at 4:
547 ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
548 ; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
549 ; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00)
550 ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
551 ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
552 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
553 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
554 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
556 ; GFX10-LABEL: fshr_v4i32_imm:
557 ; GFX10: ; %bb.0: ; %entry
558 ; GFX10-NEXT: s_clause 0x1
559 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
560 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
561 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
562 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
563 ; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 1
564 ; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 9
565 ; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 7
566 ; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 1
567 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
568 ; GFX10-NEXT: s_endpgm
570 ; GFX11-LABEL: fshr_v4i32_imm:
571 ; GFX11: ; %bb.0: ; %entry
572 ; GFX11-NEXT: s_clause 0x1
573 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
574 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
575 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
576 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
577 ; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 1
578 ; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 9
579 ; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 7
580 ; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 1
581 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
582 ; GFX11-NEXT: s_endpgm
584 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
585 store <4 x i32> %0, ptr addrspace(1) %in
; Non-kernel function test: scalar i32 funnel shift right of %src0 and %src1 by
; %src2, with the result produced as %ret.
; The GCN check blocks expect a single v_alignbit_b32 on v0, v1, v2 followed by
; the s_setpc_b64 return.
589 define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
590 ; GFX89-LABEL: v_fshr_i32:
592 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
593 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v1, v2
594 ; GFX89-NEXT: s_setpc_b64 s[30:31]
596 ; R600-LABEL: v_fshr_i32:
601 ; GFX10-LABEL: v_fshr_i32:
603 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
604 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2
605 ; GFX10-NEXT: s_setpc_b64 s[30:31]
607 ; GFX11-LABEL: v_fshr_i32:
609 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
610 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2
611 ; GFX11-NEXT: s_setpc_b64 s[30:31]
612 %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
; Non-kernel function test: <2 x i32> funnel shift right with a variable
; per-lane amount (%src2).
; The GCN check blocks expect two v_alignbit_b32, one per lane, pairing
; v0/v2/v4 for lane 0 and v1/v3/v5 for lane 1.
616 define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
617 ; GFX89-LABEL: v_fshr_v2i32:
619 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v2, v4
621 ; GFX89-NEXT: v_alignbit_b32 v1, v1, v3, v5
622 ; GFX89-NEXT: s_setpc_b64 s[30:31]
624 ; R600-LABEL: v_fshr_v2i32:
629 ; GFX10-LABEL: v_fshr_v2i32:
631 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
632 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
633 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
634 ; GFX10-NEXT: s_setpc_b64 s[30:31]
636 ; GFX11-LABEL: v_fshr_v2i32:
638 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
639 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4
640 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5
641 ; GFX11-NEXT: s_setpc_b64 s[30:31]
642 %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
; Non-kernel function test: <3 x i32> funnel shift right with a variable
; per-lane amount (%src2).
; The GCN check blocks expect exactly three v_alignbit_b32, one per lane
; (no fourth, padding-lane instruction appears in the expected output).
646 define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
647 ; GFX89-LABEL: v_fshr_v3i32:
649 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
650 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v3, v6
651 ; GFX89-NEXT: v_alignbit_b32 v1, v1, v4, v7
652 ; GFX89-NEXT: v_alignbit_b32 v2, v2, v5, v8
653 ; GFX89-NEXT: s_setpc_b64 s[30:31]
655 ; R600-LABEL: v_fshr_v3i32:
660 ; GFX10-LABEL: v_fshr_v3i32:
662 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
663 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6
664 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7
665 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8
666 ; GFX10-NEXT: s_setpc_b64 s[30:31]
668 ; GFX11-LABEL: v_fshr_v3i32:
670 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
671 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6
672 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7
673 ; GFX11-NEXT: v_alignbit_b32 v2, v2, v5, v8
674 ; GFX11-NEXT: s_setpc_b64 s[30:31]
675 %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
; Non-kernel function test: <4 x i32> funnel shift right with a variable
; per-lane amount (%src2).
; The GCN check blocks expect four v_alignbit_b32, one per lane, writing the
; results to v0..v3.
679 define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
680 ; GFX89-LABEL: v_fshr_v4i32:
682 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
683 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v4, v8
684 ; GFX89-NEXT: v_alignbit_b32 v1, v1, v5, v9
685 ; GFX89-NEXT: v_alignbit_b32 v2, v2, v6, v10
686 ; GFX89-NEXT: v_alignbit_b32 v3, v3, v7, v11
687 ; GFX89-NEXT: s_setpc_b64 s[30:31]
689 ; R600-LABEL: v_fshr_v4i32:
694 ; GFX10-LABEL: v_fshr_v4i32:
696 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8
698 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9
699 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10
700 ; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11
701 ; GFX10-NEXT: s_setpc_b64 s[30:31]
703 ; GFX11-LABEL: v_fshr_v4i32:
705 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
706 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8
707 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9
708 ; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10
709 ; GFX11-NEXT: v_alignbit_b32 v3, v3, v7, v11
710 ; GFX11-NEXT: s_setpc_b64 s[30:31]
711 %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
; Non-kernel function test: scalar i16 funnel shift right of %src0 and %src1 by
; %src2.
; Expected lowering differs per target in the check blocks below:
;  - tahiti block: the amount is OR'ed with 16 and %src1 is shifted left by 16,
;    then a single 32-bit v_alignbit_b32 is used.
;  - tonga and newer blocks: 16-bit shifts are used instead: %src0 is shifted
;    left by 1 and then by the inverted amount (xor with -1), %src1 is shifted
;    right by the amount, and the two halves are OR'ed together.
715 define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
716 ; SI-LABEL: v_fshr_i16:
718 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
719 ; SI-NEXT: v_or_b32_e32 v2, 16, v2
720 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
721 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
722 ; SI-NEXT: s_setpc_b64 s[30:31]
724 ; VI-LABEL: v_fshr_i16:
726 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
727 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
728 ; VI-NEXT: v_xor_b32_e32 v3, -1, v2
729 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
730 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
731 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
732 ; VI-NEXT: s_setpc_b64 s[30:31]
734 ; GFX9-LABEL: v_fshr_i16:
736 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
737 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
738 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
739 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
740 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
741 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
742 ; GFX9-NEXT: s_setpc_b64 s[30:31]
744 ; R600-LABEL: v_fshr_i16:
749 ; GFX10-LABEL: v_fshr_i16:
751 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
753 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
754 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
755 ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
756 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
757 ; GFX10-NEXT: s_setpc_b64 s[30:31]
759 ; GFX11-LABEL: v_fshr_i16:
761 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
762 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
763 ; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2
764 ; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1
765 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
766 ; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0
767 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
768 ; GFX11-NEXT: s_setpc_b64 s[30:31]
769 %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
; Non-kernel function test: <2 x i16> funnel shift right with a variable
; per-lane amount (%src2).
; Expected lowering differs per target in the check blocks below:
;  - tahiti block: each lane is widened as in the scalar i16 case (amount OR 16,
;    %src1 shifted left by 16, v_alignbit_b32), then the two 16-bit results are
;    masked with 0xffff and recombined.
;  - tonga block: SDWA forms select the high halves (WORD_1) for lane 1, plain
;    16-bit shifts handle lane 0, and a final SDWA OR merges both halves.
;  - gfx900 and newer blocks: packed v_pk_lshlrev_b16 / v_pk_lshrrev_b16 handle
;    both lanes at once, with the amount and its inverse masked by 0xf000f.
773 define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
774 ; SI-LABEL: v_fshr_v2i16:
776 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
777 ; SI-NEXT: v_or_b32_e32 v5, 16, v5
778 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
779 ; SI-NEXT: v_or_b32_e32 v4, 16, v4
780 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
781 ; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5
782 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
783 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
784 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
785 ; SI-NEXT: v_or_b32_e32 v0, v0, v3
786 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
787 ; SI-NEXT: s_setpc_b64 s[30:31]
789 ; VI-LABEL: v_fshr_v2i16:
791 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
792 ; VI-NEXT: v_mov_b32_e32 v4, 1
793 ; VI-NEXT: v_mov_b32_e32 v5, -1
794 ; VI-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
795 ; VI-NEXT: v_xor_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
796 ; VI-NEXT: v_lshrrev_b16_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
797 ; VI-NEXT: v_lshlrev_b16_e32 v4, v5, v4
798 ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
799 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
800 ; VI-NEXT: v_xor_b32_e32 v4, -1, v2
801 ; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0
802 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
803 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
804 ; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
805 ; VI-NEXT: s_setpc_b64 s[30:31]
807 ; GFX9-LABEL: v_fshr_v2i16:
809 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
810 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
811 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
812 ; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v3
813 ; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2
814 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0
815 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1
816 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
817 ; GFX9-NEXT: s_setpc_b64 s[30:31]
819 ; R600-LABEL: v_fshr_v2i16:
824 ; GFX10-LABEL: v_fshr_v2i16:
826 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
827 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
828 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
829 ; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2
830 ; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3
831 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1
832 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0
833 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
834 ; GFX10-NEXT: s_setpc_b64 s[30:31]
836 ; GFX11-LABEL: v_fshr_v2i16:
838 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
839 ; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2
840 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
841 ; GFX11-NEXT: v_and_b32_e32 v2, 0xf000f, v2
842 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
843 ; GFX11-NEXT: v_and_b32_e32 v3, 0xf000f, v3
844 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, v2, v1
845 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
846 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v3, v0
847 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
848 ; GFX11-NEXT: s_setpc_b64 s[30:31]
849 %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
; v_fshr_v3i16: funnel-shift-right (llvm.fshr) of <3 x i16> operands with a
; variable, per-lane shift amount, taken as plain function arguments.
; What the expectations below show per target:
;   - SI: each lane ORs 16 into the amount, shifts src1 left by 16 and uses
;     v_alignbit_b32; the lanes are then repacked with and/or/alignbit.
;   - VI and GFX9: 16-bit shifts, using SDWA forms for the high halves; GFX9
;     packs the low dword with v_perm_b32 and selector 0x5040100.
;   - GFX10 and GFX11: high halves are extracted with a 32-bit right shift by
;     16 and the result is packed with v_perm_b32 (GFX11 adds s_delay_alu).
;   - R600: only the label line is visible in this part of the file.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than by hand.
853 define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
854 ; SI-LABEL: v_fshr_v3i16:
856 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
857 ; SI-NEXT: v_or_b32_e32 v7, 16, v7
858 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
859 ; SI-NEXT: v_alignbit_b32 v1, v1, v4, v7
860 ; SI-NEXT: v_or_b32_e32 v4, 16, v6
861 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
862 ; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4
863 ; SI-NEXT: v_or_b32_e32 v3, 16, v8
864 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
865 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
866 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
867 ; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3
868 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
869 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3
870 ; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
871 ; SI-NEXT: s_setpc_b64 s[30:31]
873 ; VI-LABEL: v_fshr_v3i16:
875 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
876 ; VI-NEXT: v_mov_b32_e32 v7, 1
877 ; VI-NEXT: v_mov_b32_e32 v8, -1
878 ; VI-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
879 ; VI-NEXT: v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
880 ; VI-NEXT: v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
881 ; VI-NEXT: v_lshlrev_b16_e32 v7, v8, v7
882 ; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
883 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
884 ; VI-NEXT: v_xor_b32_e32 v7, -1, v5
885 ; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1
886 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
887 ; VI-NEXT: v_or_b32_e32 v1, v1, v3
888 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
889 ; VI-NEXT: v_xor_b32_e32 v3, -1, v4
890 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
891 ; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2
892 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
893 ; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
894 ; VI-NEXT: s_setpc_b64 s[30:31]
896 ; GFX9-LABEL: v_fshr_v3i16:
898 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
899 ; GFX9-NEXT: v_mov_b32_e32 v7, 1
900 ; GFX9-NEXT: v_mov_b32_e32 v8, -1
901 ; GFX9-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
902 ; GFX9-NEXT: v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
903 ; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
904 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, v8, v7
905 ; GFX9-NEXT: v_or_b32_e32 v6, v7, v6
906 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
907 ; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5
908 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1
909 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
910 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
911 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
912 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
913 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
914 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2
915 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
916 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
917 ; GFX9-NEXT: v_perm_b32 v0, v6, v0, s4
918 ; GFX9-NEXT: s_setpc_b64 s[30:31]
920 ; R600-LABEL: v_fshr_v3i16:
925 ; GFX10-LABEL: v_fshr_v3i16:
927 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
928 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
929 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4
930 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2
931 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
932 ; GFX10-NEXT: v_xor_b32_e32 v10, -1, v4
933 ; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6
934 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7
935 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
936 ; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8
937 ; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0
938 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
939 ; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6
940 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5
941 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
942 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
943 ; GFX10-NEXT: v_or_b32_e32 v5, v6, v7
944 ; GFX10-NEXT: v_lshlrev_b16 v1, v4, v1
945 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
946 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
947 ; GFX10-NEXT: s_setpc_b64 s[30:31]
949 ; GFX11-LABEL: v_fshr_v3i16:
951 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
952 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0
953 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4
954 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
955 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
956 ; GFX11-NEXT: v_xor_b32_e32 v10, -1, v4
957 ; GFX11-NEXT: v_lshlrev_b16 v6, 1, v6
958 ; GFX11-NEXT: v_xor_b32_e32 v9, -1, v7
959 ; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1
960 ; GFX11-NEXT: v_lshrrev_b16 v7, v7, v8
961 ; GFX11-NEXT: v_lshlrev_b16 v0, v10, v0
962 ; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2
963 ; GFX11-NEXT: v_lshlrev_b16 v6, v9, v6
964 ; GFX11-NEXT: v_xor_b32_e32 v4, -1, v5
965 ; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3
966 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
967 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
968 ; GFX11-NEXT: v_or_b32_e32 v5, v6, v7
969 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
970 ; GFX11-NEXT: v_lshlrev_b16 v1, v4, v1
971 ; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
972 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
973 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
974 ; GFX11-NEXT: s_setpc_b64 s[30:31]
975 %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
; v_fshr_v4i16: funnel-shift-right (llvm.fshr) of <4 x i16> operands with a
; variable, per-lane shift amount, taken as plain function arguments.
; What the expectations below show per target:
;   - SI: one v_alignbit_b32 per lane (amount ORed with 16, src1 shifted left
;     by 16), followed by and/or/alignbit repacking of the four results.
;   - VI: 16-bit shifts with SDWA forms for the high halves; the final ORs
;     write the high word via dst_sel:WORD_1 and merge via src0_sel:WORD_0.
;   - GFX9: same shift pattern, but both result dwords are packed with
;     v_perm_b32 using the selector 0x5040100 held in s4.
;   - GFX10 and GFX11: high halves extracted with 32-bit right shifts by 16,
;     results packed with v_perm_b32 and an inline 0x5040100 selector.
;   - R600: only the label line is visible in this part of the file.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than by hand.
979 define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
980 ; SI-LABEL: v_fshr_v4i16:
982 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
983 ; SI-NEXT: v_or_b32_e32 v9, 16, v9
984 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
985 ; SI-NEXT: v_alignbit_b32 v1, v1, v5, v9
986 ; SI-NEXT: v_or_b32_e32 v5, 16, v8
987 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
988 ; SI-NEXT: v_alignbit_b32 v0, v0, v4, v5
989 ; SI-NEXT: v_or_b32_e32 v4, 16, v11
990 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7
991 ; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4
992 ; SI-NEXT: v_or_b32_e32 v5, 16, v10
993 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
994 ; SI-NEXT: v_alignbit_b32 v2, v2, v6, v5
995 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3
996 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
997 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
998 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
999 ; SI-NEXT: v_or_b32_e32 v2, v2, v4
1000 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
1001 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
1002 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
1003 ; SI-NEXT: s_setpc_b64 s[30:31]
1005 ; VI-LABEL: v_fshr_v4i16:
1007 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1008 ; VI-NEXT: v_mov_b32_e32 v7, 1
1009 ; VI-NEXT: v_mov_b32_e32 v9, -1
1010 ; VI-NEXT: v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1011 ; VI-NEXT: v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1012 ; VI-NEXT: v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1013 ; VI-NEXT: v_lshlrev_b16_e32 v8, v10, v8
1014 ; VI-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1015 ; VI-NEXT: v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1016 ; VI-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1017 ; VI-NEXT: v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1018 ; VI-NEXT: v_lshlrev_b16_e32 v7, v9, v7
1019 ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1020 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
1021 ; VI-NEXT: v_xor_b32_e32 v8, -1, v5
1022 ; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1
1023 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
1024 ; VI-NEXT: v_or_b32_e32 v1, v1, v3
1025 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
1026 ; VI-NEXT: v_xor_b32_e32 v3, -1, v4
1027 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
1028 ; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2
1029 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
1030 ; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1031 ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1032 ; VI-NEXT: s_setpc_b64 s[30:31]
1034 ; GFX9-LABEL: v_fshr_v4i16:
1036 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1037 ; GFX9-NEXT: v_mov_b32_e32 v7, 1
1038 ; GFX9-NEXT: v_mov_b32_e32 v9, -1
1039 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1040 ; GFX9-NEXT: v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1041 ; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1042 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, v10, v8
1043 ; GFX9-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1044 ; GFX9-NEXT: v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1045 ; GFX9-NEXT: v_or_b32_e32 v6, v8, v6
1046 ; GFX9-NEXT: v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1047 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, v9, v7
1048 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
1049 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
1050 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5
1051 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1
1052 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
1053 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
1054 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
1055 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
1056 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
1057 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2
1058 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
1059 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
1060 ; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4
1061 ; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4
1062 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1064 ; R600-LABEL: v_fshr_v4i16:
1069 ; GFX10-LABEL: v_fshr_v4i16:
1071 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1072 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3
1073 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
1074 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
1075 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v0
1076 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4
1077 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2
1078 ; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6
1079 ; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8
1080 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7
1081 ; GFX10-NEXT: v_lshlrev_b16 v9, 1, v9
1082 ; GFX10-NEXT: v_xor_b32_e32 v12, -1, v10
1083 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
1084 ; GFX10-NEXT: v_xor_b32_e32 v13, -1, v5
1085 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
1086 ; GFX10-NEXT: v_xor_b32_e32 v14, -1, v4
1087 ; GFX10-NEXT: v_lshlrev_b16 v7, v7, v8
1088 ; GFX10-NEXT: v_lshrrev_b16 v8, v10, v11
1089 ; GFX10-NEXT: v_lshlrev_b16 v9, v12, v9
1090 ; GFX10-NEXT: v_lshlrev_b16 v1, v13, v1
1091 ; GFX10-NEXT: v_lshlrev_b16 v0, v14, v0
1092 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
1093 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
1094 ; GFX10-NEXT: v_or_b32_e32 v4, v7, v6
1095 ; GFX10-NEXT: v_or_b32_e32 v5, v9, v8
1096 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
1097 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
1098 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
1099 ; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
1100 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1102 ; GFX11-LABEL: v_fshr_v4i16:
1104 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1105 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
1106 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v5
1107 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1
1108 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0
1109 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v4
1110 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2
1111 ; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6
1112 ; GFX11-NEXT: v_lshlrev_b16 v8, 1, v8
1113 ; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7
1114 ; GFX11-NEXT: v_lshlrev_b16 v9, 1, v9
1115 ; GFX11-NEXT: v_xor_b32_e32 v12, -1, v10
1116 ; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1
1117 ; GFX11-NEXT: v_xor_b32_e32 v13, -1, v5
1118 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
1119 ; GFX11-NEXT: v_xor_b32_e32 v14, -1, v4
1120 ; GFX11-NEXT: v_lshlrev_b16 v7, v7, v8
1121 ; GFX11-NEXT: v_lshrrev_b16 v8, v10, v11
1122 ; GFX11-NEXT: v_lshlrev_b16 v9, v12, v9
1123 ; GFX11-NEXT: v_lshlrev_b16 v1, v13, v1
1124 ; GFX11-NEXT: v_lshlrev_b16 v0, v14, v0
1125 ; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2
1126 ; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3
1127 ; GFX11-NEXT: v_or_b32_e32 v4, v7, v6
1128 ; GFX11-NEXT: v_or_b32_e32 v5, v9, v8
1129 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1130 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
1131 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
1132 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1133 ; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
1134 ; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
1135 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1136 %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
; v_fshr_i64: funnel-shift-right (llvm.fshr) of scalar i64 operands with a
; variable shift amount, taken as plain function arguments.
; On every GCN target checked below the expected sequence is the same
; expansion: src0 is shifted left by 1 and then left again by the bitwise NOT
; of the amount, src1 is shifted right by the amount, and the two 64-bit
; values are ORed half by half. SI uses the v_lshl_b64/v_lshr_b64 spellings,
; the later targets the *rev_b64 spellings; GFX10 and GFX11 keep the inverted
; amount in a separate register (v5). R600 has only a label line visible here.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than by hand.
1140 define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
1141 ; SI-LABEL: v_fshr_i64:
1143 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1144 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
1145 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
1146 ; SI-NEXT: v_not_b32_e32 v4, v4
1147 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
1148 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
1149 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
1150 ; SI-NEXT: s_setpc_b64 s[30:31]
1152 ; VI-LABEL: v_fshr_i64:
1154 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1155 ; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1156 ; VI-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
1157 ; VI-NEXT: v_not_b32_e32 v4, v4
1158 ; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
1159 ; VI-NEXT: v_or_b32_e32 v1, v1, v3
1160 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
1161 ; VI-NEXT: s_setpc_b64 s[30:31]
1163 ; GFX9-LABEL: v_fshr_i64:
1165 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1167 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
1168 ; GFX9-NEXT: v_not_b32_e32 v4, v4
1169 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
1170 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
1171 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
1172 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1174 ; R600-LABEL: v_fshr_i64:
1179 ; GFX10-LABEL: v_fshr_i64:
1181 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1182 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1183 ; GFX10-NEXT: v_not_b32_e32 v5, v4
1184 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
1185 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
1186 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
1187 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
1188 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1190 ; GFX11-LABEL: v_fshr_i64:
1192 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1193 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1194 ; GFX11-NEXT: v_not_b32_e32 v5, v4
1195 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
1196 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1197 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
1198 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
1199 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1200 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
1201 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1202 %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
; v_fshr_v2i64: funnel-shift-right (llvm.fshr) of <2 x i64> operands with a
; variable, per-element shift amount, taken as plain function arguments.
; The expectations below show the scalar i64 expansion applied once per
; element: shift src0 left by 1 and then by the bitwise NOT of the amount,
; shift src1 right by the amount, OR the halves. The amounts are read from v8
; (element 0) and v10 (element 1). SI, VI and GFX9 interleave the two
; elements and reuse v8/v7 for the inverted amounts; GFX10 and GFX11 place the
; inverted amounts in v9/v11. R600 has only a label line visible here.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than by hand.
1206 define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
1207 ; SI-LABEL: v_fshr_v2i64:
1209 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1210 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
1211 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v8
1212 ; SI-NEXT: v_not_b32_e32 v8, v8
1213 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
1214 ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
1215 ; SI-NEXT: v_or_b32_e32 v1, v1, v5
1216 ; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v10
1217 ; SI-NEXT: v_not_b32_e32 v7, v10
1218 ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
1219 ; SI-NEXT: v_or_b32_e32 v0, v0, v4
1220 ; SI-NEXT: v_or_b32_e32 v3, v3, v6
1221 ; SI-NEXT: v_or_b32_e32 v2, v2, v5
1222 ; SI-NEXT: s_setpc_b64 s[30:31]
1224 ; VI-LABEL: v_fshr_v2i64:
1226 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1227 ; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1228 ; VI-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
1229 ; VI-NEXT: v_not_b32_e32 v8, v8
1230 ; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
1231 ; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
1232 ; VI-NEXT: v_or_b32_e32 v1, v1, v5
1233 ; VI-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7]
1234 ; VI-NEXT: v_not_b32_e32 v7, v10
1235 ; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
1236 ; VI-NEXT: v_or_b32_e32 v0, v0, v4
1237 ; VI-NEXT: v_or_b32_e32 v3, v3, v6
1238 ; VI-NEXT: v_or_b32_e32 v2, v2, v5
1239 ; VI-NEXT: s_setpc_b64 s[30:31]
1241 ; GFX9-LABEL: v_fshr_v2i64:
1243 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1244 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1245 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
1246 ; GFX9-NEXT: v_not_b32_e32 v8, v8
1247 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
1248 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
1249 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
1250 ; GFX9-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7]
1251 ; GFX9-NEXT: v_not_b32_e32 v7, v10
1252 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
1253 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
1254 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v6
1255 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v5
1256 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1258 ; R600-LABEL: v_fshr_v2i64:
1263 ; GFX10-LABEL: v_fshr_v2i64:
1265 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1266 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1267 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
1268 ; GFX10-NEXT: v_not_b32_e32 v9, v8
1269 ; GFX10-NEXT: v_not_b32_e32 v11, v10
1270 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
1271 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
1272 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
1273 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
1274 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
1275 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
1276 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
1277 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
1278 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1280 ; GFX11-LABEL: v_fshr_v2i64:
1282 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1283 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1284 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
1285 ; GFX11-NEXT: v_not_b32_e32 v9, v8
1286 ; GFX11-NEXT: v_not_b32_e32 v11, v10
1287 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
1288 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
1289 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1290 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
1291 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
1292 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1293 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v4
1294 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v5
1295 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
1296 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v6
1297 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v7
1298 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1299 %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
; v_fshr_i24: funnel-shift-right (llvm.fshr) of i24 operands, a width that is
; not a power of two, with a variable shift amount.
; The expected sequence is the same on every GCN target checked below:
;   1. mask the amount to 24 bits (and with 0xffffff);
;   2. v_mul_hi_u32 by 0xaaaaaab, then multiply by 24 and subtract from the
;      amount, i.e. amount - 24 * quotient (0xaaaaaab is 2^32/24 rounded up,
;      so this looks like the amount reduced modulo 24);
;   3. add 8 to the reduced amount and shift src1 left by 8;
;   4. a single v_alignbit_b32 of src0 and the shifted src1 produces the result.
; SI/VI/GFX9 hold the magic constant in s4, GFX10/GFX11 use it as a literal
; operand. R600 has only a label line visible in this part of the file.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than by hand.
1303 define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
1304 ; SI-LABEL: v_fshr_i24:
1306 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1307 ; SI-NEXT: v_and_b32_e32 v3, 0xffffff, v2
1308 ; SI-NEXT: s_mov_b32 s4, 0xaaaaaab
1309 ; SI-NEXT: v_mul_hi_u32 v3, v3, s4
1310 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1311 ; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v3
1312 ; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
1313 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2
1314 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
1315 ; SI-NEXT: s_setpc_b64 s[30:31]
1317 ; VI-LABEL: v_fshr_i24:
1319 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1320 ; VI-NEXT: v_and_b32_e32 v3, 0xffffff, v2
1321 ; VI-NEXT: s_mov_b32 s4, 0xaaaaaab
1322 ; VI-NEXT: v_mul_hi_u32 v3, v3, s4
1323 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1324 ; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v3
1325 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
1326 ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2
1327 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2
1328 ; VI-NEXT: s_setpc_b64 s[30:31]
1330 ; GFX9-LABEL: v_fshr_i24:
1332 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1333 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v2
1334 ; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab
1335 ; GFX9-NEXT: v_mul_hi_u32 v3, v3, s4
1336 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1337 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v3
1338 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3
1339 ; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
1340 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2
1341 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1343 ; R600-LABEL: v_fshr_i24:
1348 ; GFX10-LABEL: v_fshr_i24:
1350 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1351 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v2
1352 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1353 ; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3
1354 ; GFX10-NEXT: v_mul_u32_u24_e32 v3, 24, v3
1355 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3
1356 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2
1357 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2
1358 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1360 ; GFX11-LABEL: v_fshr_i24:
1362 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1363 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v2
1364 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1365 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1366 ; GFX11-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3
1367 ; GFX11-NEXT: v_mul_u32_u24_e32 v3, 24, v3
1368 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1369 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3
1370 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 8, v2
1371 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1372 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2
1373 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1374 %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
; v_fshr_v2i24: funnel-shift-right (llvm.fshr) of <2 x i24> operands with a
; variable, per-lane shift amount.
; The expectations below repeat the scalar i24 sequence once per lane: mask
; the amount to 24 bits, v_mul_hi_u32 by 0xaaaaaab, multiply by 24 and
; subtract (amount - 24 * quotient), add 8, shift src1 left by 8, then one
; v_alignbit_b32 per lane. Lane 0 reads its amount from v4 and lane 1 from v5.
; SI/VI/GFX9 share the constant through s4; GFX10/GFX11 use it as a literal
; operand and schedule the two lanes side by side. R600 has only a label line
; visible in this part of the file.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than by hand.
1378 define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
1379 ; SI-LABEL: v_fshr_v2i24:
1381 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1382 ; SI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
1383 ; SI-NEXT: s_mov_b32 s4, 0xaaaaaab
1384 ; SI-NEXT: v_mul_hi_u32 v6, v6, s4
1385 ; SI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
1386 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1387 ; SI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
1388 ; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
1389 ; SI-NEXT: v_mul_hi_u32 v6, v7, s4
1390 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4
1391 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
1392 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
1393 ; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v6
1394 ; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v3
1395 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3
1396 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3
1397 ; SI-NEXT: s_setpc_b64 s[30:31]
1399 ; VI-LABEL: v_fshr_v2i24:
1401 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1402 ; VI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
1403 ; VI-NEXT: s_mov_b32 s4, 0xaaaaaab
1404 ; VI-NEXT: v_mul_hi_u32 v6, v6, s4
1405 ; VI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
1406 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1407 ; VI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
1408 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
1409 ; VI-NEXT: v_mul_hi_u32 v6, v7, s4
1410 ; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4
1411 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4
1412 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
1413 ; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v6
1414 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v3
1415 ; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3
1416 ; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3
1417 ; VI-NEXT: s_setpc_b64 s[30:31]
1419 ; GFX9-LABEL: v_fshr_v2i24:
1421 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1422 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v4
1423 ; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab
1424 ; GFX9-NEXT: v_mul_hi_u32 v6, v6, s4
1425 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v5
1426 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1427 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, 24, v6
1428 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
1429 ; GFX9-NEXT: v_mul_hi_u32 v6, v7, s4
1430 ; GFX9-NEXT: v_add_u32_e32 v4, 8, v4
1431 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
1432 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3
1433 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v6
1434 ; GFX9-NEXT: v_sub_u32_e32 v3, v5, v3
1435 ; GFX9-NEXT: v_add_u32_e32 v3, 8, v3
1436 ; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3
1437 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1439 ; R600-LABEL: v_fshr_v2i24:
1444 ; GFX10-LABEL: v_fshr_v2i24:
1446 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1447 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4
1448 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5
1449 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1450 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
1451 ; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
1452 ; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
1453 ; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6
1454 ; GFX10-NEXT: v_mul_u32_u24_e32 v7, 24, v7
1455 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6
1456 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7
1457 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 8, v4
1458 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 8, v5
1459 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
1460 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
1461 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1463 ; GFX11-LABEL: v_fshr_v2i24:
1465 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1466 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v4
1467 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v5
1468 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1469 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
1470 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1471 ; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6
1472 ; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7
1473 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1474 ; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6
1475 ; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7
1476 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1477 ; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6
1478 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7
1479 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1480 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4
1481 ; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5
1482 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1483 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4
1484 ; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5
1485 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1486 %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)