1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI
3 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI
4 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
5 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600
6 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
; Declarations of the llvm.fshr (funnel shift right) intrinsic overloads
; referenced by this test. Every overload takes three operands of one common
; integer or integer-vector type and returns a value of that same type.
8 declare i32 @llvm.fshr.i32(i32, i32, i32)
9 declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
10 declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
11 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
12 declare i16 @llvm.fshr.i16(i16, i16, i16)
13 declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
14 declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
15 declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
16 declare i64 @llvm.fshr.i64(i64, i64, i64)
17 declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
18 declare i24 @llvm.fshr.i24(i24, i24, i24)
19 declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
; Kernel test: scalar i32 funnel shift right with a variable shift amount.
; The IR calls llvm.fshr.i32(%x, %y, %z) and stores the result through %in.
; The checks for every amdgcn target expect exactly one v_alignbit_b32, and
; the R600 checks expect a single BIT_ALIGN_INT.
; In the GFX10 checks the second alignbit source is an SGPR, whereas the older
; amdgcn targets first copy that value into a VGPR.
21 define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
23 ; SI: ; %bb.0: ; %entry
24 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
25 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
26 ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
27 ; SI-NEXT: s_mov_b32 s7, 0xf000
28 ; SI-NEXT: s_mov_b32 s6, -1
29 ; SI-NEXT: s_waitcnt lgkmcnt(0)
30 ; SI-NEXT: v_mov_b32_e32 v0, s3
31 ; SI-NEXT: v_mov_b32_e32 v1, s0
32 ; SI-NEXT: v_alignbit_b32 v0, s2, v0, v1
33 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
37 ; VI: ; %bb.0: ; %entry
38 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
39 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
40 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
41 ; VI-NEXT: s_waitcnt lgkmcnt(0)
42 ; VI-NEXT: v_mov_b32_e32 v0, s5
43 ; VI-NEXT: v_mov_b32_e32 v1, s0
44 ; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1
45 ; VI-NEXT: v_mov_b32_e32 v0, s2
46 ; VI-NEXT: v_mov_b32_e32 v1, s3
47 ; VI-NEXT: flat_store_dword v[0:1], v2
50 ; GFX9-LABEL: fshr_i32:
51 ; GFX9: ; %bb.0: ; %entry
52 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
53 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
54 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34
55 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
56 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
57 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
58 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
59 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2
60 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
63 ; R600-LABEL: fshr_i32:
64 ; R600: ; %bb.0: ; %entry
65 ; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
66 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
69 ; R600-NEXT: ALU clause starting at 4:
70 ; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
71 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
72 ; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
74 ; GFX10-LABEL: fshr_i32:
75 ; GFX10: ; %bb.0: ; %entry
76 ; GFX10-NEXT: s_clause 0x2
77 ; GFX10-NEXT: s_load_dword s6, s[0:1], 0x34
78 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
79 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
80 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
81 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX10-NEXT: v_mov_b32_e32 v0, s6
83 ; GFX10-NEXT: v_alignbit_b32 v0, s2, s3, v0
84 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
85 ; GFX10-NEXT: s_endpgm
; IR under test: one intrinsic call whose result is stored to the output pointer.
87 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
88 store i32 %0, i32 addrspace(1)* %in
; Kernel test: scalar i32 funnel shift right by the constant 7.
; The IR calls llvm.fshr.i32(%x, %y, 7) and stores the result through %in.
; The amdgcn checks expect the shift amount to appear as the inline immediate 7
; on v_alignbit_b32; the R600 checks expect it as a literal operand of
; BIT_ALIGN_INT.
92 define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
93 ; SI-LABEL: fshr_i32_imm:
94 ; SI: ; %bb.0: ; %entry
95 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
96 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
97 ; SI-NEXT: s_mov_b32 s7, 0xf000
98 ; SI-NEXT: s_mov_b32 s6, -1
99 ; SI-NEXT: s_waitcnt lgkmcnt(0)
100 ; SI-NEXT: v_mov_b32_e32 v0, s1
101 ; SI-NEXT: v_alignbit_b32 v0, s0, v0, 7
102 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
105 ; VI-LABEL: fshr_i32_imm:
106 ; VI: ; %bb.0: ; %entry
107 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
108 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
109 ; VI-NEXT: s_waitcnt lgkmcnt(0)
110 ; VI-NEXT: v_mov_b32_e32 v0, s1
111 ; VI-NEXT: v_alignbit_b32 v2, s0, v0, 7
112 ; VI-NEXT: v_mov_b32_e32 v0, s2
113 ; VI-NEXT: v_mov_b32_e32 v1, s3
114 ; VI-NEXT: flat_store_dword v[0:1], v2
117 ; GFX9-LABEL: fshr_i32_imm:
118 ; GFX9: ; %bb.0: ; %entry
119 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
120 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
121 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
122 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
123 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
124 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 7
125 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
126 ; GFX9-NEXT: s_endpgm
128 ; R600-LABEL: fshr_i32_imm:
129 ; R600: ; %bb.0: ; %entry
130 ; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
131 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
134 ; R600-NEXT: ALU clause starting at 4:
135 ; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
136 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
137 ; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
138 ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
140 ; GFX10-LABEL: fshr_i32_imm:
141 ; GFX10: ; %bb.0: ; %entry
142 ; GFX10-NEXT: s_clause 0x1
143 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
144 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
145 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
146 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
147 ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7
148 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
149 ; GFX10-NEXT: s_endpgm
; IR under test: the third intrinsic operand is the constant shift amount.
151 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
152 store i32 %0, i32 addrspace(1)* %in
; Kernel test: <2 x i32> funnel shift right with variable per-lane shift amounts.
; The IR calls llvm.fshr.v2i32(%x, %y, %z) and stores the vector through %in.
; The checks expect the operation to be split per lane: two v_alignbit_b32 on
; each amdgcn target and two BIT_ALIGN_INT in the R600 output, followed by a
; single two-dword store.
156 define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
157 ; SI-LABEL: fshr_v2i32:
158 ; SI: ; %bb.0: ; %entry
159 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
160 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
161 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
162 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf
163 ; SI-NEXT: s_mov_b32 s7, 0xf000
164 ; SI-NEXT: s_mov_b32 s6, -1
165 ; SI-NEXT: s_waitcnt lgkmcnt(0)
166 ; SI-NEXT: v_mov_b32_e32 v0, s9
167 ; SI-NEXT: v_mov_b32_e32 v1, s1
168 ; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1
169 ; SI-NEXT: v_mov_b32_e32 v0, s8
170 ; SI-NEXT: v_mov_b32_e32 v2, s0
171 ; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2
172 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
175 ; VI-LABEL: fshr_v2i32:
176 ; VI: ; %bb.0: ; %entry
177 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
178 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
179 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
180 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
181 ; VI-NEXT: s_waitcnt lgkmcnt(0)
182 ; VI-NEXT: v_mov_b32_e32 v0, s7
183 ; VI-NEXT: v_mov_b32_e32 v1, s1
184 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
185 ; VI-NEXT: v_mov_b32_e32 v0, s6
186 ; VI-NEXT: v_mov_b32_e32 v2, s0
187 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2
188 ; VI-NEXT: v_mov_b32_e32 v2, s2
189 ; VI-NEXT: v_mov_b32_e32 v3, s3
190 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
193 ; GFX9-LABEL: fshr_v2i32:
194 ; GFX9: ; %bb.0: ; %entry
195 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
196 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
197 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
198 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c
199 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
200 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
201 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
202 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
203 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
204 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
205 ; GFX9-NEXT: v_mov_b32_e32 v3, s8
206 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3
207 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
208 ; GFX9-NEXT: s_endpgm
210 ; R600-LABEL: fshr_v2i32:
211 ; R600: ; %bb.0: ; %entry
212 ; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
213 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
216 ; R600-NEXT: ALU clause starting at 4:
217 ; R600-NEXT: MOV * T0.W, KC0[4].X,
218 ; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
219 ; R600-NEXT: MOV * T0.W, KC0[3].W,
220 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
221 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
222 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
224 ; GFX10-LABEL: fshr_v2i32:
225 ; GFX10: ; %bb.0: ; %entry
226 ; GFX10-NEXT: s_clause 0x3
227 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
228 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
229 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
230 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
231 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
232 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
233 ; GFX10-NEXT: v_mov_b32_e32 v0, s3
234 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
235 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0
236 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2
237 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9]
238 ; GFX10-NEXT: s_endpgm
; IR under test: one vector intrinsic call, result stored to the output pointer.
240 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
241 store <2 x i32> %0, <2 x i32> addrspace(1)* %in
; Kernel test: <2 x i32> funnel shift right by the constant vector <7, 9>.
; The IR calls llvm.fshr.v2i32(%x, %y, <7, 9>) and stores the vector through %in.
; The checks expect one alignbit per lane, with 7 as the immediate for the lane
; written to the low result register and 9 for the high one. The R600 checks
; expect the same two amounts as literal operands of BIT_ALIGN_INT.
245 define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
246 ; SI-LABEL: fshr_v2i32_imm:
247 ; SI: ; %bb.0: ; %entry
248 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
249 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
250 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
251 ; SI-NEXT: s_mov_b32 s7, 0xf000
252 ; SI-NEXT: s_mov_b32 s6, -1
253 ; SI-NEXT: s_waitcnt lgkmcnt(0)
254 ; SI-NEXT: v_mov_b32_e32 v0, s1
255 ; SI-NEXT: v_alignbit_b32 v1, s3, v0, 9
256 ; SI-NEXT: v_mov_b32_e32 v0, s0
257 ; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7
258 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
261 ; VI-LABEL: fshr_v2i32_imm:
262 ; VI: ; %bb.0: ; %entry
263 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
264 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
265 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
266 ; VI-NEXT: s_waitcnt lgkmcnt(0)
267 ; VI-NEXT: v_mov_b32_e32 v0, s1
268 ; VI-NEXT: v_mov_b32_e32 v2, s0
269 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9
270 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7
271 ; VI-NEXT: v_mov_b32_e32 v2, s2
272 ; VI-NEXT: v_mov_b32_e32 v3, s3
273 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
276 ; GFX9-LABEL: fshr_v2i32_imm:
277 ; GFX9: ; %bb.0: ; %entry
278 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
279 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
280 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
281 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
282 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
283 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
284 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
285 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9
286 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7
287 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
288 ; GFX9-NEXT: s_endpgm
290 ; R600-LABEL: fshr_v2i32_imm:
291 ; R600: ; %bb.0: ; %entry
292 ; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
293 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
296 ; R600-NEXT: ALU clause starting at 4:
297 ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
298 ; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00)
299 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
300 ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
301 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
302 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
304 ; GFX10-LABEL: fshr_v2i32_imm:
305 ; GFX10: ; %bb.0: ; %entry
306 ; GFX10-NEXT: s_clause 0x2
307 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
308 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
309 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
310 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
311 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
312 ; GFX10-NEXT: v_alignbit_b32 v1, s3, s5, 9
313 ; GFX10-NEXT: v_alignbit_b32 v0, s2, s4, 7
314 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
315 ; GFX10-NEXT: s_endpgm
; IR under test: the shift amounts are the constant vector <7, 9>.
317 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
318 store <2 x i32> %0, <2 x i32> addrspace(1)* %in
; Kernel test: <4 x i32> funnel shift right with variable per-lane shift amounts.
; The IR calls llvm.fshr.v4i32(%x, %y, %z) and stores the vector through %in.
; The checks expect four v_alignbit_b32 on each amdgcn target and four
; BIT_ALIGN_INT in the R600 output, followed by a single four-dword store.
322 define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
323 ; SI-LABEL: fshr_v4i32:
324 ; SI: ; %bb.0: ; %entry
325 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
326 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
327 ; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11
328 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15
329 ; SI-NEXT: s_mov_b32 s7, 0xf000
330 ; SI-NEXT: s_mov_b32 s6, -1
331 ; SI-NEXT: s_waitcnt lgkmcnt(0)
332 ; SI-NEXT: v_mov_b32_e32 v0, s15
333 ; SI-NEXT: v_mov_b32_e32 v1, s3
334 ; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1
335 ; SI-NEXT: v_mov_b32_e32 v0, s14
336 ; SI-NEXT: v_mov_b32_e32 v1, s2
337 ; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1
338 ; SI-NEXT: v_mov_b32_e32 v0, s13
339 ; SI-NEXT: v_mov_b32_e32 v1, s1
340 ; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1
341 ; SI-NEXT: v_mov_b32_e32 v0, s12
342 ; SI-NEXT: v_mov_b32_e32 v4, s0
343 ; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4
344 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
347 ; VI-LABEL: fshr_v4i32:
348 ; VI: ; %bb.0: ; %entry
349 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
350 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
351 ; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
352 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
353 ; VI-NEXT: s_waitcnt lgkmcnt(0)
354 ; VI-NEXT: v_mov_b32_e32 v0, s11
355 ; VI-NEXT: v_mov_b32_e32 v1, s3
356 ; VI-NEXT: v_alignbit_b32 v3, s7, v0, v1
357 ; VI-NEXT: v_mov_b32_e32 v0, s10
358 ; VI-NEXT: v_mov_b32_e32 v1, s2
359 ; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1
360 ; VI-NEXT: v_mov_b32_e32 v0, s9
361 ; VI-NEXT: v_mov_b32_e32 v1, s1
362 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
363 ; VI-NEXT: v_mov_b32_e32 v0, s8
364 ; VI-NEXT: v_mov_b32_e32 v4, s0
365 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4
366 ; VI-NEXT: v_mov_b32_e32 v4, s12
367 ; VI-NEXT: v_mov_b32_e32 v5, s13
368 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
371 ; GFX9-LABEL: fshr_v4i32:
372 ; GFX9: ; %bb.0: ; %entry
373 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
374 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
375 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
376 ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
377 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
378 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
379 ; GFX9-NEXT: v_mov_b32_e32 v0, s11
380 ; GFX9-NEXT: v_mov_b32_e32 v1, s15
381 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1
382 ; GFX9-NEXT: v_mov_b32_e32 v0, s10
383 ; GFX9-NEXT: v_mov_b32_e32 v1, s14
384 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1
385 ; GFX9-NEXT: v_mov_b32_e32 v0, s9
386 ; GFX9-NEXT: v_mov_b32_e32 v1, s13
387 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
388 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
389 ; GFX9-NEXT: v_mov_b32_e32 v5, s12
390 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5
391 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
392 ; GFX9-NEXT: s_endpgm
394 ; R600-LABEL: fshr_v4i32:
395 ; R600: ; %bb.0: ; %entry
396 ; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
397 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
400 ; R600-NEXT: ALU clause starting at 4:
401 ; R600-NEXT: MOV * T0.W, KC0[6].X,
402 ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
403 ; R600-NEXT: MOV * T1.W, KC0[5].W,
404 ; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
405 ; R600-NEXT: MOV * T1.W, KC0[5].Z,
406 ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
407 ; R600-NEXT: MOV * T1.W, KC0[5].Y,
408 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
409 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
410 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
412 ; GFX10-LABEL: fshr_v4i32:
413 ; GFX10: ; %bb.0: ; %entry
414 ; GFX10-NEXT: s_clause 0x3
415 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x54
416 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
417 ; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
418 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
419 ; GFX10-NEXT: v_mov_b32_e32 v6, 0
420 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
421 ; GFX10-NEXT: v_mov_b32_e32 v0, s7
422 ; GFX10-NEXT: v_mov_b32_e32 v1, s6
423 ; GFX10-NEXT: v_mov_b32_e32 v4, s5
424 ; GFX10-NEXT: v_mov_b32_e32 v5, s4
425 ; GFX10-NEXT: v_alignbit_b32 v3, s15, s11, v0
426 ; GFX10-NEXT: v_alignbit_b32 v2, s14, s10, v1
427 ; GFX10-NEXT: v_alignbit_b32 v1, s13, s9, v4
428 ; GFX10-NEXT: v_alignbit_b32 v0, s12, s8, v5
429 ; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3]
430 ; GFX10-NEXT: s_endpgm
; IR under test: one vector intrinsic call, result stored to the output pointer.
432 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
433 store <4 x i32> %0, <4 x i32> addrspace(1)* %in
; Kernel test: <4 x i32> funnel shift right by the constant vector <1, 7, 9, 33>.
; The IR calls llvm.fshr.v4i32(%x, %y, <1, 7, 9, 33>) and stores the vector
; through %in.
; The last lane uses an amount larger than the 32-bit element width. The checks
; expect the immediates 1, 7, 9 and 1 (result registers v0..v3 on amdgcn), so
; the 33 is expected to be emitted as 1, i.e. reduced modulo 32.
437 define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
438 ; SI-LABEL: fshr_v4i32_imm:
439 ; SI: ; %bb.0: ; %entry
440 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
441 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
442 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11
443 ; SI-NEXT: s_mov_b32 s7, 0xf000
444 ; SI-NEXT: s_mov_b32 s6, -1
445 ; SI-NEXT: s_waitcnt lgkmcnt(0)
446 ; SI-NEXT: v_mov_b32_e32 v0, s3
447 ; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1
448 ; SI-NEXT: v_mov_b32_e32 v0, s2
449 ; SI-NEXT: v_alignbit_b32 v2, s10, v0, 9
450 ; SI-NEXT: v_mov_b32_e32 v0, s1
451 ; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7
452 ; SI-NEXT: v_mov_b32_e32 v0, s0
453 ; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1
454 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
457 ; VI-LABEL: fshr_v4i32_imm:
458 ; VI: ; %bb.0: ; %entry
459 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
460 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
461 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
462 ; VI-NEXT: s_waitcnt lgkmcnt(0)
463 ; VI-NEXT: v_mov_b32_e32 v4, s8
464 ; VI-NEXT: v_mov_b32_e32 v5, s9
465 ; VI-NEXT: v_mov_b32_e32 v0, s3
466 ; VI-NEXT: v_mov_b32_e32 v1, s2
467 ; VI-NEXT: v_alignbit_b32 v3, s7, v0, 1
468 ; VI-NEXT: v_mov_b32_e32 v0, s1
469 ; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9
470 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 7
471 ; VI-NEXT: v_mov_b32_e32 v0, s0
472 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
473 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
476 ; GFX9-LABEL: fshr_v4i32_imm:
477 ; GFX9: ; %bb.0: ; %entry
478 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
479 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
480 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
481 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
482 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
483 ; GFX9-NEXT: v_mov_b32_e32 v0, s11
484 ; GFX9-NEXT: v_mov_b32_e32 v1, s10
485 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1
486 ; GFX9-NEXT: v_mov_b32_e32 v0, s9
487 ; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9
488 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7
489 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
490 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
491 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
492 ; GFX9-NEXT: s_endpgm
494 ; R600-LABEL: fshr_v4i32_imm:
495 ; R600: ; %bb.0: ; %entry
496 ; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
497 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
500 ; R600-NEXT: ALU clause starting at 4:
501 ; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
502 ; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
503 ; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00)
504 ; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
505 ; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
506 ; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
507 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
508 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
510 ; GFX10-LABEL: fshr_v4i32_imm:
511 ; GFX10: ; %bb.0: ; %entry
512 ; GFX10-NEXT: s_clause 0x2
513 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
514 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
515 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
516 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
517 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
518 ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1
519 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9
520 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7
521 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1
522 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
523 ; GFX10-NEXT: s_endpgm
; IR under test: the final shift amount (33) deliberately exceeds the element width.
525 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
526 store <4 x i32> %0, <4 x i32> addrspace(1)* %in
; Non-kernel function test: scalar i32 funnel shift right on function arguments.
; The IR computes %ret = llvm.fshr.i32(%src0, %src1, %src2).
; GFX89 is the check prefix shared by the tahiti, tonga and gfx900 RUN lines.
; The checks expect a single v_alignbit_b32 applied directly to v0, v1 and v2,
; followed by s_setpc_b64; the GFX10 checks additionally expect an
; s_waitcnt_vscnt on entry.
530 define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
531 ; GFX89-LABEL: v_fshr_i32:
533 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v1, v2
535 ; GFX89-NEXT: s_setpc_b64 s[30:31]
537 ; R600-LABEL: v_fshr_i32:
542 ; GFX10-LABEL: v_fshr_i32:
544 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
545 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
546 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2
547 ; GFX10-NEXT: s_setpc_b64 s[30:31]
548 %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
; Non-kernel function test: <2 x i32> funnel shift right on function arguments.
; The IR computes %ret = llvm.fshr.v2i32(%src0, %src1, %src2).
; The checks expect one v_alignbit_b32 per lane (two in total), each reading
; the matching lane registers of the three operands, with results in v0 and v1.
552 define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
553 ; GFX89-LABEL: v_fshr_v2i32:
555 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
556 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v2, v4
557 ; GFX89-NEXT: v_alignbit_b32 v1, v1, v3, v5
558 ; GFX89-NEXT: s_setpc_b64 s[30:31]
560 ; R600-LABEL: v_fshr_v2i32:
565 ; GFX10-LABEL: v_fshr_v2i32:
567 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
568 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
569 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
570 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
571 ; GFX10-NEXT: s_setpc_b64 s[30:31]
572 %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
; Non-kernel function test: <3 x i32> funnel shift right on function arguments.
; The IR computes %ret = llvm.fshr.v3i32(%src0, %src1, %src2).
; The checks expect three v_alignbit_b32, one per lane, with results in
; v0, v1 and v2; no extra instruction for a padding lane is expected.
576 define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
577 ; GFX89-LABEL: v_fshr_v3i32:
579 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
580 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v3, v6
581 ; GFX89-NEXT: v_alignbit_b32 v1, v1, v4, v7
582 ; GFX89-NEXT: v_alignbit_b32 v2, v2, v5, v8
583 ; GFX89-NEXT: s_setpc_b64 s[30:31]
585 ; R600-LABEL: v_fshr_v3i32:
590 ; GFX10-LABEL: v_fshr_v3i32:
592 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
593 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
594 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6
595 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7
596 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8
597 ; GFX10-NEXT: s_setpc_b64 s[30:31]
598 %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
; Non-kernel function test: <4 x i32> funnel shift right on function arguments.
; The IR computes %ret = llvm.fshr.v4i32(%src0, %src1, %src2).
; The checks expect four v_alignbit_b32, one per lane, with results in v0..v3.
602 define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
603 ; GFX89-LABEL: v_fshr_v4i32:
605 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
606 ; GFX89-NEXT: v_alignbit_b32 v0, v0, v4, v8
607 ; GFX89-NEXT: v_alignbit_b32 v1, v1, v5, v9
608 ; GFX89-NEXT: v_alignbit_b32 v2, v2, v6, v10
609 ; GFX89-NEXT: v_alignbit_b32 v3, v3, v7, v11
610 ; GFX89-NEXT: s_setpc_b64 s[30:31]
612 ; R600-LABEL: v_fshr_v4i32:
617 ; GFX10-LABEL: v_fshr_v4i32:
619 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
621 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8
622 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9
623 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10
624 ; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11
625 ; GFX10-NEXT: s_setpc_b64 s[30:31]
626 %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
; Non-kernel function test: scalar i16 funnel shift right on function arguments.
; The IR computes %ret = llvm.fshr.i16(%src0, %src1, %src2).
; Unlike the i32 tests, the expected code differs between targets, so separate
; per-target prefixes are used instead of the shared GFX89 prefix.
;  - The SI checks expect a widened form: 16 is ORed into the shift amount (v2),
;    v1 is shifted left by 16, and one 32-bit v_alignbit_b32 yields the result.
;  - The VI, GFX9 and GFX10 checks expect a 16-bit shift expansion: the amount
;    and its bitwise complement are each masked with 15, v0 is shifted left by 1
;    and then by the masked complement, v1 is shifted right by the masked
;    amount, and the two halves are ORed together.
630 define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
631 ; SI-LABEL: v_fshr_i16:
633 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
634 ; SI-NEXT: v_or_b32_e32 v2, 16, v2
635 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
636 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
637 ; SI-NEXT: s_setpc_b64 s[30:31]
639 ; VI-LABEL: v_fshr_i16:
641 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
642 ; VI-NEXT: v_xor_b32_e32 v3, -1, v2
643 ; VI-NEXT: v_and_b32_e32 v2, 15, v2
644 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
645 ; VI-NEXT: v_and_b32_e32 v3, 15, v3
646 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
647 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
648 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
649 ; VI-NEXT: s_setpc_b64 s[30:31]
651 ; GFX9-LABEL: v_fshr_i16:
653 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
654 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
655 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
656 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
657 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
658 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
659 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
660 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
661 ; GFX9-NEXT: s_setpc_b64 s[30:31]
663 ; R600-LABEL: v_fshr_i16:
668 ; GFX10-LABEL: v_fshr_i16:
670 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
671 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
672 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
673 ; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
674 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
675 ; GFX10-NEXT: v_and_b32_e32 v3, 15, v3
676 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
677 ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
678 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
679 ; GFX10-NEXT: s_setpc_b64 s[30:31]
680 %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
; Non-kernel function test: <2 x i16> funnel shift right on function arguments.
; The IR computes %ret = llvm.fshr.v2i16(%src0, %src1, %src2).
; Expected code per target, as pinned by the checks below:
;  - SI checks: each lane uses the widened 32-bit form (OR 16 into the amount,
;    shift the second operand left by 16, v_alignbit_b32); the two lane results
;    are then combined with shift/and/or instructions.
;  - VI checks: the high lane is handled with SDWA 16-bit shifts selecting
;    WORD_1, the low lane with plain 16-bit shifts, and the halves are merged
;    with v_or_b32_sdwa.
;  - GFX9 and GFX10 checks: both lanes are handled at once with packed
;    v_pk_lshlrev_b16 / v_pk_lshrrev_b16, masking the amounts with 0xf000f.
684 define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
685 ; SI-LABEL: v_fshr_v2i16:
687 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
688 ; SI-NEXT: v_or_b32_e32 v5, 16, v5
689 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
690 ; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5
691 ; SI-NEXT: v_or_b32_e32 v3, 16, v4
692 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
693 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, v3
694 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
695 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
696 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
697 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
698 ; SI-NEXT: s_setpc_b64 s[30:31]
700 ; VI-LABEL: v_fshr_v2i16:
702 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
704 ; VI-NEXT: v_and_b32_e32 v4, 15, v3
705 ; VI-NEXT: v_mov_b32_e32 v5, 1
706 ; VI-NEXT: v_xor_b32_e32 v3, -1, v3
707 ; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
708 ; VI-NEXT: v_and_b32_e32 v3, 15, v3
709 ; VI-NEXT: v_lshrrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
710 ; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5
711 ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
712 ; VI-NEXT: v_xor_b32_e32 v4, -1, v2
713 ; VI-NEXT: v_and_b32_e32 v2, 15, v2
714 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
715 ; VI-NEXT: v_and_b32_e32 v4, 15, v4
716 ; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0
717 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
718 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
719 ; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
720 ; VI-NEXT: s_setpc_b64 s[30:31]
722 ; GFX9-LABEL: v_fshr_v2i16:
724 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
725 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
726 ; GFX9-NEXT: s_mov_b32 s4, 0xf000f
727 ; GFX9-NEXT: v_and_b32_e32 v2, s4, v2
728 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
729 ; GFX9-NEXT: v_and_b32_e32 v3, s4, v3
730 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0
731 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1
732 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
733 ; GFX9-NEXT: s_setpc_b64 s[30:31]
735 ; R600-LABEL: v_fshr_v2i16:
740 ; GFX10-LABEL: v_fshr_v2i16:
742 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
743 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
744 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
745 ; GFX10-NEXT: s_mov_b32 s4, 0xf000f
746 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
747 ; GFX10-NEXT: v_and_b32_e32 v2, s4, v2
748 ; GFX10-NEXT: v_and_b32_e32 v3, s4, v3
749 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1
750 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0
751 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
752 ; GFX10-NEXT: s_setpc_b64 s[30:31]
753 %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
; Non-kernel function test: <3 x i16> funnel shift right on function arguments.
; The IR computes %ret = llvm.fshr.v3i16(%src0, %src1, %src2).
; Expected code per target, as pinned by the checks below:
;  - SI checks: three widened lanes, each using the OR-16 / shift-left-16 /
;    v_alignbit_b32 pattern, followed by repacking with a 0xffff mask.
;  - VI and GFX9 checks: the high half of the first register pair is handled
;    with SDWA 16-bit shifts selecting WORD_1; the remaining lanes use plain
;    16-bit shifts with amounts masked by 15. The VI checks merge the halves
;    with v_or_b32_sdwa, the GFX9 checks with v_lshl_or_b32.
;  - GFX10 checks: no SDWA is expected; the high halves are extracted with
;    32-bit right shifts by 16 and the result is merged with v_lshl_or_b32.
757 define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
758 ; SI-LABEL: v_fshr_v3i16:
760 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
761 ; SI-NEXT: v_or_b32_e32 v7, 16, v7
762 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
763 ; SI-NEXT: v_alignbit_b32 v1, v1, v4, v7
764 ; SI-NEXT: v_or_b32_e32 v4, 16, v6
765 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
766 ; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4
767 ; SI-NEXT: s_mov_b32 s4, 0xffff
768 ; SI-NEXT: v_or_b32_e32 v3, 16, v8
769 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
770 ; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3
771 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
772 ; SI-NEXT: v_and_b32_e32 v0, s4, v0
773 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
774 ; SI-NEXT: v_and_b32_e32 v2, s4, v3
775 ; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
776 ; SI-NEXT: s_setpc_b64 s[30:31]
778 ; VI-LABEL: v_fshr_v3i16:
780 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
781 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
782 ; VI-NEXT: v_and_b32_e32 v7, 15, v6
783 ; VI-NEXT: v_mov_b32_e32 v8, 1
784 ; VI-NEXT: v_xor_b32_e32 v6, -1, v6
785 ; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
786 ; VI-NEXT: v_and_b32_e32 v6, 15, v6
787 ; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
788 ; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8
789 ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
790 ; VI-NEXT: v_xor_b32_e32 v7, -1, v5
791 ; VI-NEXT: v_and_b32_e32 v5, 15, v5
792 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
793 ; VI-NEXT: v_and_b32_e32 v7, 15, v7
794 ; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1
795 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
796 ; VI-NEXT: v_or_b32_e32 v1, v1, v3
797 ; VI-NEXT: v_xor_b32_e32 v3, -1, v4
798 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
799 ; VI-NEXT: v_and_b32_e32 v3, 15, v3
800 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
801 ; VI-NEXT: v_and_b32_e32 v3, 15, v4
802 ; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2
803 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
804 ; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
805 ; VI-NEXT: s_setpc_b64 s[30:31]
807 ; GFX9-LABEL: v_fshr_v3i16:
809 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
810 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
811 ; GFX9-NEXT: v_and_b32_e32 v7, 15, v6
812 ; GFX9-NEXT: v_mov_b32_e32 v8, 1
813 ; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
814 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
815 ; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
816 ; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
817 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8
818 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
819 ; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5
820 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
821 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
822 ; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
823 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1
824 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
825 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
826 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
827 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
828 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
829 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
830 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v4
831 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2
832 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
833 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
834 ; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
835 ; GFX9-NEXT: s_setpc_b64 s[30:31]
837 ; R600-LABEL: v_fshr_v3i16:
842 ; GFX10-LABEL: v_fshr_v3i16:
844 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
845 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
846 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
847 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
848 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
849 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
850 ; GFX10-NEXT: v_and_b32_e32 v4, 15, v4
851 ; GFX10-NEXT: v_and_b32_e32 v8, 15, v8
852 ; GFX10-NEXT: v_and_b32_e32 v9, 15, v6
853 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6
854 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
855 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
856 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
857 ; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10
858 ; GFX10-NEXT: v_and_b32_e32 v6, 15, v6
859 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5
860 ; GFX10-NEXT: v_lshrrev_b16 v4, v9, v7
861 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
862 ; GFX10-NEXT: v_and_b32_e32 v2, 15, v5
863 ; GFX10-NEXT: v_lshlrev_b16 v6, v6, v10
864 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
865 ; GFX10-NEXT: v_and_b32_e32 v7, 15, v11
866 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
867 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v3
868 ; GFX10-NEXT: v_or_b32_e32 v4, v6, v4
869 ; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1
870 ; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0
871 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
872 ; GFX10-NEXT: s_setpc_b64 s[30:31]
873 %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
; v_fshr_v4i16: calls llvm.fshr.v4i16 on three <4 x i16> arguments and pins
; the instruction sequence llc emits under each check prefix.
;  - SI checks: every lane becomes one v_alignbit_b32 whose shift amount is
;    OR'ed with 16 and whose second source is shifted left by 16; the four
;    results are then repacked with a 0xffff mask, 16-bit shifts and ORs.
;  - VI and GFX9 checks: each lane is built from 16-bit shifts as
;    ((src0 << 1) << (~amt & 15)) | (src1 >> (amt & 15)); the high halves
;    are reached through SDWA WORD_1 source selects.
;  - GFX10 checks: same 16-bit expansion, but the high halves are extracted
;    with v_lshrrev_b32 by 16 and merged back with v_lshl_or_b32.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so they are expected to be regenerated rather than edited by hand.
877 define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
878 ; SI-LABEL: v_fshr_v4i16:
880 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
881 ; SI-NEXT: v_or_b32_e32 v9, 16, v9
882 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
883 ; SI-NEXT: v_alignbit_b32 v1, v1, v5, v9
884 ; SI-NEXT: v_or_b32_e32 v5, 16, v8
885 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
886 ; SI-NEXT: v_alignbit_b32 v0, v0, v4, v5
887 ; SI-NEXT: v_or_b32_e32 v4, 16, v11
888 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7
889 ; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4
890 ; SI-NEXT: v_or_b32_e32 v4, 16, v10
891 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
892 ; SI-NEXT: s_mov_b32 s4, 0xffff
893 ; SI-NEXT: v_alignbit_b32 v2, v2, v5, v4
894 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
895 ; SI-NEXT: v_and_b32_e32 v2, s4, v2
896 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
897 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
898 ; SI-NEXT: v_and_b32_e32 v0, s4, v0
899 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
900 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
901 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
902 ; SI-NEXT: s_setpc_b64 s[30:31]
904 ; VI-LABEL: v_fshr_v4i16:
906 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
907 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
908 ; VI-NEXT: v_and_b32_e32 v7, 15, v6
909 ; VI-NEXT: v_xor_b32_e32 v6, -1, v6
910 ; VI-NEXT: v_mov_b32_e32 v8, 1
911 ; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
912 ; VI-NEXT: v_and_b32_e32 v6, 15, v6
913 ; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
914 ; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9
915 ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
916 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
917 ; VI-NEXT: v_and_b32_e32 v9, 15, v7
918 ; VI-NEXT: v_xor_b32_e32 v7, -1, v7
919 ; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
920 ; VI-NEXT: v_and_b32_e32 v7, 15, v7
921 ; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8
922 ; VI-NEXT: v_xor_b32_e32 v8, -1, v5
923 ; VI-NEXT: v_and_b32_e32 v5, 15, v5
924 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
925 ; VI-NEXT: v_and_b32_e32 v8, 15, v8
926 ; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1
927 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
928 ; VI-NEXT: v_or_b32_e32 v1, v1, v3
929 ; VI-NEXT: v_xor_b32_e32 v3, -1, v4
930 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
931 ; VI-NEXT: v_and_b32_e32 v3, 15, v3
932 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
933 ; VI-NEXT: v_and_b32_e32 v3, 15, v4
934 ; VI-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
935 ; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2
936 ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
937 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
938 ; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
939 ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
940 ; VI-NEXT: s_setpc_b64 s[30:31]
942 ; GFX9-LABEL: v_fshr_v4i16:
944 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
945 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5
946 ; GFX9-NEXT: v_and_b32_e32 v7, 15, v6
947 ; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
948 ; GFX9-NEXT: v_mov_b32_e32 v8, 1
949 ; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
950 ; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
951 ; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
952 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9
953 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
954 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
955 ; GFX9-NEXT: v_and_b32_e32 v9, 15, v7
956 ; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7
957 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
958 ; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
959 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8
960 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5
961 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
962 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
963 ; GFX9-NEXT: v_and_b32_e32 v8, 15, v8
964 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1
965 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
966 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
967 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
968 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
969 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
970 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
971 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v4
972 ; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
973 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2
974 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
975 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
976 ; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
977 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v9
978 ; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
979 ; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0
980 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1
981 ; GFX9-NEXT: s_setpc_b64 s[30:31]
983 ; R600-LABEL: v_fshr_v4i16:
988 ; GFX10-LABEL: v_fshr_v4i16:
990 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
991 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
992 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v5
993 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
994 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4
995 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
996 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0
997 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v6
998 ; GFX10-NEXT: v_and_b32_e32 v6, 15, v6
999 ; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8
1000 ; GFX10-NEXT: v_and_b32_e32 v13, 15, v10
1001 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
1002 ; GFX10-NEXT: v_and_b32_e32 v9, 15, v9
1003 ; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7
1004 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
1005 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
1006 ; GFX10-NEXT: v_lshlrev_b16 v11, 1, v11
1007 ; GFX10-NEXT: v_lshlrev_b16 v7, v9, v8
1008 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
1009 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v10
1010 ; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5
1011 ; GFX10-NEXT: v_and_b32_e32 v4, 15, v4
1012 ; GFX10-NEXT: v_and_b32_e32 v5, 15, v5
1013 ; GFX10-NEXT: v_and_b32_e32 v8, 15, v8
1014 ; GFX10-NEXT: v_and_b32_e32 v9, 15, v9
1015 ; GFX10-NEXT: v_and_b32_e32 v10, 15, v10
1016 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
1017 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
1018 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
1019 ; GFX10-NEXT: v_lshrrev_b16 v4, v13, v12
1020 ; GFX10-NEXT: v_lshlrev_b16 v1, v10, v1
1021 ; GFX10-NEXT: v_lshlrev_b16 v5, v9, v11
1022 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
1023 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
1024 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
1025 ; GFX10-NEXT: v_or_b32_e32 v3, v7, v6
1026 ; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
1027 ; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
1028 ; GFX10-NEXT: v_and_b32_e32 v1, v2, v1
1029 ; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0
1030 ; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
1031 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1032 %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
; v_fshr_i64: calls llvm.fshr.i64 and pins the expansion emitted for each
; check prefix. Every visible sequence computes
;   ((src0 << 1) << (~amt & 63)) | (src1 >> (amt & 63))
; on 64-bit register pairs; judging by the registers used below, src0 is in
; v[0:1], src1 in v[2:3], and only v4 of the shift amount is read.
; The SI checks use v_lshl_b64 / v_lshr_b64, while the VI, GFX9 and GFX10
; checks use the operand-reversed v_lshlrev_b64 / v_lshrrev_b64 forms.
1036 define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
1037 ; SI-LABEL: v_fshr_i64:
1039 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1040 ; SI-NEXT: v_and_b32_e32 v5, 63, v4
1041 ; SI-NEXT: v_not_b32_e32 v4, v4
1042 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
1043 ; SI-NEXT: v_and_b32_e32 v4, 63, v4
1044 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v5
1045 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
1046 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
1047 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
1048 ; SI-NEXT: s_setpc_b64 s[30:31]
1050 ; VI-LABEL: v_fshr_i64:
1052 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1053 ; VI-NEXT: v_and_b32_e32 v5, 63, v4
1054 ; VI-NEXT: v_not_b32_e32 v4, v4
1055 ; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1056 ; VI-NEXT: v_and_b32_e32 v4, 63, v4
1057 ; VI-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
1058 ; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
1059 ; VI-NEXT: v_or_b32_e32 v1, v1, v3
1060 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
1061 ; VI-NEXT: s_setpc_b64 s[30:31]
1063 ; GFX9-LABEL: v_fshr_i64:
1065 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1066 ; GFX9-NEXT: v_and_b32_e32 v5, 63, v4
1067 ; GFX9-NEXT: v_not_b32_e32 v4, v4
1068 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1069 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
1070 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
1071 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
1072 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
1073 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
1074 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1076 ; R600-LABEL: v_fshr_i64:
1081 ; GFX10-LABEL: v_fshr_i64:
1083 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1084 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1085 ; GFX10-NEXT: v_not_b32_e32 v5, v4
1086 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1087 ; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
1088 ; GFX10-NEXT: v_and_b32_e32 v5, 63, v5
1089 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
1090 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
1091 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
1092 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
1093 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1094 %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
; v_fshr_v2i64: calls llvm.fshr.v2i64 and pins the per-element expansion for
; each check prefix. Both elements are computed independently as
;   ((src0 << 1) << (~amt & 63)) | (src1 >> (amt & 63))
; with the shift amounts read from v8 (element 0) and v10 (element 1) in the
; sequences below. The SI, VI and GFX9 checks interleave the two elements
; and reuse v[5:6] as a temporary pair for the second right shift; the GFX10
; checks keep each element in its own register pair throughout.
1098 define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
1099 ; SI-LABEL: v_fshr_v2i64:
1101 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102 ; SI-NEXT: v_and_b32_e32 v9, 63, v8
1103 ; SI-NEXT: v_not_b32_e32 v8, v8
1104 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
1105 ; SI-NEXT: v_and_b32_e32 v8, 63, v8
1106 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v9
1107 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
1108 ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
1109 ; SI-NEXT: v_or_b32_e32 v1, v1, v5
1110 ; SI-NEXT: v_and_b32_e32 v5, 63, v10
1111 ; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v5
1112 ; SI-NEXT: v_not_b32_e32 v7, v10
1113 ; SI-NEXT: v_and_b32_e32 v7, 63, v7
1114 ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
1115 ; SI-NEXT: v_or_b32_e32 v0, v0, v4
1116 ; SI-NEXT: v_or_b32_e32 v3, v3, v6
1117 ; SI-NEXT: v_or_b32_e32 v2, v2, v5
1118 ; SI-NEXT: s_setpc_b64 s[30:31]
1120 ; VI-LABEL: v_fshr_v2i64:
1122 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1123 ; VI-NEXT: v_and_b32_e32 v9, 63, v8
1124 ; VI-NEXT: v_not_b32_e32 v8, v8
1125 ; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1126 ; VI-NEXT: v_and_b32_e32 v8, 63, v8
1127 ; VI-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
1128 ; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
1129 ; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
1130 ; VI-NEXT: v_or_b32_e32 v1, v1, v5
1131 ; VI-NEXT: v_and_b32_e32 v5, 63, v10
1132 ; VI-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7]
1133 ; VI-NEXT: v_not_b32_e32 v7, v10
1134 ; VI-NEXT: v_and_b32_e32 v7, 63, v7
1135 ; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
1136 ; VI-NEXT: v_or_b32_e32 v0, v0, v4
1137 ; VI-NEXT: v_or_b32_e32 v3, v3, v6
1138 ; VI-NEXT: v_or_b32_e32 v2, v2, v5
1139 ; VI-NEXT: s_setpc_b64 s[30:31]
1141 ; GFX9-LABEL: v_fshr_v2i64:
1143 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1144 ; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
1145 ; GFX9-NEXT: v_not_b32_e32 v8, v8
1146 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1147 ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
1148 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
1149 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
1150 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
1151 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
1152 ; GFX9-NEXT: v_and_b32_e32 v5, 63, v10
1153 ; GFX9-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7]
1154 ; GFX9-NEXT: v_not_b32_e32 v7, v10
1155 ; GFX9-NEXT: v_and_b32_e32 v7, 63, v7
1156 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
1157 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
1158 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v6
1159 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v5
1160 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1162 ; R600-LABEL: v_fshr_v2i64:
1167 ; GFX10-LABEL: v_fshr_v2i64:
1169 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1170 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1171 ; GFX10-NEXT: v_not_b32_e32 v9, v8
1172 ; GFX10-NEXT: v_not_b32_e32 v11, v10
1173 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
1174 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
1175 ; GFX10-NEXT: v_and_b32_e32 v8, 63, v8
1176 ; GFX10-NEXT: v_and_b32_e32 v9, 63, v9
1177 ; GFX10-NEXT: v_and_b32_e32 v10, 63, v10
1178 ; GFX10-NEXT: v_and_b32_e32 v11, 63, v11
1179 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
1180 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
1181 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
1182 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
1183 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
1184 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
1185 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
1186 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
1187 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1188 %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
; v_fshr_i24: calls llvm.fshr.i24 (a non-power-of-two width) and pins the
; lowering for each check prefix. The visible sequences all:
;   1. reduce the shift amount in v2 as
;        amt - (mulhi(amt, 0xaaaaaaab) >> 4) * 24
;      (presumably amt modulo 24 via a magic-number division -- confirm),
;   2. add 8 to that reduced amount,
;   3. shift src1 (v1) left by 8, and
;   4. combine v0 and v1 with a single v_alignbit_b32.
; The SI, VI and GFX9 checks load 0xaaaaaaab into s4 first; the GFX10 checks
; pass the literal directly to v_mul_hi_u32. The add/sub mnemonics differ
; per target (v_*_i32 with vcc, v_*_u32 with vcc, v_*_u32, v_*_nc_u32).
1192 define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
1193 ; SI-LABEL: v_fshr_i24:
1195 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1196 ; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab
1197 ; SI-NEXT: v_mul_hi_u32 v3, v2, s4
1198 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1199 ; SI-NEXT: v_lshrrev_b32_e32 v3, 4, v3
1200 ; SI-NEXT: v_mul_lo_u32 v3, v3, 24
1201 ; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
1202 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2
1203 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
1204 ; SI-NEXT: s_setpc_b64 s[30:31]
1206 ; VI-LABEL: v_fshr_i24:
1208 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1209 ; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab
1210 ; VI-NEXT: v_mul_hi_u32 v3, v2, s4
1211 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1212 ; VI-NEXT: v_lshrrev_b32_e32 v3, 4, v3
1213 ; VI-NEXT: v_mul_lo_u32 v3, v3, 24
1214 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
1215 ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2
1216 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2
1217 ; VI-NEXT: s_setpc_b64 s[30:31]
1219 ; GFX9-LABEL: v_fshr_i24:
1221 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1222 ; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab
1223 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, s4
1224 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1225 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 4, v3
1226 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24
1227 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3
1228 ; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
1229 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2
1230 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1232 ; R600-LABEL: v_fshr_i24:
1237 ; GFX10-LABEL: v_fshr_i24:
1239 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1240 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1241 ; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaaab, v2
1242 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1243 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 4, v3
1244 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24
1245 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3
1246 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2
1247 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2
1248 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1249 %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
; v_fshr_v2i24: calls llvm.fshr.v2i24 and pins the per-lane lowering for each
; check prefix. For each of the two lanes the visible sequences:
;   1. reduce the shift amount (v4 / v5) as
;        amt - (mulhi(amt, 0xaaaaaaab) >> 4) * 24
;      (presumably amt modulo 24 via a magic-number division -- confirm),
;   2. add 8 to the reduced amount,
;   3. shift the src1 lane (v2 / v3) left by 8, and
;   4. produce the result lane (v0 / v1) with one v_alignbit_b32.
; All four visible target sequences load 0xaaaaaaab into s4. The SI, VI and
; GFX9 checks finish lane 0 before lane 1 and reuse v6/v2/v3 as temporaries;
; the GFX10 checks process both lanes in lock-step.
1253 define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
1254 ; SI-LABEL: v_fshr_v2i24:
1256 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1257 ; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab
1258 ; SI-NEXT: v_mul_hi_u32 v6, v4, s4
1259 ; SI-NEXT: v_mul_hi_u32 v7, v5, s4
1260 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1261 ; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
1262 ; SI-NEXT: v_mul_lo_u32 v6, v6, 24
1263 ; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
1264 ; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v7
1265 ; SI-NEXT: v_mul_lo_u32 v6, v6, 24
1266 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4
1267 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
1268 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
1269 ; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v6
1270 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3
1271 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3
1272 ; SI-NEXT: s_setpc_b64 s[30:31]
1274 ; VI-LABEL: v_fshr_v2i24:
1276 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1277 ; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab
1278 ; VI-NEXT: v_mul_hi_u32 v6, v4, s4
1279 ; VI-NEXT: v_mul_hi_u32 v7, v5, s4
1280 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1281 ; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
1282 ; VI-NEXT: v_mul_lo_u32 v6, v6, 24
1283 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
1284 ; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v7
1285 ; VI-NEXT: v_mul_lo_u32 v6, v6, 24
1286 ; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4
1287 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4
1288 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
1289 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v6
1290 ; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3
1291 ; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3
1292 ; VI-NEXT: s_setpc_b64 s[30:31]
1294 ; GFX9-LABEL: v_fshr_v2i24:
1296 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1297 ; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab
1298 ; GFX9-NEXT: v_mul_hi_u32 v6, v4, s4
1299 ; GFX9-NEXT: v_mul_hi_u32 v7, v5, s4
1300 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1301 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6
1302 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
1303 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
1304 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v7
1305 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
1306 ; GFX9-NEXT: v_add_u32_e32 v4, 8, v4
1307 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
1308 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3
1309 ; GFX9-NEXT: v_sub_u32_e32 v3, v5, v6
1310 ; GFX9-NEXT: v_add_u32_e32 v3, 8, v3
1311 ; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3
1312 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1314 ; R600-LABEL: v_fshr_v2i24:
1319 ; GFX10-LABEL: v_fshr_v2i24:
1321 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1322 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1323 ; GFX10-NEXT: s_mov_b32 s4, 0xaaaaaaab
1324 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1325 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, s4
1326 ; GFX10-NEXT: v_mul_hi_u32 v7, v5, s4
1327 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
1328 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 4, v6
1329 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 4, v7
1330 ; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24
1331 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24
1332 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6
1333 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7
1334 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 8, v4
1335 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 8, v5
1336 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
1337 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
1338 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1339 %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)