1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
; Mask <2, 3, undef, undef>: result lanes 0-1 take elements 2 and 3 of %val0;
; lanes 2-3 are left undefined and %val1 does not contribute.
; The checks below expect just one 32-bit load at byte offset 4 and no moves.
6 define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
7 ; GFX9-LABEL: shuffle_v4f16_23uu:
9 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
11 ; GFX9-NEXT: s_waitcnt vmcnt(0)
12 ; GFX9-NEXT: s_setpc_b64 s[30:31]
14 ; GFX10-LABEL: shuffle_v4f16_23uu:
16 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
18 ; GFX10-NEXT: s_waitcnt vmcnt(0)
19 ; GFX10-NEXT: s_setpc_b64 s[30:31]
21 ; GFX11-LABEL: shuffle_v4f16_23uu:
23 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
25 ; GFX11-NEXT: s_waitcnt vmcnt(0)
26 ; GFX11-NEXT: s_setpc_b64 s[30:31]
27 %val0 = load <4 x half>, ptr addrspace(1) %arg0
28 %val1 = load <4 x half>, ptr addrspace(1) %arg1
29 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
30 ret <4 x half> %shuffle
; Mask <2, 3, 4, undef>: result lanes 0-1 take elements 2 and 3 of %val0,
; lane 2 takes element 0 of %val1 (mask index 4), and lane 3 is undefined.
33 define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
34 ; GFX9-LABEL: shuffle_v4f16_234u:
36 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
38 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
39 ; GFX9-NEXT: s_waitcnt vmcnt(1)
40 ; GFX9-NEXT: v_mov_b32_e32 v0, v6
41 ; GFX9-NEXT: s_waitcnt vmcnt(0)
42 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
43 ; GFX9-NEXT: s_setpc_b64 s[30:31]
45 ; GFX10-LABEL: shuffle_v4f16_234u:
47 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
49 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
50 ; GFX10-NEXT: s_waitcnt vmcnt(1)
51 ; GFX10-NEXT: v_mov_b32_e32 v0, v6
52 ; GFX10-NEXT: s_waitcnt vmcnt(0)
53 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
54 ; GFX10-NEXT: s_setpc_b64 s[30:31]
56 ; GFX11-LABEL: shuffle_v4f16_234u:
58 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
60 ; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
61 ; GFX11-NEXT: s_waitcnt vmcnt(0)
62 ; GFX11-NEXT: s_setpc_b64 s[30:31]
63 %val0 = load <4 x half>, ptr addrspace(1) %arg0
64 %val1 = load <4 x half>, ptr addrspace(1) %arg1
65 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
66 ret <4 x half> %shuffle
; Mask <undef, 1, undef, 3>: lanes 1 and 3 keep elements 1 and 3 of %val0 in
; their original positions; lanes 0 and 2 are undefined and %val1 is unused.
; The checks below expect a plain 64-bit load with no extra shuffling code.
69 define <4 x half> @shuffle_v4f16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
70 ; GFX9-LABEL: shuffle_v4f16_u1u3:
72 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
74 ; GFX9-NEXT: s_waitcnt vmcnt(0)
75 ; GFX9-NEXT: s_setpc_b64 s[30:31]
77 ; GFX10-LABEL: shuffle_v4f16_u1u3:
79 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
81 ; GFX10-NEXT: s_waitcnt vmcnt(0)
82 ; GFX10-NEXT: s_setpc_b64 s[30:31]
84 ; GFX11-LABEL: shuffle_v4f16_u1u3:
86 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
88 ; GFX11-NEXT: s_waitcnt vmcnt(0)
89 ; GFX11-NEXT: s_setpc_b64 s[30:31]
90 %val0 = load <4 x half>, ptr addrspace(1) %arg0
91 %val1 = load <4 x half>, ptr addrspace(1) %arg1
92 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
93 ret <4 x half> %shuffle
; Mask <undef, 3, undef, 1>: lane 1 takes element 3 of %val0 and lane 3 takes
; element 1 of %val0; lanes 0 and 2 are undefined and %val1 is unused.
96 define <4 x half> @shuffle_v4f16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
97 ; GFX9-LABEL: shuffle_v4f16_u3u1:
99 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
101 ; GFX9-NEXT: s_waitcnt vmcnt(0)
102 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
103 ; GFX9-NEXT: s_setpc_b64 s[30:31]
105 ; GFX10-LABEL: shuffle_v4f16_u3u1:
107 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
109 ; GFX10-NEXT: s_waitcnt vmcnt(0)
110 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
111 ; GFX10-NEXT: s_setpc_b64 s[30:31]
113 ; GFX11-LABEL: shuffle_v4f16_u3u1:
115 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
117 ; GFX11-NEXT: s_waitcnt vmcnt(0)
118 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
119 ; GFX11-NEXT: s_setpc_b64 s[30:31]
120 %val0 = load <4 x half>, ptr addrspace(1) %arg0
121 %val1 = load <4 x half>, ptr addrspace(1) %arg1
122 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
123 ret <4 x half> %shuffle
; Mask <undef, 3, undef, undef>: only lane 1 is defined, taking element 3 of
; %val0; every other lane is undefined and %val1 is unused.
; The checks below expect just one 32-bit load at byte offset 4.
126 define <4 x half> @shuffle_v4f16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
127 ; GFX9-LABEL: shuffle_v4f16_u3uu:
129 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
131 ; GFX9-NEXT: s_waitcnt vmcnt(0)
132 ; GFX9-NEXT: s_setpc_b64 s[30:31]
134 ; GFX10-LABEL: shuffle_v4f16_u3uu:
136 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
138 ; GFX10-NEXT: s_waitcnt vmcnt(0)
139 ; GFX10-NEXT: s_setpc_b64 s[30:31]
141 ; GFX11-LABEL: shuffle_v4f16_u3uu:
143 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
144 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
145 ; GFX11-NEXT: s_waitcnt vmcnt(0)
146 ; GFX11-NEXT: s_setpc_b64 s[30:31]
147 %val0 = load <4 x half>, ptr addrspace(1) %arg0
148 %val1 = load <4 x half>, ptr addrspace(1) %arg1
149 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
150 ret <4 x half> %shuffle
; Mask <3, undef, 6, undef>: lane 0 takes element 3 of %val0 and lane 2 takes
; element 2 of %val1 (mask index 6); lanes 1 and 3 are undefined.
; The checks below expect a v_alignbit_b32 by 16 to produce the first result
; register.
153 define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
154 ; GFX9-LABEL: shuffle_v4f16_3u6u:
156 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
158 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
159 ; GFX9-NEXT: s_waitcnt vmcnt(1)
160 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v5, 16
161 ; GFX9-NEXT: s_waitcnt vmcnt(0)
162 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
163 ; GFX9-NEXT: s_setpc_b64 s[30:31]
165 ; GFX10-LABEL: shuffle_v4f16_3u6u:
167 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
169 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
170 ; GFX10-NEXT: s_waitcnt vmcnt(1)
171 ; GFX10-NEXT: v_alignbit_b32 v0, s4, v5, 16
172 ; GFX10-NEXT: s_waitcnt vmcnt(0)
173 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
174 ; GFX10-NEXT: s_setpc_b64 s[30:31]
176 ; GFX11-LABEL: shuffle_v4f16_3u6u:
178 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
180 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
181 ; GFX11-NEXT: s_waitcnt vmcnt(1)
182 ; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 16
183 ; GFX11-NEXT: s_waitcnt vmcnt(0)
184 ; GFX11-NEXT: s_setpc_b64 s[30:31]
185 %val0 = load <4 x half>, ptr addrspace(1) %arg0
186 %val1 = load <4 x half>, ptr addrspace(1) %arg1
187 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
188 ret <4 x half> %shuffle
; Mask <3, undef, undef, 7>: lane 0 takes element 3 of %val0 and lane 3 takes
; element 3 of %val1 (mask index 7); lanes 1 and 2 are undefined.
191 define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
192 ; GFX9-LABEL: shuffle_v4f16_3uu7:
194 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
195 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
196 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
197 ; GFX9-NEXT: s_waitcnt vmcnt(1)
198 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v5, 16
199 ; GFX9-NEXT: s_waitcnt vmcnt(0)
200 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
201 ; GFX9-NEXT: s_setpc_b64 s[30:31]
203 ; GFX10-LABEL: shuffle_v4f16_3uu7:
205 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
206 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
207 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
208 ; GFX10-NEXT: s_waitcnt vmcnt(1)
209 ; GFX10-NEXT: v_alignbit_b32 v0, s4, v5, 16
210 ; GFX10-NEXT: s_waitcnt vmcnt(0)
211 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
212 ; GFX10-NEXT: s_setpc_b64 s[30:31]
214 ; GFX11-LABEL: shuffle_v4f16_3uu7:
216 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
218 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
219 ; GFX11-NEXT: s_waitcnt vmcnt(1)
220 ; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 16
221 ; GFX11-NEXT: s_waitcnt vmcnt(0)
222 ; GFX11-NEXT: s_setpc_b64 s[30:31]
223 %val0 = load <4 x half>, ptr addrspace(1) %arg0
224 %val1 = load <4 x half>, ptr addrspace(1) %arg1
225 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
226 ret <4 x half> %shuffle
; Mask <3, 5, undef, 5>: lane 0 takes element 3 of %val0, lanes 1 and 3 both
; take element 1 of %val1 (mask index 5), and lane 2 is undefined.
; The checks below expect a v_perm_b32 with selector 0x7060302 to build the
; first result register.
229 define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
230 ; GFX9-LABEL: shuffle_v4f16_35u5:
232 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
233 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
234 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
235 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
236 ; GFX9-NEXT: s_waitcnt vmcnt(0)
237 ; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
238 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
239 ; GFX9-NEXT: s_setpc_b64 s[30:31]
241 ; GFX10-LABEL: shuffle_v4f16_35u5:
243 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
244 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
245 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
246 ; GFX10-NEXT: s_waitcnt vmcnt(0)
247 ; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x7060302
248 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
249 ; GFX10-NEXT: s_setpc_b64 s[30:31]
251 ; GFX11-LABEL: shuffle_v4f16_35u5:
253 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
255 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
256 ; GFX11-NEXT: s_waitcnt vmcnt(0)
257 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
258 ; GFX11-NEXT: s_setpc_b64 s[30:31]
259 %val0 = load <4 x half>, ptr addrspace(1) %arg0
260 %val1 = load <4 x half>, ptr addrspace(1) %arg1
261 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
262 ret <4 x half> %shuffle
; Mask <3, 5, 7, undef>: lane 0 takes element 3 of %val0, lane 1 takes element
; 1 of %val1 (mask index 5), lane 2 takes element 3 of %val1 (mask index 7),
; and lane 3 is undefined.
265 define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
266 ; GFX9-LABEL: shuffle_v4f16_357u:
268 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
269 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
270 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
271 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
272 ; GFX9-NEXT: s_waitcnt vmcnt(1)
273 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v5, 16
274 ; GFX9-NEXT: s_waitcnt vmcnt(0)
275 ; GFX9-NEXT: v_perm_b32 v0, v4, v6, s4
276 ; GFX9-NEXT: s_setpc_b64 s[30:31]
278 ; GFX10-LABEL: shuffle_v4f16_357u:
280 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
281 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
282 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
283 ; GFX10-NEXT: s_waitcnt vmcnt(1)
284 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16
285 ; GFX10-NEXT: s_waitcnt vmcnt(0)
286 ; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302
287 ; GFX10-NEXT: s_setpc_b64 s[30:31]
289 ; GFX11-LABEL: shuffle_v4f16_357u:
291 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
293 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
294 ; GFX11-NEXT: s_waitcnt vmcnt(1)
295 ; GFX11-NEXT: v_alignbit_b32 v1, s0, v3, 16
296 ; GFX11-NEXT: s_waitcnt vmcnt(0)
297 ; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
298 ; GFX11-NEXT: s_setpc_b64 s[30:31]
299 %val0 = load <4 x half>, ptr addrspace(1) %arg0
300 %val1 = load <4 x half>, ptr addrspace(1) %arg1
301 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
302 ret <4 x half> %shuffle
; Mask <0, 1, 0, 1>: elements 0 and 1 of %val0 are repeated in both halves of
; the result; %val1 does not contribute.
; The checks below expect one 32-bit load followed by a register copy.
305 define <4 x half> @shuffle_v4f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
306 ; GFX9-LABEL: shuffle_v4f16_0101:
308 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
309 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
310 ; GFX9-NEXT: s_waitcnt vmcnt(0)
311 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
312 ; GFX9-NEXT: s_setpc_b64 s[30:31]
314 ; GFX10-LABEL: shuffle_v4f16_0101:
316 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
317 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
318 ; GFX10-NEXT: s_waitcnt vmcnt(0)
319 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
320 ; GFX10-NEXT: s_setpc_b64 s[30:31]
322 ; GFX11-LABEL: shuffle_v4f16_0101:
324 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
326 ; GFX11-NEXT: s_waitcnt vmcnt(0)
327 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
328 ; GFX11-NEXT: s_setpc_b64 s[30:31]
329 %val0 = load <4 x half>, ptr addrspace(1) %arg0
330 %val1 = load <4 x half>, ptr addrspace(1) %arg1
331 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
332 ret <4 x half> %shuffle
; Mask <0, 1, 2, 3>: identity shuffle that returns %val0 unchanged; %val1 does
; not contribute. The checks below expect only a 64-bit load.
335 define <4 x half> @shuffle_v4f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
336 ; GFX9-LABEL: shuffle_v4f16_0123:
338 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
339 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
340 ; GFX9-NEXT: s_waitcnt vmcnt(0)
341 ; GFX9-NEXT: s_setpc_b64 s[30:31]
343 ; GFX10-LABEL: shuffle_v4f16_0123:
345 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
347 ; GFX10-NEXT: s_waitcnt vmcnt(0)
348 ; GFX10-NEXT: s_setpc_b64 s[30:31]
350 ; GFX11-LABEL: shuffle_v4f16_0123:
352 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
353 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
354 ; GFX11-NEXT: s_waitcnt vmcnt(0)
355 ; GFX11-NEXT: s_setpc_b64 s[30:31]
356 %val0 = load <4 x half>, ptr addrspace(1) %arg0
357 %val1 = load <4 x half>, ptr addrspace(1) %arg1
358 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
359 ret <4 x half> %shuffle
; Mask <0, 1, 4, 5>: result lanes 0-1 take elements 0 and 1 of %val0 and lanes
; 2-3 take elements 0 and 1 of %val1 (mask indices 4 and 5).
362 define <4 x half> @shuffle_v4f16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
363 ; GFX9-LABEL: shuffle_v4f16_0145:
365 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
366 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
367 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
368 ; GFX9-NEXT: s_waitcnt vmcnt(1)
369 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
370 ; GFX9-NEXT: s_waitcnt vmcnt(0)
371 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
372 ; GFX9-NEXT: s_setpc_b64 s[30:31]
374 ; GFX10-LABEL: shuffle_v4f16_0145:
376 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
378 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
379 ; GFX10-NEXT: s_waitcnt vmcnt(1)
380 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
381 ; GFX10-NEXT: s_waitcnt vmcnt(0)
382 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
383 ; GFX10-NEXT: s_setpc_b64 s[30:31]
385 ; GFX11-LABEL: shuffle_v4f16_0145:
387 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
388 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
389 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
390 ; GFX11-NEXT: s_waitcnt vmcnt(0)
391 ; GFX11-NEXT: s_setpc_b64 s[30:31]
392 %val0 = load <4 x half>, ptr addrspace(1) %arg0
393 %val1 = load <4 x half>, ptr addrspace(1) %arg1
394 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
395 ret <4 x half> %shuffle
; Mask <0, 1, 6, 7>: result lanes 0-1 take elements 0 and 1 of %val0 and lanes
; 2-3 take elements 2 and 3 of %val1 (mask indices 6 and 7).
398 define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
399 ; GFX9-LABEL: shuffle_v4f16_0167:
401 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
402 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
403 ; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
404 ; GFX9-NEXT: s_waitcnt vmcnt(1)
405 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
406 ; GFX9-NEXT: s_waitcnt vmcnt(0)
407 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
408 ; GFX9-NEXT: s_setpc_b64 s[30:31]
410 ; GFX10-LABEL: shuffle_v4f16_0167:
412 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
413 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
414 ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
415 ; GFX10-NEXT: s_waitcnt vmcnt(1)
416 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
417 ; GFX10-NEXT: s_waitcnt vmcnt(0)
418 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
419 ; GFX10-NEXT: s_setpc_b64 s[30:31]
421 ; GFX11-LABEL: shuffle_v4f16_0167:
423 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
424 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
425 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
426 ; GFX11-NEXT: s_waitcnt vmcnt(0)
427 ; GFX11-NEXT: s_setpc_b64 s[30:31]
428 %val0 = load <4 x half>, ptr addrspace(1) %arg0
429 %val1 = load <4 x half>, ptr addrspace(1) %arg1
430 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
431 ret <4 x half> %shuffle
; Mask <2, 3, 0, 1>: swaps the two element pairs of %val0 (elements 2-3 move
; to lanes 0-1 and elements 0-1 move to lanes 2-3); %val1 does not contribute.
434 define <4 x half> @shuffle_v4f16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
435 ; GFX9-LABEL: shuffle_v4f16_2301:
437 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
438 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
439 ; GFX9-NEXT: s_waitcnt vmcnt(0)
440 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
441 ; GFX9-NEXT: s_setpc_b64 s[30:31]
443 ; GFX10-LABEL: shuffle_v4f16_2301:
445 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
446 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
447 ; GFX10-NEXT: s_waitcnt vmcnt(0)
448 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
449 ; GFX10-NEXT: s_setpc_b64 s[30:31]
451 ; GFX11-LABEL: shuffle_v4f16_2301:
453 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
454 ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
455 ; GFX11-NEXT: s_waitcnt vmcnt(0)
456 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
457 ; GFX11-NEXT: s_setpc_b64 s[30:31]
458 %val0 = load <4 x half>, ptr addrspace(1) %arg0
459 %val1 = load <4 x half>, ptr addrspace(1) %arg1
460 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
461 ret <4 x half> %shuffle
; Mask <2, 3, 2, 3>: elements 2 and 3 of %val0 are repeated in both halves of
; the result; %val1 does not contribute.
; The checks below expect one 32-bit load at byte offset 4 plus a register copy.
464 define <4 x half> @shuffle_v4f16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
465 ; GFX9-LABEL: shuffle_v4f16_2323:
467 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
468 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
469 ; GFX9-NEXT: s_waitcnt vmcnt(0)
470 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
471 ; GFX9-NEXT: s_setpc_b64 s[30:31]
473 ; GFX10-LABEL: shuffle_v4f16_2323:
475 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
476 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
477 ; GFX10-NEXT: s_waitcnt vmcnt(0)
478 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
479 ; GFX10-NEXT: s_setpc_b64 s[30:31]
481 ; GFX11-LABEL: shuffle_v4f16_2323:
483 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
484 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
485 ; GFX11-NEXT: s_waitcnt vmcnt(0)
486 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
487 ; GFX11-NEXT: s_setpc_b64 s[30:31]
488 %val0 = load <4 x half>, ptr addrspace(1) %arg0
489 %val1 = load <4 x half>, ptr addrspace(1) %arg1
490 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
491 ret <4 x half> %shuffle
; Mask <2, 3, 4, 5>: result lanes 0-1 take elements 2 and 3 of %val0 and lanes
; 2-3 take elements 0 and 1 of %val1 (mask indices 4 and 5).
494 define <4 x half> @shuffle_v4f16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
495 ; GFX9-LABEL: shuffle_v4f16_2345:
497 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
498 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
499 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
500 ; GFX9-NEXT: s_waitcnt vmcnt(1)
501 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
502 ; GFX9-NEXT: s_waitcnt vmcnt(0)
503 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
504 ; GFX9-NEXT: s_setpc_b64 s[30:31]
506 ; GFX10-LABEL: shuffle_v4f16_2345:
508 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
509 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
510 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
511 ; GFX10-NEXT: s_waitcnt vmcnt(1)
512 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
513 ; GFX10-NEXT: s_waitcnt vmcnt(0)
514 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
515 ; GFX10-NEXT: s_setpc_b64 s[30:31]
517 ; GFX11-LABEL: shuffle_v4f16_2345:
519 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
521 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
522 ; GFX11-NEXT: s_waitcnt vmcnt(0)
523 ; GFX11-NEXT: s_setpc_b64 s[30:31]
524 %val0 = load <4 x half>, ptr addrspace(1) %arg0
525 %val1 = load <4 x half>, ptr addrspace(1) %arg1
526 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
527 ret <4 x half> %shuffle
; Mask <2, 3, 6, 7>: result lanes 0-1 take elements 2 and 3 of %val0 and lanes
; 2-3 take elements 2 and 3 of %val1 (mask indices 6 and 7).
530 define <4 x half> @shuffle_v4f16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
531 ; GFX9-LABEL: shuffle_v4f16_2367:
533 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
535 ; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
536 ; GFX9-NEXT: s_waitcnt vmcnt(1)
537 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
538 ; GFX9-NEXT: s_waitcnt vmcnt(0)
539 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
540 ; GFX9-NEXT: s_setpc_b64 s[30:31]
542 ; GFX10-LABEL: shuffle_v4f16_2367:
544 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
545 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
546 ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
547 ; GFX10-NEXT: s_waitcnt vmcnt(1)
548 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
549 ; GFX10-NEXT: s_waitcnt vmcnt(0)
550 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
551 ; GFX10-NEXT: s_setpc_b64 s[30:31]
553 ; GFX11-LABEL: shuffle_v4f16_2367:
555 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
556 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
557 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
558 ; GFX11-NEXT: s_waitcnt vmcnt(0)
559 ; GFX11-NEXT: s_setpc_b64 s[30:31]
560 %val0 = load <4 x half>, ptr addrspace(1) %arg0
561 %val1 = load <4 x half>, ptr addrspace(1) %arg1
562 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
563 ret <4 x half> %shuffle
; Mask <4, 5, 0, 1>: result lanes 0-1 take elements 0 and 1 of %val1 (mask
; indices 4 and 5) and lanes 2-3 take elements 0 and 1 of %val0.
566 define <4 x half> @shuffle_v4f16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
567 ; GFX9-LABEL: shuffle_v4f16_4501:
569 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
570 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
571 ; GFX9-NEXT: global_load_dword v5, v[0:1], off
572 ; GFX9-NEXT: s_waitcnt vmcnt(1)
573 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
574 ; GFX9-NEXT: s_waitcnt vmcnt(0)
575 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
576 ; GFX9-NEXT: s_setpc_b64 s[30:31]
578 ; GFX10-LABEL: shuffle_v4f16_4501:
580 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
581 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
582 ; GFX10-NEXT: global_load_dword v5, v[0:1], off
583 ; GFX10-NEXT: s_waitcnt vmcnt(1)
584 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
585 ; GFX10-NEXT: s_waitcnt vmcnt(0)
586 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
587 ; GFX10-NEXT: s_setpc_b64 s[30:31]
589 ; GFX11-LABEL: shuffle_v4f16_4501:
591 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
592 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off
593 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off
594 ; GFX11-NEXT: s_waitcnt vmcnt(1)
595 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
596 ; GFX11-NEXT: s_waitcnt vmcnt(0)
597 ; GFX11-NEXT: s_setpc_b64 s[30:31]
598 %val0 = load <4 x half>, ptr addrspace(1) %arg0
599 %val1 = load <4 x half>, ptr addrspace(1) %arg1
600 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
601 ret <4 x half> %shuffle
; Mask <4, 5, 2, 3>: result lanes 0-1 take elements 0 and 1 of %val1 (mask
; indices 4 and 5) and lanes 2-3 keep elements 2 and 3 of %val0.
604 define <4 x half> @shuffle_v4f16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
605 ; GFX9-LABEL: shuffle_v4f16_4523:
607 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
608 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
609 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
610 ; GFX9-NEXT: s_waitcnt vmcnt(1)
611 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
612 ; GFX9-NEXT: s_waitcnt vmcnt(0)
613 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
614 ; GFX9-NEXT: s_setpc_b64 s[30:31]
616 ; GFX10-LABEL: shuffle_v4f16_4523:
618 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
619 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
620 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
621 ; GFX10-NEXT: s_waitcnt vmcnt(1)
622 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
623 ; GFX10-NEXT: s_waitcnt vmcnt(0)
624 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
625 ; GFX10-NEXT: s_setpc_b64 s[30:31]
627 ; GFX11-LABEL: shuffle_v4f16_4523:
629 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
630 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off
631 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
632 ; GFX11-NEXT: s_waitcnt vmcnt(1)
633 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
634 ; GFX11-NEXT: s_waitcnt vmcnt(0)
635 ; GFX11-NEXT: s_setpc_b64 s[30:31]
636 %val0 = load <4 x half>, ptr addrspace(1) %arg0
637 %val1 = load <4 x half>, ptr addrspace(1) %arg1
638 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
639 ret <4 x half> %shuffle
; Mask <4, 5, 4, 5>: elements 0 and 1 of %val1 (mask indices 4 and 5) are
; repeated in both halves of the result; %val0 does not contribute.
642 define <4 x half> @shuffle_v4f16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
643 ; GFX9-LABEL: shuffle_v4f16_4545:
645 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
646 ; GFX9-NEXT: global_load_dword v0, v[2:3], off
647 ; GFX9-NEXT: s_waitcnt vmcnt(0)
648 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
649 ; GFX9-NEXT: s_setpc_b64 s[30:31]
651 ; GFX10-LABEL: shuffle_v4f16_4545:
653 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
654 ; GFX10-NEXT: global_load_dword v0, v[2:3], off
655 ; GFX10-NEXT: s_waitcnt vmcnt(0)
656 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
657 ; GFX10-NEXT: s_setpc_b64 s[30:31]
659 ; GFX11-LABEL: shuffle_v4f16_4545:
661 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
662 ; GFX11-NEXT: global_load_b32 v0, v[2:3], off
663 ; GFX11-NEXT: s_waitcnt vmcnt(0)
664 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
665 ; GFX11-NEXT: s_setpc_b64 s[30:31]
666 %val0 = load <4 x half>, ptr addrspace(1) %arg0
667 %val1 = load <4 x half>, ptr addrspace(1) %arg1
668 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
669 ret <4 x half> %shuffle
; Mask <4, 5, 6, 7>: selects all four elements of %val1 in order, so the
; result equals %val1; %val0 does not contribute.
; The checks below expect only a 64-bit load.
672 define <4 x half> @shuffle_v4f16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
673 ; GFX9-LABEL: shuffle_v4f16_4567:
675 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
676 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
677 ; GFX9-NEXT: s_waitcnt vmcnt(0)
678 ; GFX9-NEXT: s_setpc_b64 s[30:31]
680 ; GFX10-LABEL: shuffle_v4f16_4567:
682 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
683 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
684 ; GFX10-NEXT: s_waitcnt vmcnt(0)
685 ; GFX10-NEXT: s_setpc_b64 s[30:31]
687 ; GFX11-LABEL: shuffle_v4f16_4567:
689 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
690 ; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
691 ; GFX11-NEXT: s_waitcnt vmcnt(0)
692 ; GFX11-NEXT: s_setpc_b64 s[30:31]
693 %val0 = load <4 x half>, ptr addrspace(1) %arg0
694 %val1 = load <4 x half>, ptr addrspace(1) %arg1
695 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
696 ret <4 x half> %shuffle
; Mask <6, 7, 0, 1>: result lanes 0-1 take elements 2 and 3 of %val1 (mask
; indices 6 and 7) and lanes 2-3 take elements 0 and 1 of %val0.
699 define <4 x half> @shuffle_v4f16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
700 ; GFX9-LABEL: shuffle_v4f16_6701:
702 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
704 ; GFX9-NEXT: global_load_dword v5, v[0:1], off
705 ; GFX9-NEXT: s_waitcnt vmcnt(1)
706 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
707 ; GFX9-NEXT: s_waitcnt vmcnt(0)
708 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
709 ; GFX9-NEXT: s_setpc_b64 s[30:31]
711 ; GFX10-LABEL: shuffle_v4f16_6701:
713 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
714 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
715 ; GFX10-NEXT: global_load_dword v5, v[0:1], off
716 ; GFX10-NEXT: s_waitcnt vmcnt(1)
717 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
718 ; GFX10-NEXT: s_waitcnt vmcnt(0)
719 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
720 ; GFX10-NEXT: s_setpc_b64 s[30:31]
722 ; GFX11-LABEL: shuffle_v4f16_6701:
724 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
725 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
726 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off
727 ; GFX11-NEXT: s_waitcnt vmcnt(1)
728 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
729 ; GFX11-NEXT: s_waitcnt vmcnt(0)
730 ; GFX11-NEXT: s_setpc_b64 s[30:31]
731 %val0 = load <4 x half>, ptr addrspace(1) %arg0
732 %val1 = load <4 x half>, ptr addrspace(1) %arg1
733 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
734 ret <4 x half> %shuffle
; Mask <6, 7, 2, 3>: result lanes 0-1 take elements 2 and 3 of %val1 (mask
; indices 6 and 7) and lanes 2-3 keep elements 2 and 3 of %val0.
737 define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
738 ; GFX9-LABEL: shuffle_v4f16_6723:
740 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
741 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
742 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
743 ; GFX9-NEXT: s_waitcnt vmcnt(1)
744 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
745 ; GFX9-NEXT: s_waitcnt vmcnt(0)
746 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
747 ; GFX9-NEXT: s_setpc_b64 s[30:31]
749 ; GFX10-LABEL: shuffle_v4f16_6723:
751 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
753 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
754 ; GFX10-NEXT: s_waitcnt vmcnt(1)
755 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
756 ; GFX10-NEXT: s_waitcnt vmcnt(0)
757 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
758 ; GFX10-NEXT: s_setpc_b64 s[30:31]
760 ; GFX11-LABEL: shuffle_v4f16_6723:
762 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
763 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
764 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
765 ; GFX11-NEXT: s_waitcnt vmcnt(1)
766 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
767 ; GFX11-NEXT: s_waitcnt vmcnt(0)
768 ; GFX11-NEXT: s_setpc_b64 s[30:31]
769 %val0 = load <4 x half>, ptr addrspace(1) %arg0
770 %val1 = load <4 x half>, ptr addrspace(1) %arg1
771 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
772 ret <4 x half> %shuffle
; Mask <6, 7, 4, 5>: swaps the two element pairs of %val1 (its elements 2-3
; move to lanes 0-1 and its elements 0-1 move to lanes 2-3); %val0 does not
; contribute.
775 define <4 x half> @shuffle_v4f16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
776 ; GFX9-LABEL: shuffle_v4f16_6745:
778 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
779 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
780 ; GFX9-NEXT: s_waitcnt vmcnt(0)
781 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
782 ; GFX9-NEXT: s_setpc_b64 s[30:31]
784 ; GFX10-LABEL: shuffle_v4f16_6745:
786 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
787 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
788 ; GFX10-NEXT: s_waitcnt vmcnt(0)
789 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
790 ; GFX10-NEXT: s_setpc_b64 s[30:31]
792 ; GFX11-LABEL: shuffle_v4f16_6745:
794 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
795 ; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
796 ; GFX11-NEXT: s_waitcnt vmcnt(0)
797 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
798 ; GFX11-NEXT: s_setpc_b64 s[30:31]
799 %val0 = load <4 x half>, ptr addrspace(1) %arg0
800 %val1 = load <4 x half>, ptr addrspace(1) %arg1
801 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
802 ret <4 x half> %shuffle
; Mask <6, 7, 6, 7>: elements 2 and 3 of %val1 (mask indices 6 and 7) are
; repeated in both halves of the result; %val0 does not contribute.
805 define <4 x half> @shuffle_v4f16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
806 ; GFX9-LABEL: shuffle_v4f16_6767:
808 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
809 ; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4
810 ; GFX9-NEXT: s_waitcnt vmcnt(0)
811 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
812 ; GFX9-NEXT: s_setpc_b64 s[30:31]
814 ; GFX10-LABEL: shuffle_v4f16_6767:
816 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
817 ; GFX10-NEXT: global_load_dword v0, v[2:3], off offset:4
818 ; GFX10-NEXT: s_waitcnt vmcnt(0)
819 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
820 ; GFX10-NEXT: s_setpc_b64 s[30:31]
822 ; GFX11-LABEL: shuffle_v4f16_6767:
824 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
825 ; GFX11-NEXT: global_load_b32 v0, v[2:3], off offset:4
826 ; GFX11-NEXT: s_waitcnt vmcnt(0)
827 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
828 ; GFX11-NEXT: s_setpc_b64 s[30:31]
829 %val0 = load <4 x half>, ptr addrspace(1) %arg0
830 %val1 = load <4 x half>, ptr addrspace(1) %arg1
831 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
832 ret <4 x half> %shuffle
; Mask <2, 3, 5, 6>: the first result pair is elements 2-3 of %val0; the second
; pair is elements 1-2 of %val1, which straddles a 32-bit boundary of that
; vector. The assertions expect the second pair to be formed with
; v_alignbit_b32 using a shift of 16.
835 define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
836 ; GFX9-LABEL: shuffle_v4f16_2356:
838 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
839 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
840 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
841 ; GFX9-NEXT: s_waitcnt vmcnt(1)
842 ; GFX9-NEXT: v_alignbit_b32 v1, v6, v5, 16
843 ; GFX9-NEXT: s_waitcnt vmcnt(0)
844 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
845 ; GFX9-NEXT: s_setpc_b64 s[30:31]
847 ; GFX10-LABEL: shuffle_v4f16_2356:
849 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
850 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
851 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
852 ; GFX10-NEXT: s_waitcnt vmcnt(1)
853 ; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16
854 ; GFX10-NEXT: s_waitcnt vmcnt(0)
855 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
856 ; GFX10-NEXT: s_setpc_b64 s[30:31]
858 ; GFX11-LABEL: shuffle_v4f16_2356:
860 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
861 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
862 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
863 ; GFX11-NEXT: s_waitcnt vmcnt(1)
864 ; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16
865 ; GFX11-NEXT: s_waitcnt vmcnt(0)
866 ; GFX11-NEXT: s_setpc_b64 s[30:31]
867 %val0 = load <4 x half>, ptr addrspace(1) %arg0
868 %val1 = load <4 x half>, ptr addrspace(1) %arg1
869 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
870 ret <4 x half> %shuffle
; Mask <5, 6, 2, 3>: the first result pair is elements 1-2 of %val1 (straddling
; a 32-bit boundary of that vector); the second pair is elements 2-3 of %val0.
; The assertions expect v_alignbit_b32 with a shift of 16 to produce the first
; pair in v0.
873 define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
874 ; GFX9-LABEL: shuffle_v4f16_5623:
876 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
877 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
878 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
879 ; GFX9-NEXT: s_waitcnt vmcnt(1)
880 ; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16
881 ; GFX9-NEXT: s_waitcnt vmcnt(0)
882 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
883 ; GFX9-NEXT: s_setpc_b64 s[30:31]
885 ; GFX10-LABEL: shuffle_v4f16_5623:
887 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
888 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
889 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
890 ; GFX10-NEXT: s_waitcnt vmcnt(1)
891 ; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16
892 ; GFX10-NEXT: s_waitcnt vmcnt(0)
893 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
894 ; GFX10-NEXT: s_setpc_b64 s[30:31]
896 ; GFX11-LABEL: shuffle_v4f16_5623:
898 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
899 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
900 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
901 ; GFX11-NEXT: s_waitcnt vmcnt(1)
902 ; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16
903 ; GFX11-NEXT: s_waitcnt vmcnt(0)
904 ; GFX11-NEXT: s_setpc_b64 s[30:31]
905 %val0 = load <4 x half>, ptr addrspace(1) %arg0
906 %val1 = load <4 x half>, ptr addrspace(1) %arg1
907 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
908 ret <4 x half> %shuffle
; Mask <3, 4, 5, 6>: a contiguous four-element window starting at the last
; element of %val0 and continuing into elements 0-2 of %val1. Neither result
; pair is 32-bit aligned in its source, and the assertions expect two
; v_alignbit_b32 instructions with a shift of 16.
911 define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
912 ; GFX9-LABEL: shuffle_v4f16_3456:
914 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
915 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
916 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
917 ; GFX9-NEXT: s_waitcnt vmcnt(1)
918 ; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
919 ; GFX9-NEXT: s_waitcnt vmcnt(0)
920 ; GFX9-NEXT: v_alignbit_b32 v0, v4, v6, 16
921 ; GFX9-NEXT: s_setpc_b64 s[30:31]
923 ; GFX10-LABEL: shuffle_v4f16_3456:
925 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
926 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
927 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
928 ; GFX10-NEXT: s_waitcnt vmcnt(1)
929 ; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
930 ; GFX10-NEXT: s_waitcnt vmcnt(0)
931 ; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16
932 ; GFX10-NEXT: s_setpc_b64 s[30:31]
934 ; GFX11-LABEL: shuffle_v4f16_3456:
936 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
937 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
938 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
939 ; GFX11-NEXT: s_waitcnt vmcnt(1)
940 ; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16
941 ; GFX11-NEXT: s_waitcnt vmcnt(0)
942 ; GFX11-NEXT: v_alignbit_b32 v0, v2, v0, 16
943 ; GFX11-NEXT: s_setpc_b64 s[30:31]
944 %val0 = load <4 x half>, ptr addrspace(1) %arg0
945 %val1 = load <4 x half>, ptr addrspace(1) %arg1
946 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
947 ret <4 x half> %shuffle
; Mask <5, 6, 3, 4>: the first result pair is elements 1-2 of %val1; the second
; pair is element 3 of %val0 followed by element 0 of %val1. Neither pair is
; 32-bit aligned in its source, and the assertions expect two v_alignbit_b32
; instructions with a shift of 16.
950 define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
951 ; GFX9-LABEL: shuffle_v4f16_5634:
953 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
954 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
955 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
956 ; GFX9-NEXT: s_waitcnt vmcnt(1)
957 ; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
958 ; GFX9-NEXT: s_waitcnt vmcnt(0)
959 ; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16
960 ; GFX9-NEXT: s_setpc_b64 s[30:31]
962 ; GFX10-LABEL: shuffle_v4f16_5634:
964 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
965 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
966 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
967 ; GFX10-NEXT: s_waitcnt vmcnt(1)
968 ; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
969 ; GFX10-NEXT: s_waitcnt vmcnt(0)
970 ; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16
971 ; GFX10-NEXT: s_setpc_b64 s[30:31]
973 ; GFX11-LABEL: shuffle_v4f16_5634:
975 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
976 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
977 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
978 ; GFX11-NEXT: s_waitcnt vmcnt(1)
979 ; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16
980 ; GFX11-NEXT: s_waitcnt vmcnt(0)
981 ; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16
982 ; GFX11-NEXT: s_setpc_b64 s[30:31]
983 %val0 = load <4 x half>, ptr addrspace(1) %arg0
984 %val1 = load <4 x half>, ptr addrspace(1) %arg1
985 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
986 ret <4 x half> %shuffle
; Mask <5, 7, 3, 4>: the first result pair is elements 1 and 3 of %val1 (two
; non-adjacent elements); the second pair is element 3 of %val0 followed by
; element 0 of %val1. The assertions expect v_perm_b32 with selector 0x7060302
; for the first pair and v_alignbit_b32 with a shift of 16 for the second.
; In the gfx900 assertions the selector is first moved into an SGPR; the
; gfx1010 and gfx1100 assertions use it as an inline literal.
989 define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
990 ; GFX9-LABEL: shuffle_v4f16_5734:
992 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
993 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
994 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
995 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
996 ; GFX9-NEXT: s_waitcnt vmcnt(1)
997 ; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
998 ; GFX9-NEXT: s_waitcnt vmcnt(0)
999 ; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16
1000 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1002 ; GFX10-LABEL: shuffle_v4f16_5734:
1004 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1005 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
1006 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
1007 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1008 ; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302
1009 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1010 ; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16
1011 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1013 ; GFX11-LABEL: shuffle_v4f16_5734:
1015 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1016 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
1017 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
1018 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1019 ; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
1020 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1021 ; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16
1022 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1023 %val0 = load <4 x half>, ptr addrspace(1) %arg0
1024 %val1 = load <4 x half>, ptr addrspace(1) %arg1
1025 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
1026 ret <4 x half> %shuffle
; Integer (<4 x i16>) counterpart of shuffle_v4f16_2356, using the same mask
; <2, 3, 5, 6>: elements 2-3 of %val0 followed by elements 1-2 of %val1. The
; assertions expect the same instruction sequence as the half-precision test,
; including v_alignbit_b32 with a shift of 16 for the second pair.
1029 define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1030 ; GFX9-LABEL: shuffle_v4i16_2356:
1032 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1033 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
1034 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
1035 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1036 ; GFX9-NEXT: v_alignbit_b32 v1, v6, v5, 16
1037 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1038 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
1039 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1041 ; GFX10-LABEL: shuffle_v4i16_2356:
1043 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1044 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
1045 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
1046 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1047 ; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16
1048 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1049 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
1050 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1052 ; GFX11-LABEL: shuffle_v4i16_2356:
1054 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1055 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
1056 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
1057 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1058 ; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16
1059 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1060 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1061 %val0 = load <4 x i16>, ptr addrspace(1) %arg0
1062 %val1 = load <4 x i16>, ptr addrspace(1) %arg1
1063 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
1064 ret <4 x i16> %shuffle
; <4 x i16> shuffle with mask <0, 1, 6, 7>: elements 0-1 of %val0 followed by
; elements 2-3 of %val1. Both pairs are 32-bit aligned in their sources; the
; assertions expect two dword loads (the second at offset:4) and no
; permute/align instructions.
1067 define <4 x i16> @shuffle_v4i16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1068 ; GFX9-LABEL: shuffle_v4i16_0167:
1070 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1071 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1072 ; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
1073 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1074 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
1075 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1076 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
1077 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1079 ; GFX10-LABEL: shuffle_v4i16_0167:
1081 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1082 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1083 ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
1084 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1085 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
1086 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1087 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
1088 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1090 ; GFX11-LABEL: shuffle_v4i16_0167:
1092 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1093 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1094 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
1095 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1096 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1097 %val0 = load <4 x i16>, ptr addrspace(1) %arg0
1098 %val1 = load <4 x i16>, ptr addrspace(1) %arg1
1099 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
1100 ret <4 x i16> %shuffle
; Splat: the zeroinitializer mask selects element 0 of %val0 for all four
; result elements; %val1 is not selected. The assertions expect v_perm_b32 with
; selector 0x5040100 (both sources the same register) followed by a v_mov_b32
; that copies the result into v1.
1103 define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1104 ; GFX9-LABEL: shuffle_v4f16_0000:
1106 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1107 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1108 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
1109 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1110 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
1111 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1112 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1114 ; GFX10-LABEL: shuffle_v4f16_0000:
1116 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1117 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1118 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1119 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
1120 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1121 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1123 ; GFX11-LABEL: shuffle_v4f16_0000:
1125 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1126 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
1127 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1128 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
1129 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1130 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1131 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1132 %val0 = load <4 x half>, ptr addrspace(1) %arg0
1133 %val1 = load <4 x half>, ptr addrspace(1) %arg1
1134 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
1135 ret <4 x half> %shuffle
; Mask <1, 0, 1, 0>: elements 0 and 1 of %val0 in swapped order, repeated in
; both halves of the result; %val1 is not selected. The assertions expect the
; swap to be done with v_alignbit_b32 (same register for both sources, shift
; of 16) followed by a v_mov_b32 that copies the result into v1.
1138 define <4 x half> @shuffle_v4f16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1139 ; GFX9-LABEL: shuffle_v4f16_1010:
1141 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1142 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1143 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1144 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16
1145 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1146 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1148 ; GFX10-LABEL: shuffle_v4f16_1010:
1150 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1151 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1152 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1153 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
1154 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1155 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1157 ; GFX11-LABEL: shuffle_v4f16_1010:
1159 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1160 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
1161 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1162 ; GFX11-NEXT: v_alignbit_b32 v0, v0, v0, 16
1163 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1164 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1165 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1166 %val0 = load <4 x half>, ptr addrspace(1) %arg0
1167 %val1 = load <4 x half>, ptr addrspace(1) %arg1
1168 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
1169 ret <4 x half> %shuffle
; Mask <1, 1, 0, 0>: element 1 of %val0 duplicated into the first result pair
; and element 0 of %val0 duplicated into the second; %val1 is not selected.
; The assertions expect two v_perm_b32 instructions on the same loaded
; register, with selectors 0x7060302 and 0x5040100 respectively.
1172 define <4 x half> @shuffle_v4f16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1173 ; GFX9-LABEL: shuffle_v4f16_1100:
1175 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1176 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
1177 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1178 ; GFX9-NEXT: s_mov_b32 s5, 0x5040100
1179 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1180 ; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4
1181 ; GFX9-NEXT: v_perm_b32 v1, v1, v1, s5
1182 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1184 ; GFX10-LABEL: shuffle_v4f16_1100:
1186 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1187 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
1188 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1189 ; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302
1190 ; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100
1191 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1193 ; GFX11-LABEL: shuffle_v4f16_1100:
1195 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1196 ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
1197 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1198 ; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302
1199 ; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100
1200 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1201 %val0 = load <4 x half>, ptr addrspace(1) %arg0
1202 %val1 = load <4 x half>, ptr addrspace(1) %arg1
1203 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
1204 ret <4 x half> %shuffle
; Mask <6, 1, 6, 1>: each result pair is element 2 of %val1 followed by
; element 1 of %val0, so a single 32-bit value mixes one element from each
; source. The assertions expect that value to be built with v_bfi_b32 using
; the bit mask 0xffff, then copied into v1 with v_mov_b32.
1207 define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1208 ; GFX9-LABEL: shuffle_v4f16_6161:
1210 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1211 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1212 ; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
1213 ; GFX9-NEXT: s_mov_b32 s4, 0xffff
1214 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1215 ; GFX9-NEXT: v_bfi_b32 v0, s4, v5, v4
1216 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1217 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1219 ; GFX10-LABEL: shuffle_v4f16_6161:
1221 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1222 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1223 ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
1224 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1225 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v5, v4
1226 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1227 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1229 ; GFX11-LABEL: shuffle_v4f16_6161:
1231 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1232 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1233 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
1234 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1235 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
1236 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1237 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1238 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1239 %val0 = load <4 x half>, ptr addrspace(1) %arg0
1240 %val1 = load <4 x half>, ptr addrspace(1) %arg1
1241 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
1242 ret <4 x half> %shuffle
; Mask <2, 3, 3, 3>: the first result pair is elements 2-3 of %val0 unchanged;
; the second pair is element 3 of %val0 duplicated. %val1 is not selected.
; The assertions expect a single dword load at offset:4 and one v_perm_b32
; with selector 0x7060302 to form the second pair.
1245 define <4 x half> @shuffle_v4f16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1246 ; GFX9-LABEL: shuffle_v4f16_2333:
1248 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1249 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
1250 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1251 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1252 ; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4
1253 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1255 ; GFX10-LABEL: shuffle_v4f16_2333:
1257 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1258 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
1259 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1260 ; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060302
1261 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1263 ; GFX11-LABEL: shuffle_v4f16_2333:
1265 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1266 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
1267 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1268 ; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060302
1269 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1270 %val0 = load <4 x half>, ptr addrspace(1) %arg0
1271 %val1 = load <4 x half>, ptr addrspace(1) %arg1
1272 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1273 ret <4 x half> %shuffle
; As written, this test uses the mask <2, 3, 3, 3> -- the same mask as
; shuffle_v4f16_2333 -- so only %val0 elements are selected and the assertions
; are identical to that test apart from the label.
; NOTE(review): the function name suggests the mask <6, 6, 6, 7> (elements of
; %val1) was intended. Correcting the mask requires regenerating the assertions
; with utils/update_llc_test_checks.py; confirm the intent before changing it.
1276 define <4 x half> @shuffle_v4f16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1277 ; GFX9-LABEL: shuffle_v4f16_6667:
1279 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1280 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
1281 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1282 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1283 ; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4
1284 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1286 ; GFX10-LABEL: shuffle_v4f16_6667:
1288 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1289 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
1290 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1291 ; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060302
1292 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1294 ; GFX11-LABEL: shuffle_v4f16_6667:
1296 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1297 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
1298 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1299 ; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060302
1300 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1301 %val0 = load <4 x half>, ptr addrspace(1) %arg0
1302 %val1 = load <4 x half>, ptr addrspace(1) %arg1
1303 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1304 ret <4 x half> %shuffle
; Narrowing shuffle: two <8 x half> inputs, <4 x half> result. Mask
; <0, 1, 0, 1> repeats elements 0-1 of %val0 in both halves of the result;
; %val1 is not selected. The assertions expect a single dword load and a
; v_mov_b32 that duplicates it into v1.
1307 define <4 x half> @shuffle_v8f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1308 ; GFX9-LABEL: shuffle_v8f16_0101:
1310 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1311 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1312 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1313 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1314 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1316 ; GFX10-LABEL: shuffle_v8f16_0101:
1318 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1319 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1320 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1321 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1322 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1324 ; GFX11-LABEL: shuffle_v8f16_0101:
1326 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1327 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1328 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1329 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1330 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1331 %val0 = load <8 x half>, ptr addrspace(1) %arg0
1332 %val1 = load <8 x half>, ptr addrspace(1) %arg1
1333 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1334 ret <4 x half> %shuffle
; Narrowing shuffle: two <8 x half> inputs, <4 x half> result. Mask
; <0, 1, 2, 3> extracts the first four elements of %val0 in order; %val1 is
; not selected. The assertions expect only a single 64-bit load, with no
; further ALU instructions before the return.
1337 define <4 x half> @shuffle_v8f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1338 ; GFX9-LABEL: shuffle_v8f16_0123:
1340 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1341 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1342 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1343 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1345 ; GFX10-LABEL: shuffle_v8f16_0123:
1347 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1348 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1349 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1350 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1352 ; GFX11-LABEL: shuffle_v8f16_0123:
1354 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1355 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
1356 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1357 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1358 %val0 = load <8 x half>, ptr addrspace(1) %arg0
1359 %val1 = load <8 x half>, ptr addrspace(1) %arg1
1360 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1361 ret <4 x half> %shuffle
; Narrowing shuffle: two <8 x half> inputs (shuffle indices 0-7 address %val0,
; 8-15 address %val1), <4 x half> result. Mask <4, 5, 8, 9> takes elements 4-5
; of %val0 followed by elements 0-1 of %val1. The assertions expect two dword
; loads (offset:8 for the first, no offset for the second) and no
; permute/align instructions.
1364 define <4 x half> @shuffle_v8f16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1365 ; GFX9-LABEL: shuffle_v8f16_4589:
1367 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1368 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8
1369 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
1370 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1371 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
1372 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1373 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
1374 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1376 ; GFX10-LABEL: shuffle_v8f16_4589:
1378 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1379 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8
1380 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
1381 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1382 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
1383 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1384 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
1385 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1387 ; GFX11-LABEL: shuffle_v8f16_4589:
1389 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1390 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8
1391 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
1392 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1393 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1394 %val0 = load <8 x half>, ptr addrspace(1) %arg0
1395 %val1 = load <8 x half>, ptr addrspace(1) %arg1
1396 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
1397 ret <4 x half> %shuffle
; Narrowing shuffle: two <8 x half> inputs (shuffle indices 0-7 address %val0,
; 8-15 address %val1), <4 x half> result. Mask <10, 11, 2, 3> takes elements
; 2-3 of %val1 followed by elements 2-3 of %val0. The assertions expect two
; dword loads, each at offset:4, and no permute/align instructions.
1400 define <4 x half> @shuffle_v8f16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1401 ; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
1403 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1404 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
1405 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
1406 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1407 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
1408 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1409 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
1410 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1412 ; GFX10-LABEL: shuffle_v8f16_10_11_2_3:
1414 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1415 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
1416 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
1417 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1418 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
1419 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1420 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
1421 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1423 ; GFX11-LABEL: shuffle_v8f16_10_11_2_3:
1425 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1426 ; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
1427 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
1428 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1429 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
1430 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1431 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1432 %val0 = load <8 x half>, ptr addrspace(1) %arg0
1433 %val1 = load <8 x half>, ptr addrspace(1) %arg1
1434 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
1435 ret <4 x half> %shuffle
; Narrowing shuffle: two <8 x half> inputs (shuffle indices 0-7 address %val0,
; 8-15 address %val1), <4 x half> result. Mask <13, 14, 2, 3> takes elements
; 5-6 of %val1 (a pair that straddles a 32-bit boundary of that vector)
; followed by elements 2-3 of %val0. The assertions expect a 64-bit load at
; offset:8 combined with v_alignbit_b32 (shift of 16) for the first pair, and
; a dword load at offset:4 for the second.
1438 define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1439 ; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
1441 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1442 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
1443 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
1444 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1445 ; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16
1446 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1447 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
1448 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1450 ; GFX10-LABEL: shuffle_v8f16_13_14_2_3:
1452 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1453 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
1454 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
1455 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1456 ; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16
1457 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1458 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
1459 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1461 ; GFX11-LABEL: shuffle_v8f16_13_14_2_3:
1463 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1464 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8
1465 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
1466 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1467 ; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16
1468 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1469 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1470 %val0 = load <8 x half>, ptr addrspace(1) %arg0
1471 %val1 = load <8 x half>, ptr addrspace(1) %arg1
1472 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
1473 ret <4 x half> %shuffle
; Widening shuffle: two <3 x half> inputs (shuffle indices 0-2 address %val0,
; 3-5 address %val1), <4 x half> result. Mask <0, 1, 2, 2> keeps elements 0-1
; of %val0 and duplicates element 2 into the last two result elements; %val1
; is not selected. The assertions expect a 64-bit load and one v_perm_b32 with
; selector 0x5040100 applied to the second loaded register.
1476 define <4 x half> @shuffle_v3f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1477 ; GFX9-LABEL: shuffle_v3f16_0122:
1479 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1480 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1481 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
1482 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1483 ; GFX9-NEXT: v_perm_b32 v1, v1, v1, s4
1484 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1486 ; GFX10-LABEL: shuffle_v3f16_0122:
1488 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1489 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1490 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1491 ; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100
1492 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1494 ; GFX11-LABEL: shuffle_v3f16_0122:
1496 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1497 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
1498 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1499 ; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100
1500 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1501 %val0 = load <3 x half>, ptr addrspace(1) %arg0
1502 %val1 = load <3 x half>, ptr addrspace(1) %arg1
1503 %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1504 ret <4 x half> %shuffle
; Widening shuffle: two <2 x half> inputs (shuffle indices 0-1 address %val0,
; 2-3 address %val1), <4 x half> result. As written, the mask is <0, 1, 1, 0>:
; elements 0-1 of %val0 followed by the same two elements in swapped order;
; %val1 is not selected. The assertions expect a single dword load and a
; v_alignbit_b32 (same register for both sources, shift of 16) for the
; swapped pair.
; NOTE(review): the function name says "0122" but the mask is <0, 1, 1, 0>.
; With <2 x half> inputs, index 2 would select element 0 of %val1. Confirm
; which was intended; changing the mask requires regenerating the assertions
; with utils/update_llc_test_checks.py.
1507 define <4 x half> @shuffle_v2f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1508 ; GFX9-LABEL: shuffle_v2f16_0122:
1510 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1511 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1512 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1513 ; GFX9-NEXT: v_alignbit_b32 v1, v0, v0, 16
1514 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1516 ; GFX10-LABEL: shuffle_v2f16_0122:
1518 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1519 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1520 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1521 ; GFX10-NEXT: v_alignbit_b32 v1, v0, v0, 16
1522 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1524 ; GFX11-LABEL: shuffle_v2f16_0122:
1526 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1527 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1528 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1529 ; GFX11-NEXT: v_alignbit_b32 v1, v0, v0, 16
1530 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1531 %val0 = load <2 x half>, ptr addrspace(1) %arg0
1532 %val1 = load <2 x half>, ptr addrspace(1) %arg1
1533 %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
1534 ret <4 x half> %shuffle
; Six-element shuffle: two <6 x half> inputs (shuffle indices 0-5 address
; %val0, 6-11 address %val1), <6 x half> result. Mask <4, 5, 2, 3, 6, 7> takes
; elements 4-5 of %val0, then elements 2-3 of %val0, then elements 0-1 of
; %val1. Every pair is 32-bit aligned in its source; the assertions expect one
; 96-bit load, one dword load, and only v_mov_b32 copies (v_dual_mov_b32 in
; the gfx1100 assertions) to arrange the registers.
1537 define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1538 ; GFX9-LABEL: shuffle_v6f16_452367:
1540 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1541 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
1542 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
1543 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
1544 ; GFX9-NEXT: v_mov_b32_e32 v3, v2
1545 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off
1546 ; GFX9-NEXT: global_load_dword v7, v[3:4], off
1547 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1548 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
1549 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1550 ; GFX9-NEXT: v_mov_b32_e32 v2, v7
1551 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1553 ; GFX10-LABEL: shuffle_v6f16_452367:
1555 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1556 ; GFX10-NEXT: v_mov_b32_e32 v6, v1
1557 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1558 ; GFX10-NEXT: v_mov_b32_e32 v4, v3
1559 ; GFX10-NEXT: v_mov_b32_e32 v3, v2
1560 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off
1561 ; GFX10-NEXT: global_load_dword v7, v[3:4], off
1562 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1563 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
1564 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1565 ; GFX10-NEXT: v_mov_b32_e32 v2, v7
1566 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1568 ; GFX11-LABEL: shuffle_v6f16_452367:
1570 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1571 ; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
1572 ; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
1573 ; GFX11-NEXT: global_load_b32 v3, v[3:4], off
1574 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1575 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
1576 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1577 ; GFX11-NEXT: v_mov_b32_e32 v2, v3
1578 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1579 %val0 = load <6 x half>, ptr addrspace(1) %arg0
1580 %val1 = load <6 x half>, ptr addrspace(1) %arg1
1581 %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
1582 ret <6 x half> %shuffle
; Kernel test: loads one <4 x half> from each of %A, %B and %C at the index
; given by llvm.amdgcn.workitem.id.x, splats single elements of the %A vector
; with shufflevector, and feeds them through a chain of four llvm.fma.v2f16
; calls before storing the <4 x half> result back to the %C element.
; The expected code for every target is four v_pk_fma_f16 instructions that
; carry op_sel / op_sel_hi modifiers, with no separate shuffle instructions.
1585 define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) {
1586 ; GFX9-LABEL: fma_shuffle:
1587 ; GFX9: ; %bb.0: ; %entry
1588 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1589 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1590 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0
1591 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1592 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
1593 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
1594 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7]
1595 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1596 ; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1597 ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1598 ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1599 ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1600 ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
1601 ; GFX9-NEXT: s_endpgm
1603 ; GFX10-LABEL: fma_shuffle:
1604 ; GFX10: ; %bb.0: ; %entry
1605 ; GFX10-NEXT: s_clause 0x1
1606 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1607 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1608 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
1609 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1610 ; GFX10-NEXT: s_clause 0x2
1611 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
1612 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
1613 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7]
1614 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1615 ; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1616 ; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1617 ; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1618 ; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1619 ; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
1620 ; GFX10-NEXT: s_endpgm
1622 ; GFX11-LABEL: fma_shuffle:
1623 ; GFX11: ; %bb.0: ; %entry
1624 ; GFX11-NEXT: s_clause 0x1
1625 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1626 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
1627 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
1628 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1629 ; GFX11-NEXT: s_clause 0x2
1630 ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5]
1631 ; GFX11-NEXT: global_load_b64 v[2:3], v6, s[6:7]
1632 ; GFX11-NEXT: global_load_b64 v[4:5], v6, s[0:1]
1633 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1634 ; GFX11-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1635 ; GFX11-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1636 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1637 ; GFX11-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1638 ; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1639 ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1]
1640 ; GFX11-NEXT: s_nop 0
1641 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1642 ; GFX11-NEXT: s_endpgm
1644 %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
1645 %tmp12 = zext i32 %tmp1 to i64
1646 %arrayidx = getelementptr inbounds <4 x half>, ptr addrspace(1) %A, i64 %tmp12
1647 %tmp14 = load <4 x half>, ptr addrspace(1) %arrayidx, align 8
1648 %arrayidx1 = getelementptr inbounds <4 x half>, ptr addrspace(1) %B, i64 %tmp12
1649 %tmp15 = load <4 x half>, ptr addrspace(1) %arrayidx1, align 8
1650 %arrayidx2 = getelementptr inbounds <4 x half>, ptr addrspace(1) %C, i64 %tmp12
1651 %tmp16 = load <4 x half>, ptr addrspace(1) %arrayidx2, align 8
1652 %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
1653 %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1654 %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1655 %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
1656 %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
1657 %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1658 %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
1659 %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1660 %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
1661 %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
1662 %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1663 %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
1664 %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
1665 %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
1666 %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1667 %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1668 store <4 x half> %tmp32, ptr addrspace(1) %arrayidx2, align 8
; Two-input <4 x half> shuffle with mask <0, 4, 5, 6>: element 0 of %val0
; followed by elements 0..2 of %val1. The expected code combines the loaded
; dwords with one v_perm_b32 (selector 0x5040100) and one v_alignbit_b32 by 16.
1672 define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1673 ; GFX9-LABEL: shuffle_v4f16_0456:
1675 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1676 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
1677 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
1678 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
1679 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
1680 ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
1681 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1682 ; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
1683 ; GFX9-NEXT: v_alignbit_b32 v1, v6, v5, 16
1684 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1686 ; GFX10-LABEL: shuffle_v4f16_0456:
1688 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1689 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
1690 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
1691 ; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
1692 ; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3
1693 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1694 ; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
1695 ; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16
1696 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1698 ; GFX11-LABEL: shuffle_v4f16_0456:
1700 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1701 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
1702 ; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
1703 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1704 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
1705 ; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16
1706 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1707 %val0 = load <4 x half>, ptr addrspace(1) %arg0
1708 %val1 = load <4 x half>, ptr addrspace(1) %arg1
1709 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
1710 ret <4 x half> %shuffle
; Kernel test: loads an <8 x i32> through an addrspace(4) pointer, keeps only
; elements 0..3 via shufflevector, and stores the <4 x i32> to %out.
; The expected code fetches the data with a single 128-bit scalar load
; (s_load_dwordx4 / s_load_b128) and writes it with one 128-bit global store.
1713 define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ptr addrspace(1) %out) {
1714 ; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
1716 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1717 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1718 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1719 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
1720 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1721 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1722 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1723 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1724 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
1725 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
1726 ; GFX9-NEXT: s_endpgm
1728 ; GFX10-LABEL: shuffle_scalar_load_v8i32_0123:
1730 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1731 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
1732 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1733 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
1734 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1735 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
1736 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
1737 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
1738 ; GFX10-NEXT: v_mov_b32_e32 v3, s7
1739 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
1740 ; GFX10-NEXT: s_endpgm
1742 ; GFX11-LABEL: shuffle_scalar_load_v8i32_0123:
1744 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
1745 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1746 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1747 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
1748 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1749 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
1750 ; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
1751 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
1752 ; GFX11-NEXT: s_nop 0
1753 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1754 ; GFX11-NEXT: s_endpgm
1755 %ld8 = load <8 x i32>, ptr addrspace(4) %in, align 16
1756 %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1757 store <4 x i32> %id, ptr addrspace(1) %out, align 8
; Intrinsic declarations called by the fma_shuffle kernel in this file; both
; share attribute group #0 defined below.
1761 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
1762 declare i32 @llvm.amdgcn.workitem.id.x() #0
1764 attributes #0 = { nounwind readnone speculatable }
; Builds <%x0[0], %x1[0]> from two loaded <2 x half> values, i.e. the element-0
; half of each input. The expected code is one v_perm_b32 with selector
; 0x5040100.
1765 define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
1766 ; GFX9-LABEL: low16bits:
1767 ; GFX9: ; %bb.0: ; %entry
1768 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1769 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1770 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
1771 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
1772 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1773 ; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
1774 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1776 ; GFX10-LABEL: low16bits:
1777 ; GFX10: ; %bb.0: ; %entry
1778 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1779 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1780 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
1781 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1782 ; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
1783 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1785 ; GFX11-LABEL: low16bits:
1786 ; GFX11: ; %bb.0: ; %entry
1787 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1788 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1789 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
1790 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1791 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
1792 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1794 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
1795 %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
1796 %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 0, i32 undef>
1797 %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 2>
1798 ret <2 x half> %vy1.2.vec.insert
; Builds <%x0[1], %x1[1]> from two loaded <2 x half> values, i.e. the element-1
; half of each input. The expected code is one v_perm_b32 with selector
; 0x7060302.
1801 define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
1802 ; GFX9-LABEL: hi16bits:
1803 ; GFX9: ; %bb.0: ; %entry
1804 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1805 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1806 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
1807 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1808 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1809 ; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
1810 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1812 ; GFX10-LABEL: hi16bits:
1813 ; GFX10: ; %bb.0: ; %entry
1814 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1815 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1816 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
1817 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1818 ; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302
1819 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1821 ; GFX11-LABEL: hi16bits:
1822 ; GFX11: ; %bb.0: ; %entry
1823 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1824 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1825 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
1826 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1827 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
1828 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1830 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
1831 %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
1832 %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 1, i32 undef>
1833 %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 3>
1834 ret <2 x half> %vy1.2.vec.insert
; Builds <%x0[0], %x1[1]> from two loaded <2 x half> values: element 0 of the
; first input and element 1 of the second. The expected code is one v_bfi_b32
; with mask 0xffff.
1837 define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
1838 ; GFX9-LABEL: low16hi16bits:
1839 ; GFX9: ; %bb.0: ; %entry
1840 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1841 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1842 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
1843 ; GFX9-NEXT: s_mov_b32 s4, 0xffff
1844 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1845 ; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v5
1846 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1848 ; GFX10-LABEL: low16hi16bits:
1849 ; GFX10: ; %bb.0: ; %entry
1850 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1851 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1852 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
1853 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1854 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
1855 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1857 ; GFX11-LABEL: low16hi16bits:
1858 ; GFX11: ; %bb.0: ; %entry
1859 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1860 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1861 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
1862 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1863 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
1864 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1866 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
1867 %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
1868 %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 0, i32 undef>
1869 %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 3>
1870 ret <2 x half> %vy1.2.vec.insert
; Builds <%x0[1], %x1[0]> from two loaded <2 x half> values: element 1 of the
; first input and element 0 of the second. The expected code is one
; v_alignbit_b32 with a shift of 16.
1873 define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
1874 ; GFX9-LABEL: hi16low16bits:
1875 ; GFX9: ; %bb.0: ; %entry
1876 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1877 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1878 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
1879 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1880 ; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
1881 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1883 ; GFX10-LABEL: hi16low16bits:
1884 ; GFX10: ; %bb.0: ; %entry
1885 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1886 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1887 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
1888 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1889 ; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
1890 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1892 ; GFX11-LABEL: hi16low16bits:
1893 ; GFX11: ; %bb.0: ; %entry
1894 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1895 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1896 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
1897 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1898 ; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
1899 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1901 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
1902 %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
1903 %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 1, i32 undef>
1904 %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 2>
1905 ret <2 x half> %vy1.2.vec.insert
; <2 x i16> counterpart of low16bits: builds <%x0[0], %x1[0]> from two loaded
; vectors. The expected code is one v_perm_b32 with selector 0x5040100.
1908 define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
1909 ; GFX9-LABEL: i16_low16bits:
1910 ; GFX9: ; %bb.0: ; %entry
1911 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1912 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1913 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
1914 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
1915 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1916 ; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
1917 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1919 ; GFX10-LABEL: i16_low16bits:
1920 ; GFX10: ; %bb.0: ; %entry
1921 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1922 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1923 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
1924 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1925 ; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
1926 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1928 ; GFX11-LABEL: i16_low16bits:
1929 ; GFX11: ; %bb.0: ; %entry
1930 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1931 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1932 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
1933 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1934 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
1935 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1937 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
1938 %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
1939 %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 0, i32 undef>
1940 %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 2>
1941 ret <2 x i16> %vy1.2.vec.insert
; <2 x i16> counterpart of low16hi16bits: builds <%x0[0], %x1[1]> from two
; loaded vectors. The expected code is one v_bfi_b32 with mask 0xffff.
1944 define <2 x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
1945 ; GFX9-LABEL: i16_low16hi16bits:
1946 ; GFX9: ; %bb.0: ; %entry
1947 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1948 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1949 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
1950 ; GFX9-NEXT: s_mov_b32 s4, 0xffff
1951 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1952 ; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v5
1953 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1955 ; GFX10-LABEL: i16_low16hi16bits:
1956 ; GFX10: ; %bb.0: ; %entry
1957 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1958 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1959 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
1960 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1961 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
1962 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1964 ; GFX11-LABEL: i16_low16hi16bits:
1965 ; GFX11: ; %bb.0: ; %entry
1966 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1967 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1968 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
1969 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1970 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
1971 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1973 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
1974 %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
1975 %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 0, i32 undef>
1976 %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 3>
1977 ret <2 x i16> %vy1.2.vec.insert
; <2 x i16> counterpart of hi16low16bits: builds <%x0[1], %x1[0]> from two
; loaded vectors. The expected code is one v_alignbit_b32 with a shift of 16.
1980 define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
1981 ; GFX9-LABEL: i16_hi16low16bits:
1982 ; GFX9: ; %bb.0: ; %entry
1983 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1984 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1985 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
1986 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1987 ; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
1988 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1990 ; GFX10-LABEL: i16_hi16low16bits:
1991 ; GFX10: ; %bb.0: ; %entry
1992 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1993 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1994 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
1995 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1996 ; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
1997 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1999 ; GFX11-LABEL: i16_hi16low16bits:
2000 ; GFX11: ; %bb.0: ; %entry
2001 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2002 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
2003 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
2004 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2005 ; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
2006 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2008 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
2009 %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
2010 %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 1, i32 undef>
2011 %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 2>
2012 ret <2 x i16> %vy1.2.vec.insert
; <2 x i16> counterpart of hi16bits: builds <%x0[1], %x1[1]> from two loaded
; vectors. The expected code is one v_perm_b32 with selector 0x7060302.
2015 define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
2016 ; GFX9-LABEL: i16_hi16bits:
2017 ; GFX9: ; %bb.0: ; %entry
2018 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2019 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2020 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
2021 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
2022 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2023 ; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
2024 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2026 ; GFX10-LABEL: i16_hi16bits:
2027 ; GFX10: ; %bb.0: ; %entry
2028 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2029 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2030 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
2031 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2032 ; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302
2033 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2035 ; GFX11-LABEL: i16_hi16bits:
2036 ; GFX11: ; %bb.0: ; %entry
2037 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2038 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
2039 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
2040 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2041 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
2042 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2044 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
2045 %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
2046 %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 1, i32 undef>
2047 %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 3>
2048 ret <2 x i16> %vy1.2.vec.insert
; Shuffles an all-zero <2 x i16> (built with two insertelement instructions)
; with the loaded vector using mask <0, 3>, giving <0, %load0[1]>.
; The expected code is a single v_and_b32 with 0xffff0000 on the loaded dword.
2051 define <2 x i16> @v2i16_hi16bits(ptr addrspace(1) %x0) {
2052 ; GFX9-LABEL: v2i16_hi16bits:
2053 ; GFX9: ; %bb.0: ; %entry
2054 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2055 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
2056 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2057 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2058 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2060 ; GFX10-LABEL: v2i16_hi16bits:
2061 ; GFX10: ; %bb.0: ; %entry
2062 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2063 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2064 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2065 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2066 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2068 ; GFX11-LABEL: v2i16_hi16bits:
2069 ; GFX11: ; %bb.0: ; %entry
2070 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2071 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
2072 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2073 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2074 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2076 %load0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
2077 %insert1 = insertelement <2 x i16> undef, i16 0, i32 0
2078 %insert2 = insertelement <2 x i16> %insert1, i16 0, i32 1
2079 %vec.ret = shufflevector <2 x i16> %insert2, <2 x i16> %load0, <2 x i32> <i32 0, i32 3>
2080 ret <2 x i16> %vec.ret
; <2 x half> counterpart of v2i16_hi16bits: shuffles a <0.0, 0.0> vector with
; the loaded vector using mask <0, 3>, giving <0.0, %load0[1]>.
; The expected code is a single v_and_b32 with 0xffff0000 on the loaded dword.
2083 define <2 x half> @v2half_hi16bits(ptr addrspace(1) %x0) {
2084 ; GFX9-LABEL: v2half_hi16bits:
2085 ; GFX9: ; %bb.0: ; %entry
2086 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2087 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
2088 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2089 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2090 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2092 ; GFX10-LABEL: v2half_hi16bits:
2093 ; GFX10: ; %bb.0: ; %entry
2094 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2095 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2096 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2097 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2098 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2100 ; GFX11-LABEL: v2half_hi16bits:
2101 ; GFX11: ; %bb.0: ; %entry
2102 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2103 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
2104 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2105 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2106 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2108 %load0 = load <2 x half>, ptr addrspace(1) %x0, align 4
2109 %insert1 = insertelement <2 x half> undef, half 0.0, i32 0
2110 %insert2 = insertelement <2 x half> %insert1, half 0.0, i32 1
2111 %vec.ret = shufflevector <2 x half> %insert2, <2 x half> %load0, <2 x i32> <i32 0, i32 3>
2112 ret <2 x half> %vec.ret
; Concatenates two loaded <4 x half> vectors into an <8 x half> with the
; identity mask <0..7> and stores the result to %out.
; The expected code is two 64-bit loads feeding one 128-bit store, with no
; ALU instructions in between.
2115 define void @shuffle_v8f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2116 ; GFX9-LABEL: shuffle_v8f16_concat:
2118 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2119 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
2120 ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
2121 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2122 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
2123 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2124 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2126 ; GFX10-LABEL: shuffle_v8f16_concat:
2128 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2129 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
2130 ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
2131 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2132 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
2133 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2135 ; GFX11-LABEL: shuffle_v8f16_concat:
2137 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2138 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
2139 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
2140 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2141 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
2142 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2143 %val0 = load <4 x half>, ptr addrspace(1) %arg0
2144 %val1 = load <4 x half>, ptr addrspace(1) %arg1
2145 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2146 store <8 x half> %shuffle, ptr addrspace(1) %out
; Concatenates two loaded <8 x half> vectors into a <16 x half> with the
; identity mask <0..15> and stores the result to %out.
; The expected code is two 128-bit loads and two 128-bit stores (the %arg1
; data goes to offset 16), with no ALU instructions in between.
2150 define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2151 ; GFX9-LABEL: shuffle_v16f16_concat:
2153 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2154 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2155 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
2156 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2157 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
2158 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2159 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
2160 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2161 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2163 ; GFX10-LABEL: shuffle_v16f16_concat:
2165 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2166 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2167 ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
2168 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2169 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
2170 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2171 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
2172 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2174 ; GFX11-LABEL: shuffle_v16f16_concat:
2176 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2177 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
2178 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
2179 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2180 ; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16
2181 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2182 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
2183 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2184 %val0 = load <8 x half>, ptr addrspace(1) %arg0
2185 %val1 = load <8 x half>, ptr addrspace(1) %arg1
2186 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2187 store <16 x half> %shuffle, ptr addrspace(1) %out
; Concatenates two loaded <16 x half> vectors into a <32 x half> with the
; identity mask <0..31> and stores the result to %out.
; The expected code is four 128-bit loads and four 128-bit stores (the %arg1
; data goes to offsets 32 and 48), with no ALU instructions in between.
2191 define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2192 ; GFX9-LABEL: shuffle_v32f16_concat:
2194 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2195 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2196 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
2197 ; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
2198 ; GFX9-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
2199 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2200 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
2201 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2202 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
2203 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2204 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
2205 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2206 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
2207 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2208 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2210 ; GFX10-LABEL: shuffle_v32f16_concat:
2212 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2213 ; GFX10-NEXT: s_clause 0x1
2214 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2215 ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
2216 ; GFX10-NEXT: s_clause 0x1
2217 ; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
2218 ; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
2219 ; GFX10-NEXT: s_waitcnt vmcnt(3)
2220 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
2221 ; GFX10-NEXT: s_waitcnt vmcnt(2)
2222 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
2223 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2224 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
2225 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2226 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
2227 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2229 ; GFX11-LABEL: shuffle_v32f16_concat:
2231 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2232 ; GFX11-NEXT: s_clause 0x1
2233 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
2234 ; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16
2235 ; GFX11-NEXT: s_clause 0x1
2236 ; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off
2237 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
2238 ; GFX11-NEXT: s_waitcnt vmcnt(3)
2239 ; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:32
2240 ; GFX11-NEXT: s_waitcnt vmcnt(2)
2241 ; GFX11-NEXT: global_store_b128 v[4:5], v[10:13], off offset:48
2242 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2243 ; GFX11-NEXT: global_store_b128 v[4:5], v[14:17], off
2244 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2245 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16
2246 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2247 %val0 = load <16 x half>, ptr addrspace(1) %arg0
2248 %val1 = load <16 x half>, ptr addrspace(1) %arg1
2249 %shuffle = shufflevector <16 x half> %val0, <16 x half> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2250 store <32 x half> %shuffle, ptr addrspace(1) %out
; <4 x i16> counterpart of shuffle_v8f16_concat: concatenates two loaded
; vectors into an <8 x i16> with the identity mask <0..7> and stores it.
; The expected code is two 64-bit loads feeding one 128-bit store, with no
; ALU instructions in between.
2254 define void @shuffle_v8i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2255 ; GFX9-LABEL: shuffle_v8i16_concat:
2257 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2258 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
2259 ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
2260 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2261 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
2262 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2263 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2265 ; GFX10-LABEL: shuffle_v8i16_concat:
2267 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2268 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
2269 ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
2270 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2271 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
2272 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2274 ; GFX11-LABEL: shuffle_v8i16_concat:
2276 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2277 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
2278 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
2279 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2280 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
2281 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2282 %val0 = load <4 x i16>, ptr addrspace(1) %arg0
2283 %val1 = load <4 x i16>, ptr addrspace(1) %arg1
2284 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2285 store <8 x i16> %shuffle, ptr addrspace(1) %out
; <8 x i16> counterpart of shuffle_v16f16_concat: concatenates two loaded
; vectors into a <16 x i16> with the identity mask <0..15> and stores it.
; The expected code is two 128-bit loads and two 128-bit stores (the %arg1
; data goes to offset 16), with no ALU instructions in between.
2289 define void @shuffle_v16i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2290 ; GFX9-LABEL: shuffle_v16i16_concat:
2292 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2293 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2294 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
2295 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2296 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
2297 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2298 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
2299 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2300 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2302 ; GFX10-LABEL: shuffle_v16i16_concat:
2304 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2305 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2306 ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
2307 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2308 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
2309 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2310 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
2311 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2313 ; GFX11-LABEL: shuffle_v16i16_concat:
2315 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2316 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
2317 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
2318 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2319 ; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16
2320 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2321 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
2322 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2323 %val0 = load <8 x i16>, ptr addrspace(1) %arg0
2324 %val1 = load <8 x i16>, ptr addrspace(1) %arg1
2325 %shuffle = shufflevector <8 x i16> %val0, <8 x i16> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2326 store <16 x i16> %shuffle, ptr addrspace(1) %out
; Concatenation of two <16 x i16> vectors into one <32 x i16>.
; %val0 and %val1 are loaded from the addrspace(1) pointers %arg0 and %arg1;
; the shuffle mask is the identity sequence 0..31, so lanes 0-15 come from
; %val0 and lanes 16-31 from %val1. The result is stored to %out.
; The expected output is four 128-bit loads and four 128-bit stores, with the
; %arg1 data written at offsets 32 and 48. The gfx1010 and gfx1100 outputs
; additionally expect each pair of loads to be preceded by s_clause 0x1.
2330 define void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2331 ; GFX9-LABEL: shuffle_v32i16_concat:
2333 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2334 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2335 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
2336 ; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
2337 ; GFX9-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
2338 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2339 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
2340 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2341 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
2342 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2343 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
2344 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2345 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
2346 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2347 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2349 ; GFX10-LABEL: shuffle_v32i16_concat:
2351 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2352 ; GFX10-NEXT: s_clause 0x1
2353 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2354 ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
2355 ; GFX10-NEXT: s_clause 0x1
2356 ; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
2357 ; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
2358 ; GFX10-NEXT: s_waitcnt vmcnt(3)
2359 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
2360 ; GFX10-NEXT: s_waitcnt vmcnt(2)
2361 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
2362 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2363 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
2364 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2365 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
2366 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2368 ; GFX11-LABEL: shuffle_v32i16_concat:
2370 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2371 ; GFX11-NEXT: s_clause 0x1
2372 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
2373 ; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16
2374 ; GFX11-NEXT: s_clause 0x1
2375 ; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off
2376 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
2377 ; GFX11-NEXT: s_waitcnt vmcnt(3)
2378 ; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:32
2379 ; GFX11-NEXT: s_waitcnt vmcnt(2)
2380 ; GFX11-NEXT: global_store_b128 v[4:5], v[10:13], off offset:48
2381 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2382 ; GFX11-NEXT: global_store_b128 v[4:5], v[14:17], off
2383 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2384 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16
2385 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2386 %val0 = load <16 x i16>, ptr addrspace(1) %arg0
2387 %val1 = load <16 x i16>, ptr addrspace(1) %arg1
2388 %shuffle = shufflevector <16 x i16> %val0, <16 x i16> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2389 store <32 x i16> %shuffle, ptr addrspace(1) %out
; Concatenation of two <2 x i8> vectors into one <4 x i8>.
; %val0 and %val1 are loaded from the addrspace(1) pointers %arg0 and %arg1;
; the shuffle mask is the identity sequence 0..3, so lanes 0-1 come from
; %val0 and lanes 2-3 from %val1. The result is stored to %out.
; The expected output is a 16-bit load from %arg0 into v0, a d16_hi 16-bit
; load from %arg1 into the same register v0, and a single 32-bit store of v0.
; The gfx900 output additionally expects an s_nop 0 between the two loads.
2393 define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2394 ; GFX9-LABEL: shuffle_v4i8_concat:
2396 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2397 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
2398 ; GFX9-NEXT: s_nop 0
2399 ; GFX9-NEXT: global_load_short_d16_hi v0, v[2:3], off
2400 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2401 ; GFX9-NEXT: global_store_dword v[4:5], v0, off
2402 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2403 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2405 ; GFX10-LABEL: shuffle_v4i8_concat:
2407 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2408 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
2409 ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off
2410 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2411 ; GFX10-NEXT: global_store_dword v[4:5], v0, off
2412 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2414 ; GFX11-LABEL: shuffle_v4i8_concat:
2416 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2417 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
2418 ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
2419 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2420 ; GFX11-NEXT: global_store_b32 v[4:5], v0, off
2421 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2422 %val0 = load <2 x i8>, ptr addrspace(1) %arg0
2423 %val1 = load <2 x i8>, ptr addrspace(1) %arg1
2424 %shuffle = shufflevector <2 x i8> %val0, <2 x i8> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2425 store <4 x i8> %shuffle, ptr addrspace(1) %out
; Concatenation of two <4 x i8> vectors into one <8 x i8>.
; %val0 and %val1 are loaded from the addrspace(1) pointers %arg0 and %arg1;
; the shuffle mask is the identity sequence 0..7, so lanes 0-3 come from
; %val0 and lanes 4-7 from %val1. The result is stored to %out.
; The expected output is two 32-bit loads into adjacent registers followed by
; one 64-bit store of that register pair.
2429 define void @shuffle_v8i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2430 ; GFX9-LABEL: shuffle_v8i8_concat:
2432 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2433 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2434 ; GFX9-NEXT: global_load_dword v7, v[2:3], off
2435 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2436 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[6:7], off
2437 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2438 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2440 ; GFX10-LABEL: shuffle_v8i8_concat:
2442 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2443 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2444 ; GFX10-NEXT: global_load_dword v7, v[2:3], off
2445 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2446 ; GFX10-NEXT: global_store_dwordx2 v[4:5], v[6:7], off
2447 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2449 ; GFX11-LABEL: shuffle_v8i8_concat:
2451 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2452 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
2453 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off
2454 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2455 ; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off
2456 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2457 %val0 = load <4 x i8>, ptr addrspace(1) %arg0
2458 %val1 = load <4 x i8>, ptr addrspace(1) %arg1
2459 %shuffle = shufflevector <4 x i8> %val0, <4 x i8> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2460 store <8 x i8> %shuffle, ptr addrspace(1) %out
; Concatenation of two <8 x i8> vectors into one <16 x i8>.
; %val0 and %val1 are loaded from the addrspace(1) pointers %arg0 and %arg1;
; the shuffle mask is the identity sequence 0..15, so lanes 0-7 come from
; %val0 and lanes 8-15 from %val1. The result is stored to %out.
; The expected output is two 64-bit loads into adjacent register pairs
; followed by one 128-bit store of the combined four registers.
2464 define void @shuffle_v16i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2465 ; GFX9-LABEL: shuffle_v16i8_concat:
2467 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2468 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
2469 ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
2470 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2471 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
2472 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2473 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2475 ; GFX10-LABEL: shuffle_v16i8_concat:
2477 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2478 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
2479 ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
2480 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2481 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
2482 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2484 ; GFX11-LABEL: shuffle_v16i8_concat:
2486 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2487 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
2488 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
2489 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2490 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
2491 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2492 %val0 = load <8 x i8>, ptr addrspace(1) %arg0
2493 %val1 = load <8 x i8>, ptr addrspace(1) %arg1
2494 %shuffle = shufflevector <8 x i8> %val0, <8 x i8> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2495 store <16 x i8> %shuffle, ptr addrspace(1) %out
; Concatenation of two <16 x i8> vectors into one <32 x i8>.
; %val0 and %val1 are loaded from the addrspace(1) pointers %arg0 and %arg1;
; the shuffle mask is the identity sequence 0..31, so lanes 0-15 come from
; %val0 and lanes 16-31 from %val1. The result is stored to %out.
; The expected output for all three targets is two 128-bit loads followed by
; two 128-bit stores (the %arg1 data at offset 16), separated only by waits.
2499 define void @shuffle_v32i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2500 ; GFX9-LABEL: shuffle_v32i8_concat:
2502 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2503 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2504 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
2505 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2506 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
2507 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2508 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
2509 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2510 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2512 ; GFX10-LABEL: shuffle_v32i8_concat:
2514 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2515 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2516 ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
2517 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2518 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
2519 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2520 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
2521 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2523 ; GFX11-LABEL: shuffle_v32i8_concat:
2525 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2526 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
2527 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
2528 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2529 ; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16
2530 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2531 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
2532 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2533 %val0 = load <16 x i8>, ptr addrspace(1) %arg0
2534 %val1 = load <16 x i8>, ptr addrspace(1) %arg1
2535 %shuffle = shufflevector <16 x i8> %val0, <16 x i8> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2536 store <32 x i8> %shuffle, ptr addrspace(1) %out
; Concatenation of two <2 x i32> vectors into one <4 x i32>.
; %val0 and %val1 are loaded from the addrspace(1) pointers %arg0 and %arg1;
; the shuffle mask is the identity sequence 0..3, so lanes 0-1 come from
; %val0 and lanes 2-3 from %val1. The result is stored to %out.
; The expected output is two 64-bit loads into adjacent register pairs
; followed by one 128-bit store of the combined four registers.
2540 define void @shuffle_v4i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2541 ; GFX9-LABEL: shuffle_v4i32_concat:
2543 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2544 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
2545 ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
2546 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2547 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
2548 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2549 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2551 ; GFX10-LABEL: shuffle_v4i32_concat:
2553 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2554 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
2555 ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
2556 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2557 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
2558 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2560 ; GFX11-LABEL: shuffle_v4i32_concat:
2562 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2563 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
2564 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
2565 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2566 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
2567 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2568 %val0 = load <2 x i32>, ptr addrspace(1) %arg0
2569 %val1 = load <2 x i32>, ptr addrspace(1) %arg1
2570 %shuffle = shufflevector <2 x i32> %val0, <2 x i32> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2571 store <4 x i32> %shuffle, ptr addrspace(1) %out
; Concatenation of two <4 x i32> vectors into one <8 x i32>.
; %val0 and %val1 are loaded from the addrspace(1) pointers %arg0 and %arg1;
; the shuffle mask is the identity sequence 0..7, so lanes 0-3 come from
; %val0 and lanes 4-7 from %val1. The result is stored to %out.
; The expected output for all three targets is two 128-bit loads followed by
; two 128-bit stores (the %arg1 data at offset 16), separated only by waits.
2575 define void @shuffle_v8i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2576 ; GFX9-LABEL: shuffle_v8i32_concat:
2578 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2579 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2580 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
2581 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2582 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
2583 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2584 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
2585 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2586 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2588 ; GFX10-LABEL: shuffle_v8i32_concat:
2590 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2591 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2592 ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
2593 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2594 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
2595 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2596 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
2597 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2599 ; GFX11-LABEL: shuffle_v8i32_concat:
2601 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2602 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
2603 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
2604 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2605 ; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16
2606 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2607 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
2608 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2609 %val0 = load <4 x i32>, ptr addrspace(1) %arg0
2610 %val1 = load <4 x i32>, ptr addrspace(1) %arg1
2611 %shuffle = shufflevector <4 x i32> %val0, <4 x i32> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2612 store <8 x i32> %shuffle, ptr addrspace(1) %out
; Concatenation of two <8 x i32> vectors into one <16 x i32>.
; %val0 and %val1 are loaded from the addrspace(1) pointers %arg0 and %arg1;
; the shuffle mask is the identity sequence 0..15, so lanes 0-7 come from
; %val0 and lanes 8-15 from %val1. The result is stored to %out.
; The expected output is four 128-bit loads and four 128-bit stores, with the
; %arg1 data written at offsets 32 and 48. The gfx1010 and gfx1100 outputs
; additionally expect each pair of loads to be preceded by s_clause 0x1.
2616 define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2617 ; GFX9-LABEL: shuffle_v16i32_concat:
2619 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2620 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2621 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
2622 ; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
2623 ; GFX9-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
2624 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2625 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
2626 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2627 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
2628 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2629 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
2630 ; GFX9-NEXT: s_waitcnt vmcnt(3)
2631 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
2632 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2633 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2635 ; GFX10-LABEL: shuffle_v16i32_concat:
2637 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2638 ; GFX10-NEXT: s_clause 0x1
2639 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
2640 ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
2641 ; GFX10-NEXT: s_clause 0x1
2642 ; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
2643 ; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
2644 ; GFX10-NEXT: s_waitcnt vmcnt(3)
2645 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
2646 ; GFX10-NEXT: s_waitcnt vmcnt(2)
2647 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
2648 ; GFX10-NEXT: s_waitcnt vmcnt(1)
2649 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
2650 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2651 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
2652 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2654 ; GFX11-LABEL: shuffle_v16i32_concat:
2656 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2657 ; GFX11-NEXT: s_clause 0x1
2658 ; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
2659 ; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16
2660 ; GFX11-NEXT: s_clause 0x1
2661 ; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off
2662 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
2663 ; GFX11-NEXT: s_waitcnt vmcnt(3)
2664 ; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:32
2665 ; GFX11-NEXT: s_waitcnt vmcnt(2)
2666 ; GFX11-NEXT: global_store_b128 v[4:5], v[10:13], off offset:48
2667 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2668 ; GFX11-NEXT: global_store_b128 v[4:5], v[14:17], off
2669 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2670 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16
2671 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2672 %val0 = load <8 x i32>, ptr addrspace(1) %arg0
2673 %val1 = load <8 x i32>, ptr addrspace(1) %arg1
2674 %shuffle = shufflevector <8 x i32> %val0, <8 x i32> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2675 store <16 x i32> %shuffle, ptr addrspace(1) %out