1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
; Mask <2, 3, undef, undef>: only elements 2 and 3 of %val0 are selected, the
; remaining result lanes are don't-care. Both check blocks expect a single
; global_load_dword at offset:4 and no second load.
5 define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
6 ; GFX9-LABEL: shuffle_v4f16_23uu:
8 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
10 ; GFX9-NEXT: s_waitcnt vmcnt(0)
11 ; GFX9-NEXT: s_setpc_b64 s[30:31]
13 ; GFX10-LABEL: shuffle_v4f16_23uu:
15 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
17 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
18 ; GFX10-NEXT: s_waitcnt vmcnt(0)
19 ; GFX10-NEXT: s_setpc_b64 s[30:31]
20 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
21 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
22 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
23 ret <4 x half> %shuffle
; Mask <2, 3, 4, undef>: elements 2 and 3 of %val0 followed by element 0 of
; %val1 (mask index 4); the last lane is don't-care. The expected sequences
; differ per target: the gfx900 checks wait between the two loads and reuse v5
; as a load destination, while the gfx1010 checks issue both loads back to back
; into distinct registers.
26 define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
27 ; GFX9-LABEL: shuffle_v4f16_234u:
29 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
31 ; GFX9-NEXT: s_waitcnt vmcnt(0)
32 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
33 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
34 ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
35 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
36 ; GFX9-NEXT: s_waitcnt vmcnt(0)
37 ; GFX9-NEXT: v_mov_b32_e32 v0, v5
38 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40 ; GFX10-LABEL: shuffle_v4f16_234u:
42 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
43 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
44 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
45 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
46 ; GFX10-NEXT: s_waitcnt vmcnt(1)
47 ; GFX10-NEXT: v_mov_b32_e32 v0, v6
48 ; GFX10-NEXT: s_waitcnt vmcnt(0)
49 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
50 ; GFX10-NEXT: s_setpc_b64 s[30:31]
51 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
52 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
53 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
54 ret <4 x half> %shuffle
; Mask <undef, 1, undef, 3>: the defined lanes (1 and 3 of %val0) already sit
; in their source positions. Both check blocks expect a plain
; global_load_dwordx2 with no extra move or bit manipulation.
57 define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
58 ; GFX9-LABEL: shuffle_v4f16_u1u3:
60 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
62 ; GFX9-NEXT: s_waitcnt vmcnt(0)
63 ; GFX9-NEXT: s_setpc_b64 s[30:31]
65 ; GFX10-LABEL: shuffle_v4f16_u1u3:
67 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
69 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
70 ; GFX10-NEXT: s_waitcnt vmcnt(0)
71 ; GFX10-NEXT: s_setpc_b64 s[30:31]
72 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
73 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
74 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
75 ret <4 x half> %shuffle
; Mask <undef, 3, undef, 1>: the odd lanes of %val0 in swapped order. Both
; check blocks expect a global_load_dwordx2 into v[1:2] followed by a v_mov
; of v2 into v0, i.e. the two loaded dwords end up exchanged in v[0:1].
78 define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
79 ; GFX9-LABEL: shuffle_v4f16_u3u1:
81 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
83 ; GFX9-NEXT: s_waitcnt vmcnt(0)
84 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
85 ; GFX9-NEXT: s_setpc_b64 s[30:31]
87 ; GFX10-LABEL: shuffle_v4f16_u3u1:
89 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
91 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
92 ; GFX10-NEXT: s_waitcnt vmcnt(0)
93 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
94 ; GFX10-NEXT: s_setpc_b64 s[30:31]
95 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
96 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
97 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
98 ret <4 x half> %shuffle
; Mask <undef, 3, undef, undef>: only element 3 of %val0 is defined, placed in
; result lane 1. Both check blocks expect a single global_load_dword at
; offset:4 with no further instructions before the return.
101 define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
102 ; GFX9-LABEL: shuffle_v4f16_u3uu:
104 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
106 ; GFX9-NEXT: s_waitcnt vmcnt(0)
107 ; GFX9-NEXT: s_setpc_b64 s[30:31]
109 ; GFX10-LABEL: shuffle_v4f16_u3uu:
111 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
113 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
114 ; GFX10-NEXT: s_waitcnt vmcnt(0)
115 ; GFX10-NEXT: s_setpc_b64 s[30:31]
116 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
117 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
118 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
119 ret <4 x half> %shuffle
; Mask <3, undef, 6, undef>: element 3 of %val0 goes to lane 0 and element 2 of
; %val1 (mask index 6) goes to lane 2. Both check blocks expect two
; global_load_dword at offset:4, a 16-bit right shift to produce the first
; result dword, and a plain v_mov for the second.
122 define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
123 ; GFX9-LABEL: shuffle_v4f16_3u6u:
125 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
127 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
128 ; GFX9-NEXT: s_waitcnt vmcnt(1)
129 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
130 ; GFX9-NEXT: s_waitcnt vmcnt(0)
131 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
132 ; GFX9-NEXT: s_setpc_b64 s[30:31]
134 ; GFX10-LABEL: shuffle_v4f16_3u6u:
136 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
138 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
139 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
140 ; GFX10-NEXT: s_waitcnt vmcnt(1)
141 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5
142 ; GFX10-NEXT: s_waitcnt vmcnt(0)
143 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
144 ; GFX10-NEXT: s_setpc_b64 s[30:31]
145 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
146 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
147 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
148 ret <4 x half> %shuffle
; Mask <3, undef, undef, 7>: element 3 of %val0 goes to lane 0 and element 3 of
; %val1 (mask index 7) goes to lane 3. Both check blocks expect two
; global_load_dword at offset:4, a 16-bit right shift to produce the first
; result dword, and a plain v_mov for the second.
151 define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
152 ; GFX9-LABEL: shuffle_v4f16_3uu7:
154 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
156 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
157 ; GFX9-NEXT: s_waitcnt vmcnt(1)
158 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
159 ; GFX9-NEXT: s_waitcnt vmcnt(0)
160 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
161 ; GFX9-NEXT: s_setpc_b64 s[30:31]
163 ; GFX10-LABEL: shuffle_v4f16_3uu7:
165 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
166 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
167 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
168 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
169 ; GFX10-NEXT: s_waitcnt vmcnt(1)
170 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5
171 ; GFX10-NEXT: s_waitcnt vmcnt(0)
172 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
173 ; GFX10-NEXT: s_setpc_b64 s[30:31]
174 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
175 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
176 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
177 ret <4 x half> %shuffle
; Mask <3, 5, undef, 5>: element 3 of %val0, then element 1 of %val1 (mask
; index 5) in lanes 1 and 3. Both check blocks expect one dword load without
; offset and one at offset:4, a v_and_b32_sdwa with 0xffff selecting WORD_1,
; and a v_lshl_or_b32 that packs the first result dword.
180 define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
181 ; GFX9-LABEL: shuffle_v4f16_35u5:
183 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
184 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
185 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
186 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
187 ; GFX9-NEXT: s_waitcnt vmcnt(1)
188 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4
189 ; GFX9-NEXT: s_waitcnt vmcnt(0)
190 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
191 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
192 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
193 ; GFX9-NEXT: s_setpc_b64 s[30:31]
195 ; GFX10-LABEL: shuffle_v4f16_35u5:
197 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
198 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
199 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
200 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
201 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
202 ; GFX10-NEXT: s_waitcnt vmcnt(1)
203 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4
204 ; GFX10-NEXT: s_waitcnt vmcnt(0)
205 ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
206 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
207 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
208 ; GFX10-NEXT: s_setpc_b64 s[30:31]
209 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
210 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
211 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
212 ret <4 x half> %shuffle
; Mask <3, 5, 7, undef>: element 3 of %val0, then elements 1 and 3 of %val1
; (mask indices 5 and 7); the last lane is don't-care. Both check blocks expect
; a dwordx2 load plus a dword load at offset:4, a v_and_b32_sdwa/v_lshl_or_b32
; pair packing the first result dword, and a 16-bit right shift for the second.
; The two targets differ in scratch register choice and instruction order.
215 define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
216 ; GFX9-LABEL: shuffle_v4f16_357u:
218 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
220 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
221 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
222 ; GFX9-NEXT: s_waitcnt vmcnt(1)
223 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4
224 ; GFX9-NEXT: s_waitcnt vmcnt(0)
225 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
226 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
227 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
228 ; GFX9-NEXT: s_setpc_b64 s[30:31]
230 ; GFX10-LABEL: shuffle_v4f16_357u:
232 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
233 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
234 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
235 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
236 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
237 ; GFX10-NEXT: s_waitcnt vmcnt(1)
238 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4
239 ; GFX10-NEXT: s_waitcnt vmcnt(0)
240 ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
241 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
242 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5
243 ; GFX10-NEXT: s_setpc_b64 s[30:31]
244 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
245 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
246 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
247 ret <4 x half> %shuffle
; Mask <0, 1, 0, 1>: elements 0 and 1 of %val0 repeated twice. Both check
; blocks expect a single global_load_dword without offset and a v_mov that
; copies v0 into v1.
250 define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
251 ; GFX9-LABEL: shuffle_v4f16_0101:
253 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
255 ; GFX9-NEXT: s_waitcnt vmcnt(0)
256 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
257 ; GFX9-NEXT: s_setpc_b64 s[30:31]
259 ; GFX10-LABEL: shuffle_v4f16_0101:
261 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
263 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
264 ; GFX10-NEXT: s_waitcnt vmcnt(0)
265 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
266 ; GFX10-NEXT: s_setpc_b64 s[30:31]
267 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
268 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
269 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
270 ret <4 x half> %shuffle
; Mask <0, 1, 2, 3>: identity shuffle of %val0. Both check blocks expect only
; a global_load_dwordx2 directly into v[0:1], with no move or second load.
273 define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
274 ; GFX9-LABEL: shuffle_v4f16_0123:
276 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
278 ; GFX9-NEXT: s_waitcnt vmcnt(0)
279 ; GFX9-NEXT: s_setpc_b64 s[30:31]
281 ; GFX10-LABEL: shuffle_v4f16_0123:
283 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
284 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
285 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
286 ; GFX10-NEXT: s_waitcnt vmcnt(0)
287 ; GFX10-NEXT: s_setpc_b64 s[30:31]
288 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
289 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
290 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
291 ret <4 x half> %shuffle
; Mask <0, 1, 4, 5>: elements 0 and 1 of %val0 followed by elements 0 and 1 of
; %val1. Both check blocks expect two global_load_dword without offset (one
; per pointer) and two v_mov instructions placing them in v0 and v1.
294 define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
295 ; GFX9-LABEL: shuffle_v4f16_0145:
297 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
298 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
299 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
300 ; GFX9-NEXT: s_waitcnt vmcnt(1)
301 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
302 ; GFX9-NEXT: s_waitcnt vmcnt(0)
303 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
304 ; GFX9-NEXT: s_setpc_b64 s[30:31]
306 ; GFX10-LABEL: shuffle_v4f16_0145:
308 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
309 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
310 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
311 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
312 ; GFX10-NEXT: s_waitcnt vmcnt(1)
313 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
314 ; GFX10-NEXT: s_waitcnt vmcnt(0)
315 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
316 ; GFX10-NEXT: s_setpc_b64 s[30:31]
317 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
318 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
319 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
320 ret <4 x half> %shuffle
; Mask <0, 1, 6, 7>: elements 0 and 1 of %val0 followed by elements 2 and 3 of
; %val1. Both check blocks expect one global_load_dword without offset, one at
; offset:4, and two v_mov instructions placing them in v0 and v1.
323 define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
324 ; GFX9-LABEL: shuffle_v4f16_0167:
326 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
327 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
328 ; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
329 ; GFX9-NEXT: s_waitcnt vmcnt(1)
330 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
331 ; GFX9-NEXT: s_waitcnt vmcnt(0)
332 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
333 ; GFX9-NEXT: s_setpc_b64 s[30:31]
335 ; GFX10-LABEL: shuffle_v4f16_0167:
337 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
338 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
339 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
340 ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
341 ; GFX10-NEXT: s_waitcnt vmcnt(1)
342 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
343 ; GFX10-NEXT: s_waitcnt vmcnt(0)
344 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
345 ; GFX10-NEXT: s_setpc_b64 s[30:31]
346 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
347 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
348 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
349 ret <4 x half> %shuffle
; Mask <2, 3, 0, 1>: the two element pairs of %val0 exchanged. Both check
; blocks expect a global_load_dwordx2 into v[1:2] followed by a v_mov of v2
; into v0, so the loaded dwords end up swapped in v[0:1].
352 define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
353 ; GFX9-LABEL: shuffle_v4f16_2301:
355 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
356 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
357 ; GFX9-NEXT: s_waitcnt vmcnt(0)
358 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
359 ; GFX9-NEXT: s_setpc_b64 s[30:31]
361 ; GFX10-LABEL: shuffle_v4f16_2301:
363 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
365 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
366 ; GFX10-NEXT: s_waitcnt vmcnt(0)
367 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
368 ; GFX10-NEXT: s_setpc_b64 s[30:31]
369 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
370 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
371 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
372 ret <4 x half> %shuffle
; Mask <2, 3, 2, 3>: elements 2 and 3 of %val0 repeated twice. Both check
; blocks expect a single global_load_dword at offset:4 and a v_mov that copies
; v0 into v1.
375 define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
376 ; GFX9-LABEL: shuffle_v4f16_2323:
378 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
379 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
380 ; GFX9-NEXT: s_waitcnt vmcnt(0)
381 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
382 ; GFX9-NEXT: s_setpc_b64 s[30:31]
384 ; GFX10-LABEL: shuffle_v4f16_2323:
386 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
388 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
389 ; GFX10-NEXT: s_waitcnt vmcnt(0)
390 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
391 ; GFX10-NEXT: s_setpc_b64 s[30:31]
392 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
393 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
394 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
395 ret <4 x half> %shuffle
; Mask <2, 3, 4, 5>: elements 2 and 3 of %val0 followed by elements 0 and 1 of
; %val1. Both check blocks expect one global_load_dword at offset:4, one
; without offset, and two v_mov instructions placing them in v0 and v1.
398 define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
399 ; GFX9-LABEL: shuffle_v4f16_2345:
401 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
402 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
403 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
404 ; GFX9-NEXT: s_waitcnt vmcnt(1)
405 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
406 ; GFX9-NEXT: s_waitcnt vmcnt(0)
407 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
408 ; GFX9-NEXT: s_setpc_b64 s[30:31]
410 ; GFX10-LABEL: shuffle_v4f16_2345:
412 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
413 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
414 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
415 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
416 ; GFX10-NEXT: s_waitcnt vmcnt(1)
417 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
418 ; GFX10-NEXT: s_waitcnt vmcnt(0)
419 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
420 ; GFX10-NEXT: s_setpc_b64 s[30:31]
421 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
422 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
423 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
424 ret <4 x half> %shuffle
; Mask <2, 3, 6, 7>: elements 2 and 3 of %val0 followed by elements 2 and 3 of
; %val1. Both check blocks expect two global_load_dword at offset:4 (one per
; pointer) and two v_mov instructions placing them in v0 and v1.
427 define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
428 ; GFX9-LABEL: shuffle_v4f16_2367:
430 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
431 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
432 ; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
433 ; GFX9-NEXT: s_waitcnt vmcnt(1)
434 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
435 ; GFX9-NEXT: s_waitcnt vmcnt(0)
436 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
437 ; GFX9-NEXT: s_setpc_b64 s[30:31]
439 ; GFX10-LABEL: shuffle_v4f16_2367:
441 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
443 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
444 ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
445 ; GFX10-NEXT: s_waitcnt vmcnt(1)
446 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
447 ; GFX10-NEXT: s_waitcnt vmcnt(0)
448 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
449 ; GFX10-NEXT: s_setpc_b64 s[30:31]
450 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
451 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
452 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
453 ret <4 x half> %shuffle
; Mask <4, 5, 0, 1>: elements 0 and 1 of %val1 followed by elements 0 and 1 of
; %val0. Both check blocks expect two global_load_dword without offset, with
; the load through v[2:3] feeding v0 and the load through v[0:1] feeding v1.
456 define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
457 ; GFX9-LABEL: shuffle_v4f16_4501:
459 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
460 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
461 ; GFX9-NEXT: global_load_dword v5, v[0:1], off
462 ; GFX9-NEXT: s_waitcnt vmcnt(1)
463 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
464 ; GFX9-NEXT: s_waitcnt vmcnt(0)
465 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
466 ; GFX9-NEXT: s_setpc_b64 s[30:31]
468 ; GFX10-LABEL: shuffle_v4f16_4501:
470 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
471 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
472 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
473 ; GFX10-NEXT: global_load_dword v5, v[0:1], off
474 ; GFX10-NEXT: s_waitcnt vmcnt(1)
475 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
476 ; GFX10-NEXT: s_waitcnt vmcnt(0)
477 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
478 ; GFX10-NEXT: s_setpc_b64 s[30:31]
479 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
480 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
481 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
482 ret <4 x half> %shuffle
; Mask <4, 5, 2, 3>: elements 0 and 1 of %val1 followed by elements 2 and 3 of
; %val0. Both check blocks expect one global_load_dword without offset through
; v[2:3], one at offset:4 through v[0:1], and two v_mov instructions.
485 define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
486 ; GFX9-LABEL: shuffle_v4f16_4523:
488 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
489 ; GFX9-NEXT: global_load_dword v4, v[2:3], off
490 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
491 ; GFX9-NEXT: s_waitcnt vmcnt(1)
492 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
493 ; GFX9-NEXT: s_waitcnt vmcnt(0)
494 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
495 ; GFX9-NEXT: s_setpc_b64 s[30:31]
497 ; GFX10-LABEL: shuffle_v4f16_4523:
499 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
500 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
501 ; GFX10-NEXT: global_load_dword v4, v[2:3], off
502 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
503 ; GFX10-NEXT: s_waitcnt vmcnt(1)
504 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
505 ; GFX10-NEXT: s_waitcnt vmcnt(0)
506 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
507 ; GFX10-NEXT: s_setpc_b64 s[30:31]
508 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
509 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
510 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
511 ret <4 x half> %shuffle
; Mask <4, 5, 4, 5>: elements 0 and 1 of %val1 repeated twice; %val0 is not
; selected. Both check blocks expect a single global_load_dword through v[2:3]
; without offset and a v_mov that copies v0 into v1.
514 define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
515 ; GFX9-LABEL: shuffle_v4f16_4545:
517 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
518 ; GFX9-NEXT: global_load_dword v0, v[2:3], off
519 ; GFX9-NEXT: s_waitcnt vmcnt(0)
520 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
521 ; GFX9-NEXT: s_setpc_b64 s[30:31]
523 ; GFX10-LABEL: shuffle_v4f16_4545:
525 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
526 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
527 ; GFX10-NEXT: global_load_dword v0, v[2:3], off
528 ; GFX10-NEXT: s_waitcnt vmcnt(0)
529 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
530 ; GFX10-NEXT: s_setpc_b64 s[30:31]
531 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
532 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
533 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
534 ret <4 x half> %shuffle
; Mask <4, 5, 6, 7>: selects all of %val1 in order; %val0 is not selected.
; Both check blocks expect only a global_load_dwordx2 through v[2:3] directly
; into v[0:1].
537 define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
538 ; GFX9-LABEL: shuffle_v4f16_4567:
540 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
541 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
542 ; GFX9-NEXT: s_waitcnt vmcnt(0)
543 ; GFX9-NEXT: s_setpc_b64 s[30:31]
545 ; GFX10-LABEL: shuffle_v4f16_4567:
547 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
549 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
550 ; GFX10-NEXT: s_waitcnt vmcnt(0)
551 ; GFX10-NEXT: s_setpc_b64 s[30:31]
552 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
553 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
554 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
555 ret <4 x half> %shuffle
; Mask <6, 7, 0, 1>: elements 2 and 3 of %val1 followed by elements 0 and 1 of
; %val0. Both check blocks expect one global_load_dword at offset:4 through
; v[2:3], one without offset through v[0:1], and two v_mov instructions.
558 define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
559 ; GFX9-LABEL: shuffle_v4f16_6701:
561 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
562 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
563 ; GFX9-NEXT: global_load_dword v5, v[0:1], off
564 ; GFX9-NEXT: s_waitcnt vmcnt(1)
565 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
566 ; GFX9-NEXT: s_waitcnt vmcnt(0)
567 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
568 ; GFX9-NEXT: s_setpc_b64 s[30:31]
570 ; GFX10-LABEL: shuffle_v4f16_6701:
572 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
573 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
574 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
575 ; GFX10-NEXT: global_load_dword v5, v[0:1], off
576 ; GFX10-NEXT: s_waitcnt vmcnt(1)
577 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
578 ; GFX10-NEXT: s_waitcnt vmcnt(0)
579 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
580 ; GFX10-NEXT: s_setpc_b64 s[30:31]
581 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
582 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
583 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
584 ret <4 x half> %shuffle
; Mask <6, 7, 2, 3>: elements 2 and 3 of %val1 followed by elements 2 and 3 of
; %val0. Both check blocks expect two global_load_dword at offset:4, with the
; load through v[2:3] feeding v0 and the load through v[0:1] feeding v1.
587 define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
588 ; GFX9-LABEL: shuffle_v4f16_6723:
590 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
591 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
592 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
593 ; GFX9-NEXT: s_waitcnt vmcnt(1)
594 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
595 ; GFX9-NEXT: s_waitcnt vmcnt(0)
596 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
597 ; GFX9-NEXT: s_setpc_b64 s[30:31]
599 ; GFX10-LABEL: shuffle_v4f16_6723:
601 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
602 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
603 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
604 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
605 ; GFX10-NEXT: s_waitcnt vmcnt(1)
606 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
607 ; GFX10-NEXT: s_waitcnt vmcnt(0)
608 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
609 ; GFX10-NEXT: s_setpc_b64 s[30:31]
610 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
611 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
612 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
613 ret <4 x half> %shuffle
; Mask <6, 7, 4, 5>: the two element pairs of %val1 exchanged; %val0 is not
; selected. Both check blocks expect a global_load_dwordx2 through v[2:3] into
; v[1:2] followed by a v_mov of v2 into v0.
616 define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
617 ; GFX9-LABEL: shuffle_v4f16_6745:
619 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
621 ; GFX9-NEXT: s_waitcnt vmcnt(0)
622 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
623 ; GFX9-NEXT: s_setpc_b64 s[30:31]
625 ; GFX10-LABEL: shuffle_v4f16_6745:
627 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
629 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
630 ; GFX10-NEXT: s_waitcnt vmcnt(0)
631 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
632 ; GFX10-NEXT: s_setpc_b64 s[30:31]
633 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
634 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
635 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
636 ret <4 x half> %shuffle
; Mask <6, 7, 6, 7>: elements 2 and 3 of %val1 repeated twice; %val0 is not
; selected. Both check blocks expect a single global_load_dword through v[2:3]
; at offset:4 and a v_mov that copies v0 into v1.
639 define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
640 ; GFX9-LABEL: shuffle_v4f16_6767:
642 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
643 ; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4
644 ; GFX9-NEXT: s_waitcnt vmcnt(0)
645 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
646 ; GFX9-NEXT: s_setpc_b64 s[30:31]
648 ; GFX10-LABEL: shuffle_v4f16_6767:
650 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
651 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
652 ; GFX10-NEXT: global_load_dword v0, v[2:3], off offset:4
653 ; GFX10-NEXT: s_waitcnt vmcnt(0)
654 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
655 ; GFX10-NEXT: s_setpc_b64 s[30:31]
656 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
657 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
658 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
659 ret <4 x half> %shuffle
; Mask <2, 3, 5, 6>: elements 2 and 3 of %val0 followed by elements 1 and 2 of
; %val1 (mask indices 5 and 6), a pair that is not dword-aligned in the source.
; Both check blocks expect a dwordx2 load plus a dword load at offset:4, a
; v_and_b32_sdwa with 0xffff selecting WORD_1 and a v_lshl_or_b32 to build v1,
; and a plain v_mov for v0.
662 define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
663 ; GFX9-LABEL: shuffle_v4f16_2356:
665 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
666 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
667 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
668 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
669 ; GFX9-NEXT: s_waitcnt vmcnt(1)
670 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
671 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0
672 ; GFX9-NEXT: s_waitcnt vmcnt(0)
673 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
674 ; GFX9-NEXT: s_setpc_b64 s[30:31]
676 ; GFX10-LABEL: shuffle_v4f16_2356:
678 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
679 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
680 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
681 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
682 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
683 ; GFX10-NEXT: s_waitcnt vmcnt(1)
684 ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
685 ; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0
686 ; GFX10-NEXT: s_waitcnt vmcnt(0)
687 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
688 ; GFX10-NEXT: s_setpc_b64 s[30:31]
689 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
690 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
691 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
692 ret <4 x half> %shuffle
; Mask <5, 6, 2, 3>: elements 1 and 2 of %val1 (mask indices 5 and 6, a pair
; that is not dword-aligned in the source) followed by elements 2 and 3 of
; %val0. Both check blocks expect a dwordx2 load plus a dword load at offset:4,
; a v_and_b32_sdwa/v_lshl_or_b32 pair to build v0, and a plain v_mov for v1.
; The two targets order the final v_mov and v_lshl_or_b32 differently.
695 define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
696 ; GFX9-LABEL: shuffle_v4f16_5623:
698 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
699 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
700 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
701 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
702 ; GFX9-NEXT: s_waitcnt vmcnt(1)
703 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
704 ; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
705 ; GFX9-NEXT: s_waitcnt vmcnt(0)
706 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
707 ; GFX9-NEXT: s_setpc_b64 s[30:31]
709 ; GFX10-LABEL: shuffle_v4f16_5623:
711 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
712 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
713 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
714 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
715 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
716 ; GFX10-NEXT: s_waitcnt vmcnt(1)
717 ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
718 ; GFX10-NEXT: s_waitcnt vmcnt(0)
719 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
720 ; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0
721 ; GFX10-NEXT: s_setpc_b64 s[30:31]
722 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
723 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
724 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
725 ret <4 x half> %shuffle
; Mask <3, 4, 5, 6>: element 3 of %val0 followed by elements 0, 1 and 2 of
; %val1, so neither result pair is dword-aligned in its source. Both check
; blocks expect a dwordx2 load plus a dword load at offset:4, two
; v_and_b32_sdwa with 0xffff selecting WORD_1, and two v_lshl_or_b32 that pack
; v0 and v1. The two targets differ in the order of the loads and of the SDWA
; instructions.
728 define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
729 ; GFX9-LABEL: shuffle_v4f16_3456:
731 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
732 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
733 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
734 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
735 ; GFX9-NEXT: s_waitcnt vmcnt(1)
736 ; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
737 ; GFX9-NEXT: s_waitcnt vmcnt(0)
738 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
739 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1
740 ; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2
741 ; GFX9-NEXT: s_setpc_b64 s[30:31]
743 ; GFX10-LABEL: shuffle_v4f16_3456:
745 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
746 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
747 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
748 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
749 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
750 ; GFX10-NEXT: s_waitcnt vmcnt(1)
751 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
752 ; GFX10-NEXT: s_waitcnt vmcnt(0)
753 ; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
754 ; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1
755 ; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2
756 ; GFX10-NEXT: s_setpc_b64 s[30:31]
757 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
758 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
759 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
760 ret <4 x half> %shuffle
; Mask <5,6,3,4>: %val1[1], %val1[2], %val0[3], %val1[0].
; Same unaligned window as the 3456 case but with the two result dwords
; swapped; both result dwords are still rebuilt from halves of different
; input dwords.
763 define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
764 ; GFX9-LABEL: shuffle_v4f16_5634:
766 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
767 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
768 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
769 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
770 ; GFX9-NEXT: s_waitcnt vmcnt(0)
771 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
772 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
773 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
774 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0
775 ; GFX9-NEXT: s_setpc_b64 s[30:31]
777 ; GFX10-LABEL: shuffle_v4f16_5634:
779 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
780 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
781 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
782 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
783 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
784 ; GFX10-NEXT: s_waitcnt vmcnt(1)
785 ; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
786 ; GFX10-NEXT: s_waitcnt vmcnt(0)
787 ; GFX10-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
788 ; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1
789 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v2
790 ; GFX10-NEXT: s_setpc_b64 s[30:31]
791 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
792 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
793 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
794 ret <4 x half> %shuffle
; Mask <5,7,3,4>: %val1[1], %val1[3], %val0[3], %val1[0].
; The low result dword combines the two odd (high) halves of %val1, which is
; why the expected code needs an extra v_lshrrev_b32 on the second %val1 dword
; in addition to the SDWA and / v_lshl_or_b32 repacking.
797 define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
798 ; GFX9-LABEL: shuffle_v4f16_5734:
800 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
801 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
802 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
803 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
804 ; GFX9-NEXT: s_waitcnt vmcnt(1)
805 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5
806 ; GFX9-NEXT: s_waitcnt vmcnt(0)
807 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
808 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
809 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
810 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
811 ; GFX9-NEXT: s_setpc_b64 s[30:31]
813 ; GFX10-LABEL: shuffle_v4f16_5734:
815 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
816 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
817 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
818 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
819 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
820 ; GFX10-NEXT: s_waitcnt vmcnt(1)
821 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5
822 ; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
823 ; GFX10-NEXT: s_waitcnt vmcnt(0)
824 ; GFX10-NEXT: v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
825 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v2
826 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
827 ; GFX10-NEXT: s_setpc_b64 s[30:31]
828 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
829 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
830 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
831 ret <4 x half> %shuffle
; Integer (<4 x i16>) variant. Mask <2,3,5,6>: %val0[2], %val0[3], %val1[1],
; %val1[2]. The low result dword is the high dword of %val0 unchanged (a plain
; v_mov_b32 of the loaded value); only the high result dword has to be
; repacked from two different %val1 dwords.
834 define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
835 ; GFX9-LABEL: shuffle_v4i16_2356:
837 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
838 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
839 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
840 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
841 ; GFX9-NEXT: s_waitcnt vmcnt(1)
842 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
843 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0
844 ; GFX9-NEXT: s_waitcnt vmcnt(0)
845 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
846 ; GFX9-NEXT: s_setpc_b64 s[30:31]
848 ; GFX10-LABEL: shuffle_v4i16_2356:
850 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
851 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
852 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
853 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
854 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
855 ; GFX10-NEXT: s_waitcnt vmcnt(1)
856 ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
857 ; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0
858 ; GFX10-NEXT: s_waitcnt vmcnt(0)
859 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
860 ; GFX10-NEXT: s_setpc_b64 s[30:31]
861 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
862 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
863 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
864 ret <4 x i16> %shuffle
; Integer (<4 x i16>) variant. Mask <0,1,6,7>: the low dword of %val0 followed
; by the high dword of %val1. Both halves are dword-aligned, so the expected
; code is just two dword loads and two moves with no repacking.
867 define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
868 ; GFX9-LABEL: shuffle_v4i16_0167:
870 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
871 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
872 ; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
873 ; GFX9-NEXT: s_waitcnt vmcnt(1)
874 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
875 ; GFX9-NEXT: s_waitcnt vmcnt(0)
876 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
877 ; GFX9-NEXT: s_setpc_b64 s[30:31]
879 ; GFX10-LABEL: shuffle_v4i16_0167:
881 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
882 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
883 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
884 ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
885 ; GFX10-NEXT: s_waitcnt vmcnt(1)
886 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
887 ; GFX10-NEXT: s_waitcnt vmcnt(0)
888 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
889 ; GFX10-NEXT: s_setpc_b64 s[30:31]
890 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
891 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
892 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
893 ret <4 x i16> %shuffle
; Splat: the zeroinitializer mask selects %val0[0] for all four lanes, so
; %val1 does not contribute to the result. The expected code builds one packed
; dword from the low half (v_and_b32 + v_lshl_or_b32) and copies it to the
; second result register.
896 define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
897 ; GFX9-LABEL: shuffle_v4f16_0000:
899 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
900 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
901 ; GFX9-NEXT: s_waitcnt vmcnt(0)
902 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
903 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
904 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
905 ; GFX9-NEXT: s_setpc_b64 s[30:31]
907 ; GFX10-LABEL: shuffle_v4f16_0000:
909 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
910 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
911 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
912 ; GFX10-NEXT: s_waitcnt vmcnt(0)
913 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v0
914 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
915 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
916 ; GFX10-NEXT: s_setpc_b64 s[30:31]
917 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
918 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
919 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
920 ret <4 x half> %shuffle
; Mask <1,0,1,0>: the two halves of the low %val0 dword swapped, repeated in
; both result dwords. %val1 does not contribute. The expected code extracts
; the high half via SDWA (WORD_1), ORs in the low half shifted left by 16, and
; copies the result to the second register.
923 define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
924 ; GFX9-LABEL: shuffle_v4f16_1010:
926 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
927 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
928 ; GFX9-NEXT: s_waitcnt vmcnt(0)
929 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
930 ; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
931 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
932 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
933 ; GFX9-NEXT: s_setpc_b64 s[30:31]
935 ; GFX10-LABEL: shuffle_v4f16_1010:
937 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
938 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
939 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
940 ; GFX10-NEXT: s_waitcnt vmcnt(0)
941 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff
942 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
943 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
944 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
945 ; GFX10-NEXT: s_setpc_b64 s[30:31]
946 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
947 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
948 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
949 ret <4 x half> %shuffle
; Mask <1,1,0,0>: the low result dword is %val0[1] splatted, the high result
; dword is %val0[0] splatted. %val1 does not contribute. Each result dword is
; built separately with a mask/shift followed by v_lshl_or_b32.
952 define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
953 ; GFX9-LABEL: shuffle_v4f16_1100:
955 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
956 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
957 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
958 ; GFX9-NEXT: s_waitcnt vmcnt(0)
959 ; GFX9-NEXT: v_and_b32_e32 v1, v2, v0
960 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
961 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
962 ; GFX9-NEXT: v_and_b32_e32 v0, v2, v3
963 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
964 ; GFX9-NEXT: s_setpc_b64 s[30:31]
966 ; GFX10-LABEL: shuffle_v4f16_1100:
968 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
969 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
970 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
971 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
972 ; GFX10-NEXT: s_waitcnt vmcnt(0)
973 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
974 ; GFX10-NEXT: v_and_b32_e32 v4, v0, v1
975 ; GFX10-NEXT: v_and_b32_e32 v3, v0, v2
976 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v4
977 ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v3
978 ; GFX10-NEXT: s_setpc_b64 s[30:31]
979 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
980 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
981 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
982 ret <4 x half> %shuffle
; Mask <6,1,6,1>: the pair (%val1[2], %val0[1]) repeated in both result
; dwords. Only the high dword of %val1 and the low dword of %val0 are loaded;
; one packed dword is built and then copied to the second result register.
985 define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
986 ; GFX9-LABEL: shuffle_v4f16_6161:
988 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
989 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
990 ; GFX9-NEXT: global_load_dword v5, v[0:1], off
991 ; GFX9-NEXT: s_waitcnt vmcnt(1)
992 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4
993 ; GFX9-NEXT: s_waitcnt vmcnt(0)
994 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
995 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
996 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
997 ; GFX9-NEXT: s_setpc_b64 s[30:31]
999 ; GFX10-LABEL: shuffle_v4f16_6161:
1001 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1002 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1003 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1004 ; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
1005 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1006 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4
1007 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1008 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v5
1009 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
1010 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1011 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1012 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1013 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1014 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
1015 ret <4 x half> %shuffle
; Mask <2,3,3,3>: the low result dword is the high dword of %val0 unchanged,
; the high result dword is %val0[3] splatted. Only that one dword of %val0 is
; loaded (offset:4); %val1 does not contribute.
1018 define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1019 ; GFX9-LABEL: shuffle_v4f16_2333:
1021 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1022 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
1023 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1024 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1025 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
1026 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
1027 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1029 ; GFX10-LABEL: shuffle_v4f16_2333:
1031 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1032 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1033 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
1034 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1035 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1036 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1
1037 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
1038 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1039 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1040 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1041 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1042 ret <4 x half> %shuffle
; FIXME: The function name advertises mask <6,6,6,7> (elements taken from
; %val1), but the shufflevector below uses <2,3,3,3>, which is identical to
; shuffle_v4f16_2333. The expected output accordingly loads only from %arg0
; and never touches %arg1, so this test currently duplicates the 2333 case
; instead of covering a second-operand shuffle. Correcting the mask requires
; regenerating the assertions with utils/update_llc_test_checks.py.
1045 define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1046 ; GFX9-LABEL: shuffle_v4f16_6667:
1048 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1049 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
1050 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1051 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1052 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
1053 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
1054 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1056 ; GFX10-LABEL: shuffle_v4f16_6667:
1058 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1059 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1060 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
1061 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1062 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1063 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1
1064 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
1065 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1066 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1067 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1068 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1069 ret <4 x half> %shuffle
; Narrowing shuffle: <8 x half> sources, <4 x half> result. Mask <0,1,0,1>
; repeats the first dword of %val0, so the expected code is a single dword
; load plus a register copy.
1072 define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1073 ; GFX9-LABEL: shuffle_v8f16_0101:
1075 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1076 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1077 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1078 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1079 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1081 ; GFX10-LABEL: shuffle_v8f16_0101:
1083 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1084 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1085 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1086 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1087 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1088 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1089 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1090 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1091 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1092 ret <4 x half> %shuffle
; Narrowing shuffle: mask <0,1,2,3> extracts the low four halves of the
; <8 x half> %val0. The expected code is a single dwordx2 load directly into
; the return registers with no ALU instructions.
1095 define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1096 ; GFX9-LABEL: shuffle_v8f16_0123:
1098 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1099 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1100 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1101 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1103 ; GFX10-LABEL: shuffle_v8f16_0123:
1105 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1106 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1107 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1108 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1109 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1110 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1111 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1112 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1113 ret <4 x half> %shuffle
; Narrowing shuffle: mask <4,5,8,9> takes the third dword of %val0 (byte
; offset 8) and the first dword of %val1. Both pairs are dword-aligned, so
; only two dword loads and two moves are expected.
1116 define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1117 ; GFX9-LABEL: shuffle_v8f16_4589:
1119 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1120 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8
1121 ; GFX9-NEXT: global_load_dword v5, v[2:3], off
1122 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1123 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
1124 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1125 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
1126 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1128 ; GFX10-LABEL: shuffle_v8f16_4589:
1130 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1131 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1132 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8
1133 ; GFX10-NEXT: global_load_dword v5, v[2:3], off
1134 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1135 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
1136 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1137 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
1138 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1139 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1140 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1141 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
1142 ret <4 x half> %shuffle
; Narrowing shuffle: mask <10,11,2,3> takes %val1[2..3] followed by
; %val0[2..3], i.e. the second dword (byte offset 4) of each source, with the
; second operand placed first. Two dword loads and two moves are expected.
1145 define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1146 ; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
1148 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1149 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
1150 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
1151 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1152 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
1153 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1154 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
1155 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1157 ; GFX10-LABEL: shuffle_v8f16_10_11_2_3:
1159 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1160 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1161 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
1162 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
1163 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1164 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
1165 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1166 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
1167 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1168 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1169 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1170 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
1171 ret <4 x half> %shuffle
; Narrowing shuffle: mask <13,14,2,3> takes %val1[5], %val1[6] followed by
; %val0[2..3]. The first pair straddles two %val1 dwords (loaded as a dwordx2
; at byte offset 8) and is repacked with SDWA and / v_lshl_or_b32; the second
; pair is a whole %val0 dword and is only moved.
1174 define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
1175 ; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
1177 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1178 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
1179 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
1180 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
1181 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1182 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1183 ; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
1184 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1185 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
1186 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1188 ; GFX10-LABEL: shuffle_v8f16_13_14_2_3:
1190 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1191 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1192 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
1193 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
1194 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
1195 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1196 ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1197 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1198 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
1199 ; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0
1200 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1201 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
1202 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
1203 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
1204 ret <4 x half> %shuffle
; Widening shuffle from an odd-sized source: <3 x half> inputs, <4 x half>
; result. With three-element sources, indices 0-2 all refer to %val0, so mask
; <0,1,2,2> yields %val0[0], %val0[1], %val0[2], %val0[2]. The low dword is
; returned as loaded; the high dword splats the third element.
1207 define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
1208 ; GFX9-LABEL: shuffle_v3f16_0122:
1210 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1211 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1212 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1213 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
1214 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
1215 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1217 ; GFX10-LABEL: shuffle_v3f16_0122:
1219 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1220 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1221 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1222 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1223 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1
1224 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
1225 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1226 %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
1227 %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
1228 %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1229 ret <4 x half> %shuffle
; Widening shuffle: <2 x half> inputs, <4 x half> result.
; FIXME: The function name advertises mask <0,1,2,2>, but the shufflevector
; below uses <0,1,1,0>. With two-element sources, index 2 would select
; %val1[0]; the mask actually used never selects from %val1, and the expected
; output loads only from %arg0 (low dword as loaded, high dword with the two
; halves swapped). Either the name or the mask should be corrected; changing
; the mask requires regenerating the assertions with
; utils/update_llc_test_checks.py.
1232 define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
1233 ; GFX9-LABEL: shuffle_v2f16_0122:
1235 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1236 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1237 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
1238 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1239 ; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1240 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
1241 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1243 ; GFX10-LABEL: shuffle_v2f16_0122:
1245 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1246 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1247 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1248 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff
1249 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1250 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1251 ; GFX10-NEXT: v_lshl_or_b32 v1, v0, 16, v1
1252 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1253 %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
1254 %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
1255 %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
1256 ret <4 x half> %shuffle
; Six-element shuffle. With <6 x half> sources, indices 0-5 refer to %val0 and
; 6-11 to %val1, so mask <4,5,2,3,6,7> yields %val0[4..5], %val0[2..3],
; %val1[0..1]. Every pair is dword-aligned: the expected code loads three
; dwords of %val0 and one dword of %val1 and only permutes whole registers.
1259 define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
1260 ; GFX9-LABEL: shuffle_v6f16_452367:
1262 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1263 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
1264 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
1265 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
1266 ; GFX9-NEXT: v_mov_b32_e32 v3, v2
1267 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off
1268 ; GFX9-NEXT: global_load_dword v7, v[3:4], off
1269 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1270 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
1271 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1272 ; GFX9-NEXT: v_mov_b32_e32 v2, v7
1273 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1275 ; GFX10-LABEL: shuffle_v6f16_452367:
1277 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1278 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1279 ; GFX10-NEXT: v_mov_b32_e32 v6, v1
1280 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1281 ; GFX10-NEXT: v_mov_b32_e32 v4, v3
1282 ; GFX10-NEXT: v_mov_b32_e32 v3, v2
1283 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off
1284 ; GFX10-NEXT: global_load_dword v7, v[3:4], off
1285 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1286 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
1287 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1288 ; GFX10-NEXT: v_mov_b32_e32 v2, v7
1289 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1290 %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
1291 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
1292 %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
1293 ret <6 x half> %shuffle
; Kernel that indexes %A, %B and %C by the workitem id, then chains four
; llvm.fma.v2f16 calls whose first operand is a splat of one %A element
; (%tmp17, %tmp21, %tmp26, %tmp29) and whose second operand is a dword-aligned
; pair of %B (%tmp18, %tmp22). The accumulator starts from %C and the
; <4 x half> result is stored back to the %C slot.
; The expected output contains no separate shuffle/repack instructions: each
; splat is expressed through the op_sel / op_sel_hi modifiers of
; v_pk_fma_f16.
1296 define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) {
1297 ; GFX9-LABEL: fma_shuffle:
1298 ; GFX9: ; %bb.0: ; %entry
1299 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1300 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1301 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0
1302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1303 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
1304 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
1305 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7]
1306 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1307 ; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1308 ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1309 ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1310 ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1311 ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
1312 ; GFX9-NEXT: s_endpgm
1314 ; GFX10-LABEL: fma_shuffle:
1315 ; GFX10: ; %bb.0: ; %entry
1316 ; GFX10-NEXT: s_clause 0x1
1317 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1318 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1319 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
1320 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1321 ; GFX10-NEXT: s_clause 0x2
1322 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
1323 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
1324 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7]
1325 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1326 ; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1327 ; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1328 ; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1329 ; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1330 ; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
1331 ; GFX10-NEXT: s_endpgm
1333 %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
1334 %tmp12 = zext i32 %tmp1 to i64
1335 %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
1336 %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
1337 %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
1338 %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8
1339 %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12
1340 %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8
1341 %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
1342 %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1343 %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1344 %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
1345 %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
1346 %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1347 %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
1348 %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1349 %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
1350 %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
1351 %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1352 %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
1353 %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
1354 %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
1355 %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1356 %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1357 store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8
; Mask <0,4,5,6>: %val0[0] followed by %val1[0], %val1[1], %val1[2].
; Only the low half of %val0 is used (masked with 0xffff); the remaining
; lanes are the first three halves of %val1 shifted up by one position, so
; both result dwords are repacked with v_lshl_or_b32.
1361 define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
1362 ; GFX9-LABEL: shuffle_v4f16_0456:
1364 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1365 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
1366 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1367 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
1368 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
1369 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
1370 ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
1371 ; GFX9-NEXT: v_and_b32_e32 v1, v0, v4
1372 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1373 ; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1374 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1
1375 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v2
1376 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1378 ; GFX10-LABEL: shuffle_v4f16_0456:
1380 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1381 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1382 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
1383 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1384 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
1385 ; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
1386 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
1387 ; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3
1388 ; GFX10-NEXT: v_and_b32_e32 v1, v0, v4
1389 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1390 ; GFX10-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1391 ; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1
1392 ; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2
1393 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1394 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1395 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
1396 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
1397 ret <4 x half> %shuffle
; Kernel that loads an <8 x i32> from address space 4, keeps only elements
; 0-3 via shufflevector, and stores them to %out. The expected output reads
; just four dwords from %in (s_load_dwordx4) rather than all eight, then
; copies them to VGPRs for a single global_store_dwordx4.
1400 define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out) {
1401 ; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
1403 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1404 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1405 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1406 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
1407 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1408 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1409 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1410 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1411 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
1412 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
1413 ; GFX9-NEXT: s_endpgm
1415 ; GFX10-LABEL: shuffle_scalar_load_v8i32_0123:
1417 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1418 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
1419 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1420 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
1421 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1422 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
1423 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
1424 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
1425 ; GFX10-NEXT: v_mov_b32_e32 v3, s7
1426 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
1427 ; GFX10-NEXT: s_endpgm
1428 %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16
1429 %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1430 store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8
1434 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
1435 declare i32 @llvm.amdgcn.workitem.id.x() #0
1437 attributes #0 = { nounwind readnone speculatable }