1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; Mask <2, 3, undef, undef>: result lanes 0-1 take lanes 2-3 of %val0 and
; result lanes 2-3 are undef. No lane of %val1 is selected by the mask.
4 define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
5 ; GFX9-LABEL: shuffle_v4f16_23uu:
7 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
9 ; GFX9-NEXT: s_waitcnt vmcnt(0)
10 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
11 ; GFX9-NEXT: s_setpc_b64 s[30:31]
12 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
13 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
14 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
15 ret <4 x half> %shuffle
; Mask <2, 3, 4, undef>: result lanes 0-1 take lanes 2-3 of %val0, result
; lane 2 takes lane 0 of %val1 (mask index 4), and result lane 3 is undef.
18 define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
19 ; GFX9-LABEL: shuffle_v4f16_234u:
21 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
23 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
24 ; GFX9-NEXT: s_waitcnt vmcnt(0)
25 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
26 ; GFX9-NEXT: v_mov_b32_e32 v1, v2
27 ; GFX9-NEXT: s_setpc_b64 s[30:31]
28 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
29 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
30 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
31 ret <4 x half> %shuffle
; Mask <undef, 1, undef, 3>: result lanes 1 and 3 keep lanes 1 and 3 of %val0
; in place; result lanes 0 and 2 are undef. %val1 is not selected by the mask.
34 define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
35 ; GFX9-LABEL: shuffle_v4f16_u1u3:
37 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
39 ; GFX9-NEXT: s_waitcnt vmcnt(0)
40 ; GFX9-NEXT: s_setpc_b64 s[30:31]
41 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
42 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
43 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
44 ret <4 x half> %shuffle
; Mask <undef, 3, undef, 1>: result lane 1 takes lane 3 of %val0 and result
; lane 3 takes lane 1 of %val0; result lanes 0 and 2 are undef.
47 define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
48 ; GFX9-LABEL: shuffle_v4f16_u3u1:
50 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
52 ; GFX9-NEXT: s_waitcnt vmcnt(0)
53 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
54 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
55 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
56 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2
57 ; GFX9-NEXT: s_setpc_b64 s[30:31]
58 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
59 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
60 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
61 ret <4 x half> %shuffle
; Mask <undef, 3, undef, undef>: only result lane 1 is defined, taking lane 3
; of %val0; every other result lane is undef.
64 define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
65 ; GFX9-LABEL: shuffle_v4f16_u3uu:
67 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
69 ; GFX9-NEXT: s_waitcnt vmcnt(0)
70 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
71 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
72 ; GFX9-NEXT: s_setpc_b64 s[30:31]
73 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
74 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
75 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
76 ret <4 x half> %shuffle
; Mask <3, undef, 6, undef>: result lane 0 takes lane 3 of %val0 and result
; lane 2 takes lane 2 of %val1 (mask index 6); result lanes 1 and 3 are undef.
79 define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
80 ; GFX9-LABEL: shuffle_v4f16_3u6u:
82 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
84 ; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4
85 ; GFX9-NEXT: s_waitcnt vmcnt(1)
86 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
87 ; GFX9-NEXT: s_waitcnt vmcnt(0)
88 ; GFX9-NEXT: s_setpc_b64 s[30:31]
89 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
90 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
91 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
92 ret <4 x half> %shuffle
; Mask <3, undef, undef, 7>: result lane 0 takes lane 3 of %val0 and result
; lane 3 takes lane 3 of %val1 (mask index 7); result lanes 1 and 2 are undef.
95 define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
96 ; GFX9-LABEL: shuffle_v4f16_3uu7:
98 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99 ; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
100 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
101 ; GFX9-NEXT: s_waitcnt vmcnt(1)
102 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
103 ; GFX9-NEXT: s_waitcnt vmcnt(0)
104 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
105 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
106 ; GFX9-NEXT: s_setpc_b64 s[30:31]
107 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
108 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
109 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
110 ret <4 x half> %shuffle
; Mask <3, 5, undef, 5>: result lane 0 takes lane 3 of %val0; result lanes 1
; and 3 both take lane 1 of %val1 (mask index 5); result lane 2 is undef.
113 define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
114 ; GFX9-LABEL: shuffle_v4f16_35u5:
116 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
118 ; GFX9-NEXT: global_load_dword v1, v[2:3], off
119 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
120 ; GFX9-NEXT: s_waitcnt vmcnt(1)
121 ; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
122 ; GFX9-NEXT: s_waitcnt vmcnt(0)
123 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
124 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
125 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
126 ; GFX9-NEXT: s_setpc_b64 s[30:31]
127 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
128 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
129 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
130 ret <4 x half> %shuffle
; Mask <3, 5, 7, undef>: result lane 0 takes lane 3 of %val0, result lanes 1
; and 2 take lanes 1 and 3 of %val1 (mask indices 5 and 7); lane 3 is undef.
133 define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
134 ; GFX9-LABEL: shuffle_v4f16_357u:
136 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
138 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
139 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
140 ; GFX9-NEXT: s_waitcnt vmcnt(1)
141 ; GFX9-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
142 ; GFX9-NEXT: s_waitcnt vmcnt(0)
143 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
144 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
145 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
146 ; GFX9-NEXT: s_setpc_b64 s[30:31]
147 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
148 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
149 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
150 ret <4 x half> %shuffle
; Mask <0, 1, 0, 1>: lanes 0-1 of %val0 are repeated in both halves of the
; result. %val1 is not selected by the mask.
153 define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
154 ; GFX9-LABEL: shuffle_v4f16_0101:
156 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
158 ; GFX9-NEXT: s_waitcnt vmcnt(0)
159 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
160 ; GFX9-NEXT: s_setpc_b64 s[30:31]
161 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
162 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
163 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
164 ret <4 x half> %shuffle
; Mask <0, 1, 2, 3>: identity shuffle that returns every lane of %val0 in
; order. %val1 is not selected by the mask.
167 define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
168 ; GFX9-LABEL: shuffle_v4f16_0123:
170 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
172 ; GFX9-NEXT: s_waitcnt vmcnt(0)
173 ; GFX9-NEXT: s_setpc_b64 s[30:31]
174 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
175 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
176 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
177 ret <4 x half> %shuffle
; Mask <0, 1, 4, 5>: result lanes 0-1 take lanes 0-1 of %val0 and result
; lanes 2-3 take lanes 0-1 of %val1 (mask indices 4 and 5).
180 define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
181 ; GFX9-LABEL: shuffle_v4f16_0145:
183 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
184 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
185 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
186 ; GFX9-NEXT: s_waitcnt vmcnt(0)
187 ; GFX9-NEXT: v_mov_b32_e32 v1, v2
188 ; GFX9-NEXT: s_setpc_b64 s[30:31]
189 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
190 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
191 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
192 ret <4 x half> %shuffle
; Mask <0, 1, 6, 7>: result lanes 0-1 take lanes 0-1 of %val0 and result
; lanes 2-3 take lanes 2-3 of %val1 (mask indices 6 and 7).
195 define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
196 ; GFX9-LABEL: shuffle_v4f16_0167:
198 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
199 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
200 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
201 ; GFX9-NEXT: s_waitcnt vmcnt(0)
202 ; GFX9-NEXT: v_mov_b32_e32 v1, v3
203 ; GFX9-NEXT: s_setpc_b64 s[30:31]
204 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
205 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
206 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
207 ret <4 x half> %shuffle
; Mask <2, 3, 0, 1>: swaps the two lane pairs of %val0 (lanes 2-3 first, then
; lanes 0-1). %val1 is not selected by the mask.
210 define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
211 ; GFX9-LABEL: shuffle_v4f16_2301:
213 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
214 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
215 ; GFX9-NEXT: s_waitcnt vmcnt(0)
216 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
217 ; GFX9-NEXT: s_setpc_b64 s[30:31]
218 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
219 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
220 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
221 ret <4 x half> %shuffle
; Mask <2, 3, 2, 3>: lanes 2-3 of %val0 are repeated in both halves of the
; result. %val1 is not selected by the mask.
224 define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
225 ; GFX9-LABEL: shuffle_v4f16_2323:
227 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
229 ; GFX9-NEXT: s_waitcnt vmcnt(0)
230 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
231 ; GFX9-NEXT: s_setpc_b64 s[30:31]
232 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
233 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
234 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
235 ret <4 x half> %shuffle
; Mask <2, 3, 4, 5>: result lanes 0-1 take lanes 2-3 of %val0 and result
; lanes 2-3 take lanes 0-1 of %val1 (mask indices 4 and 5).
238 define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
239 ; GFX9-LABEL: shuffle_v4f16_2345:
241 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
243 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
244 ; GFX9-NEXT: s_waitcnt vmcnt(0)
245 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
246 ; GFX9-NEXT: v_mov_b32_e32 v1, v2
247 ; GFX9-NEXT: s_setpc_b64 s[30:31]
248 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
249 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
250 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
251 ret <4 x half> %shuffle
; Mask <2, 3, 6, 7>: result lanes 0-1 take lanes 2-3 of %val0 and result
; lanes 2-3 take lanes 2-3 of %val1 (mask indices 6 and 7).
254 define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
255 ; GFX9-LABEL: shuffle_v4f16_2367:
257 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
258 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
259 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
260 ; GFX9-NEXT: s_waitcnt vmcnt(0)
261 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
262 ; GFX9-NEXT: v_mov_b32_e32 v1, v3
263 ; GFX9-NEXT: s_setpc_b64 s[30:31]
264 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
265 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
266 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
267 ret <4 x half> %shuffle
; Mask <4, 5, 0, 1>: result lanes 0-1 take lanes 0-1 of %val1 (mask indices 4
; and 5) and result lanes 2-3 take lanes 0-1 of %val0.
270 define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
271 ; GFX9-LABEL: shuffle_v4f16_4501:
273 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
274 ; GFX9-NEXT: global_load_dwordx2 v[3:4], v[2:3], off
275 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
276 ; GFX9-NEXT: s_waitcnt vmcnt(1)
277 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
278 ; GFX9-NEXT: s_waitcnt vmcnt(0)
279 ; GFX9-NEXT: s_setpc_b64 s[30:31]
280 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
281 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
282 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
283 ret <4 x half> %shuffle
; Mask <4, 5, 2, 3>: result lanes 0-1 take lanes 0-1 of %val1 (mask indices 4
; and 5) and result lanes 2-3 keep lanes 2-3 of %val0.
286 define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
287 ; GFX9-LABEL: shuffle_v4f16_4523:
289 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
290 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
291 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
292 ; GFX9-NEXT: s_waitcnt vmcnt(0)
293 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
294 ; GFX9-NEXT: s_setpc_b64 s[30:31]
295 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
296 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
297 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
298 ret <4 x half> %shuffle
; Mask <4, 5, 4, 5>: lanes 0-1 of %val1 (mask indices 4 and 5) are repeated in
; both halves of the result. %val0 is not selected by the mask.
301 define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
302 ; GFX9-LABEL: shuffle_v4f16_4545:
304 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
306 ; GFX9-NEXT: s_waitcnt vmcnt(0)
307 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
308 ; GFX9-NEXT: s_setpc_b64 s[30:31]
309 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
310 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
311 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
312 ret <4 x half> %shuffle
; Mask <4, 5, 6, 7>: returns every lane of %val1 in order (mask indices 4-7
; address the second operand). %val0 is not selected by the mask.
315 define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
316 ; GFX9-LABEL: shuffle_v4f16_4567:
318 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
319 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
320 ; GFX9-NEXT: s_waitcnt vmcnt(0)
321 ; GFX9-NEXT: s_setpc_b64 s[30:31]
322 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
323 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
324 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
325 ret <4 x half> %shuffle
; Mask <6, 7, 0, 1>: result lanes 0-1 take lanes 2-3 of %val1 (mask indices 6
; and 7) and result lanes 2-3 take lanes 0-1 of %val0.
; The expected code waits between the two loads; the second load's destination
; v[1:2] overlaps the first load's destination v[2:3].
328 define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
329 ; GFX9-LABEL: shuffle_v4f16_6701:
331 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
333 ; GFX9-NEXT: s_waitcnt vmcnt(0)
334 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
335 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
336 ; GFX9-NEXT: s_waitcnt vmcnt(0)
337 ; GFX9-NEXT: s_setpc_b64 s[30:31]
338 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
339 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
340 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
341 ret <4 x half> %shuffle
; Mask <6, 7, 2, 3>: result lanes 0-1 take lanes 2-3 of %val1 (mask indices 6
; and 7) and result lanes 2-3 keep lanes 2-3 of %val0.
344 define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
345 ; GFX9-LABEL: shuffle_v4f16_6723:
347 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
348 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
349 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
350 ; GFX9-NEXT: s_waitcnt vmcnt(0)
351 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
352 ; GFX9-NEXT: s_setpc_b64 s[30:31]
353 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
354 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
355 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
356 ret <4 x half> %shuffle
; Mask <6, 7, 4, 5>: swaps the two lane pairs of %val1 (its lanes 2-3 first,
; then its lanes 0-1). %val0 is not selected by the mask.
359 define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
360 ; GFX9-LABEL: shuffle_v4f16_6745:
362 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
364 ; GFX9-NEXT: s_waitcnt vmcnt(0)
365 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
366 ; GFX9-NEXT: s_setpc_b64 s[30:31]
367 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
368 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
369 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
370 ret <4 x half> %shuffle
; Mask <6, 7, 6, 7>: lanes 2-3 of %val1 (mask indices 6 and 7) are repeated in
; both halves of the result. %val0 is not selected by the mask.
373 define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
374 ; GFX9-LABEL: shuffle_v4f16_6767:
376 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
378 ; GFX9-NEXT: s_waitcnt vmcnt(0)
379 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
380 ; GFX9-NEXT: s_setpc_b64 s[30:31]
381 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
382 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
383 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
384 ret <4 x half> %shuffle
; Mask <2, 3, 5, 6>: result lanes 0-1 take lanes 2-3 of %val0; result lanes
; 2-3 take lanes 1 and 2 of %val1 (mask indices 5 and 6), a pair that starts
; on an odd lane of the source vector.
387 define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
388 ; GFX9-LABEL: shuffle_v4f16_2356:
390 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
392 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
393 ; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff
394 ; GFX9-NEXT: s_waitcnt vmcnt(1)
395 ; GFX9-NEXT: v_and_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
396 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v0
397 ; GFX9-NEXT: s_waitcnt vmcnt(0)
398 ; GFX9-NEXT: v_mov_b32_e32 v0, v5
399 ; GFX9-NEXT: s_setpc_b64 s[30:31]
400 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
401 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
402 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
403 ret <4 x half> %shuffle
; Mask <5, 6, 2, 3>: result lanes 0-1 take lanes 1 and 2 of %val1 (mask
; indices 5 and 6); result lanes 2-3 keep lanes 2-3 of %val0.
406 define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
407 ; GFX9-LABEL: shuffle_v4f16_5623:
409 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
411 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
412 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
413 ; GFX9-NEXT: s_waitcnt vmcnt(0)
414 ; GFX9-NEXT: v_and_b32_sdwa v0, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
415 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
416 ; GFX9-NEXT: s_setpc_b64 s[30:31]
417 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
418 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
419 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
420 ret <4 x half> %shuffle
; Mask <3, 4, 5, 6>: result lane 0 takes lane 3 of %val0; result lanes 1-3
; take lanes 0, 1 and 2 of %val1 (mask indices 4, 5 and 6).
423 define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
424 ; GFX9-LABEL: shuffle_v4f16_3456:
426 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
427 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
428 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
429 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
430 ; GFX9-NEXT: s_waitcnt vmcnt(1)
431 ; GFX9-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
432 ; GFX9-NEXT: s_waitcnt vmcnt(0)
433 ; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
434 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3
435 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
436 ; GFX9-NEXT: s_setpc_b64 s[30:31]
437 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
438 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
439 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
440 ret <4 x half> %shuffle
; Mask <5, 6, 3, 4>: result lanes 0-1 take lanes 1 and 2 of %val1 (mask
; indices 5 and 6); result lane 2 takes lane 3 of %val0 and result lane 3
; takes lane 0 of %val1 (mask index 4).
443 define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
444 ; GFX9-LABEL: shuffle_v4f16_5634:
446 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
447 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
448 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
449 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
450 ; GFX9-NEXT: s_waitcnt vmcnt(1)
451 ; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
452 ; GFX9-NEXT: s_waitcnt vmcnt(0)
453 ; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
454 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
455 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v3
456 ; GFX9-NEXT: s_setpc_b64 s[30:31]
457 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
458 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
459 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
460 ret <4 x half> %shuffle
; Mask <5, 7, 3, 4>: result lanes 0-1 take lanes 1 and 3 of %val1 (mask
; indices 5 and 7); result lane 2 takes lane 3 of %val0 and result lane 3
; takes lane 0 of %val1 (mask index 4).
463 define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
464 ; GFX9-LABEL: shuffle_v4f16_5734:
466 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
468 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
469 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
470 ; GFX9-NEXT: s_waitcnt vmcnt(1)
471 ; GFX9-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
472 ; GFX9-NEXT: s_waitcnt vmcnt(0)
473 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
474 ; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
475 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v3
476 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v2
477 ; GFX9-NEXT: s_setpc_b64 s[30:31]
478 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
479 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
480 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
481 ret <4 x half> %shuffle
; Integer (<4 x i16>) variant using mask <2, 3, 5, 6>: result lanes 0-1 take
; lanes 2-3 of %val0; result lanes 2-3 take lanes 1 and 2 of %val1 (mask
; indices 5 and 6).
484 define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
485 ; GFX9-LABEL: shuffle_v4i16_2356:
487 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
488 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
489 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
490 ; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff
491 ; GFX9-NEXT: s_waitcnt vmcnt(1)
492 ; GFX9-NEXT: v_and_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
493 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v0
494 ; GFX9-NEXT: s_waitcnt vmcnt(0)
495 ; GFX9-NEXT: v_mov_b32_e32 v0, v5
496 ; GFX9-NEXT: s_setpc_b64 s[30:31]
497 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
498 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
499 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
500 ret <4 x i16> %shuffle
; Integer (<4 x i16>) variant using mask <0, 1, 6, 7>: result lanes 0-1 take
; lanes 0-1 of %val0; result lanes 2-3 take lanes 2-3 of %val1 (mask indices
; 6 and 7).
503 define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
504 ; GFX9-LABEL: shuffle_v4i16_0167:
506 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
507 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
508 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
509 ; GFX9-NEXT: s_waitcnt vmcnt(0)
510 ; GFX9-NEXT: v_mov_b32_e32 v1, v3
511 ; GFX9-NEXT: s_setpc_b64 s[30:31]
512 %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
513 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
514 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
515 ret <4 x i16> %shuffle
; Mask zeroinitializer (<0, 0, 0, 0>): splats lane 0 of %val0 into all four
; result lanes. %val1 is not selected by the mask.
518 define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
519 ; GFX9-LABEL: shuffle_v4f16_0000:
521 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
522 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
523 ; GFX9-NEXT: s_waitcnt vmcnt(0)
524 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
525 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
526 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
527 ; GFX9-NEXT: s_setpc_b64 s[30:31]
528 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
529 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
530 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
531 ret <4 x half> %shuffle
; Mask <1, 0, 1, 0>: lanes 1 and 0 of %val0 (in that swapped order) are
; repeated in both halves of the result. %val1 is not selected by the mask.
534 define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
535 ; GFX9-LABEL: shuffle_v4f16_1010:
537 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
539 ; GFX9-NEXT: s_waitcnt vmcnt(0)
540 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
541 ; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
542 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
543 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
544 ; GFX9-NEXT: s_setpc_b64 s[30:31]
545 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
546 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
547 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
548 ret <4 x half> %shuffle
; Mask <1, 1, 0, 0>: result lanes 0-1 both take lane 1 of %val0 and result
; lanes 2-3 both take lane 0 of %val0. %val1 is not selected by the mask.
551 define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
552 ; GFX9-LABEL: shuffle_v4f16_1100:
554 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
556 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
557 ; GFX9-NEXT: s_waitcnt vmcnt(0)
558 ; GFX9-NEXT: v_and_b32_e32 v1, v2, v0
559 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
560 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
561 ; GFX9-NEXT: v_and_b32_e32 v0, v2, v3
562 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
563 ; GFX9-NEXT: s_setpc_b64 s[30:31]
564 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
565 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
566 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
567 ret <4 x half> %shuffle
; Mask <6, 1, 6, 1>: the pair (lane 2 of %val1 via mask index 6, lane 1 of
; %val0) is repeated in both halves of the result.
570 define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
571 ; GFX9-LABEL: shuffle_v4f16_6161:
573 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
574 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
575 ; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4
576 ; GFX9-NEXT: s_waitcnt vmcnt(1)
577 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
578 ; GFX9-NEXT: s_waitcnt vmcnt(0)
579 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
580 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
581 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
582 ; GFX9-NEXT: s_setpc_b64 s[30:31]
583 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
584 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
585 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
586 ret <4 x half> %shuffle
; Mask <2, 3, 3, 3>: result lane 0 takes lane 2 of %val0 and result lanes 1-3
; all take lane 3 of %val0. %val1 is not selected by the mask.
589 define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
590 ; GFX9-LABEL: shuffle_v4f16_2333:
592 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
593 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
594 ; GFX9-NEXT: s_waitcnt vmcnt(0)
595 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
596 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
597 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
598 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
599 ; GFX9-NEXT: s_setpc_b64 s[30:31]
600 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
601 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
602 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
603 ret <4 x half> %shuffle
; Mask as written is <2, 3, 3, 3>: result lane 0 takes lane 2 of %val0 and
; result lanes 1-3 all take lane 3 of %val0.
; NOTE(review): the function name implies mask <6, 6, 6, 7> (lanes of %val1),
; but the mask below is identical to the one in shuffle_v4f16_2333, so %val1
; is never selected and this case duplicates that one. Confirm the intended
; mask; changing it requires regenerating the autogenerated assertions.
606 define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
607 ; GFX9-LABEL: shuffle_v4f16_6667:
609 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
610 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
611 ; GFX9-NEXT: s_waitcnt vmcnt(0)
612 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
613 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
614 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
615 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
616 ; GFX9-NEXT: s_setpc_b64 s[30:31]
617 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
618 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
619 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
620 ret <4 x half> %shuffle
; Narrowing shuffle of <8 x half> inputs to a <4 x half> result with mask
; <0, 1, 0, 1>: lanes 0-1 of %val0 are repeated in both halves of the result.
623 define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
624 ; GFX9-LABEL: shuffle_v8f16_0101:
626 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
627 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
628 ; GFX9-NEXT: s_waitcnt vmcnt(0)
629 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
630 ; GFX9-NEXT: s_setpc_b64 s[30:31]
631 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
632 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
633 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
634 ret <4 x half> %shuffle
; Narrowing shuffle of <8 x half> inputs to a <4 x half> result with mask
; <0, 1, 2, 3>: extracts the first four lanes of %val0 in order.
637 define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
638 ; GFX9-LABEL: shuffle_v8f16_0123:
640 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
641 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
642 ; GFX9-NEXT: s_waitcnt vmcnt(0)
643 ; GFX9-NEXT: s_setpc_b64 s[30:31]
644 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
645 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
646 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
647 ret <4 x half> %shuffle
; Narrowing shuffle of <8 x half> inputs with mask <4, 5, 8, 9>: result lanes
; 0-1 take lanes 4-5 of %val0 and result lanes 2-3 take lanes 0-1 of %val1
; (mask indices 8 and 9 address the second operand).
650 define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
651 ; GFX9-LABEL: shuffle_v8f16_4589:
653 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
654 ; GFX9-NEXT: global_load_dword v2, v[2:3], off
655 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:8
656 ; GFX9-NEXT: s_waitcnt vmcnt(1)
657 ; GFX9-NEXT: v_mov_b32_e32 v1, v2
658 ; GFX9-NEXT: s_waitcnt vmcnt(0)
659 ; GFX9-NEXT: s_setpc_b64 s[30:31]
660 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
661 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
662 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
663 ret <4 x half> %shuffle
; Narrowing shuffle of <8 x half> inputs with mask <10, 11, 2, 3>: result
; lanes 0-1 take lanes 2-3 of %val1 (mask indices 10 and 11) and result lanes
; 2-3 take lanes 2-3 of %val0.
666 define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
667 ; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
669 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
670 ; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
671 ; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
672 ; GFX9-NEXT: s_waitcnt vmcnt(1)
673 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
674 ; GFX9-NEXT: s_waitcnt vmcnt(0)
675 ; GFX9-NEXT: s_setpc_b64 s[30:31]
676 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
677 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
678 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
679 ret <4 x half> %shuffle
; Narrowing shuffle of <8 x half> inputs with mask <13, 14, 2, 3>: result
; lanes 0-1 take lanes 5 and 6 of %val1 (mask indices 13 and 14) and result
; lanes 2-3 take lanes 2-3 of %val0.
682 define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
683 ; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
685 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
686 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
687 ; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
688 ; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff
689 ; GFX9-NEXT: s_waitcnt vmcnt(1)
690 ; GFX9-NEXT: v_and_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
691 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0
692 ; GFX9-NEXT: s_waitcnt vmcnt(0)
693 ; GFX9-NEXT: s_setpc_b64 s[30:31]
694 %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
695 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
696 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
697 ret <4 x half> %shuffle
; Widening shuffle from <3 x half> to <4 x half>: the mask keeps elements
; 0,1,2 of %val0 and repeats element 2 in the last lane. Every mask index is
; below 3, so %val1 is loaded but never selected from. The expected code
; contains a single dwordx2 load and duplicates the low half of the second
; dword with v_and_b32 / v_lshl_or_b32.
700 define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
701 ; GFX9-LABEL: shuffle_v3f16_0122:
703 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
704 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
705 ; GFX9-NEXT: s_waitcnt vmcnt(0)
706 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
707 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
708 ; GFX9-NEXT: s_setpc_b64 s[30:31]
709 %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
710 %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
711 %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
712 ret <4 x half> %shuffle
; Widening shuffle from <2 x half> to <4 x half>: the mask is <0, 1, 1, 0>,
; so the result is %val0 followed by %val0 with its two halves swapped.
; Every mask index is below 2, so %val1 is loaded but never selected from.
; The expected code contains a single dword load and builds the swapped pair
; with v_and_b32_sdwa (WORD_1 select) and v_lshl_or_b32.
; NOTE(review): the function name suggests mask <0, 1, 2, 2>, but the mask
; actually used is <0, 1, 1, 0> and the checks match that mask. Confirm
; whether the name or the mask reflects the intended test before changing
; either one.
715 define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
716 ; GFX9-LABEL: shuffle_v2f16_0122:
718 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
719 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
720 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
721 ; GFX9-NEXT: s_waitcnt vmcnt(0)
722 ; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
723 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
724 ; GFX9-NEXT: s_setpc_b64 s[30:31]
725 %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
726 %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
727 %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
728 ret <4 x half> %shuffle
; Two-source shuffle of <6 x half> vectors with a <6 x half> result: the mask
; picks elements 4,5 then 2,3 of %val0, followed by elements 0,1 of %val1
; (mask indices 6,7). All three selected pairs start on an even element.
; The expected code is a dwordx3 load, a single-dword load and two register
; moves, with no and/shift repacking.
731 define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
732 ; GFX9-LABEL: shuffle_v6f16_452367:
734 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
735 ; GFX9-NEXT: global_load_dword v3, v[2:3], off
736 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
737 ; GFX9-NEXT: s_waitcnt vmcnt(0)
738 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
739 ; GFX9-NEXT: v_mov_b32_e32 v2, v3
740 ; GFX9-NEXT: s_setpc_b64 s[30:31]
741 %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
742 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
743 %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
744 ret <6 x half> %shuffle
; Kernel that loads one <4 x half> from each of %A, %B and %C at the index
; given by llvm.amdgcn.workitem.id.x, combines them with four llvm.fma.v2f16
; calls whose operands are produced by shufflevector (splats of single
; elements of the %A vector, and 2-element halves of the %B and %C vectors),
; and stores the <4 x half> result back to the same element of %C.
; In the expected code the only arithmetic between the loads and the store
; is four v_pk_fma_f16 instructions carrying op_sel / op_sel_hi modifiers.
747 define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) {
748 ; GFX9-LABEL: fma_shuffle:
749 ; GFX9: ; %bb.0: ; %entry
750 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
751 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
752 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
753 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
754 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
755 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4
756 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
757 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
758 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4
759 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
760 ; GFX9-NEXT: v_mov_b32_e32 v5, s5
761 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
762 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
763 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off
764 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
765 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
766 ; GFX9-NEXT: s_waitcnt vmcnt(0)
767 ; GFX9-NEXT: v_pk_fma_f16 v6, v0, v2, v6 op_sel_hi:[0,1,1]
768 ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v7 op_sel_hi:[0,1,1]
769 ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v6 op_sel:[1,0,0]
770 ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
771 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
772 ; GFX9-NEXT: s_endpgm
; Index the three arrays by the zero-extended workitem id and load one
; <4 x half> from each: %tmp14 from %A, %tmp15 from %B, %tmp16 from %C.
774 %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
775 %tmp12 = zext i32 %tmp1 to i64
776 %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
777 %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
778 %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
779 %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8
780 %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12
781 %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8
; First half of the result:
;   %tmp23 = fma(splat %tmp14[1], %tmp15[2..3],
;                fma(splat %tmp14[0], %tmp15[0..1], %tmp16[0..1]))
782 %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
783 %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
784 %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
785 %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
786 %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
787 %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
788 %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
; Widen %tmp23 to four lanes and fill lanes 2,3 from %tmp16 (mask indices
; 6,7 select elements 2,3 of the second operand).
789 %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
790 %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
; Second half of the result; %tmp27 extracts lanes 2,3 of %tmp25, i.e. the
; %tmp16[2..3] values placed there above:
;   %tmp30 = fma(splat %tmp14[3], %tmp15[2..3],
;                fma(splat %tmp14[2], %tmp15[0..1], %tmp16[2..3]))
791 %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
792 %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
793 %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
794 %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
795 %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
; Concatenate the two halves (lanes 0,1 of %tmp25 and lanes 0,1 of %tmp31)
; and write the vector back through the %C element pointer.
796 %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
797 %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
798 store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8
; Two-source shuffle of <4 x half> vectors: the mask picks element 0 of %val0
; followed by elements 0,1,2 of %val1 (mask indices 4,5,6). Neither result
; pair is an aligned pair of one source. The expected code loads both
; vectors with dwordx2 loads and assembles each result dword from a masked
; low half (v_and_b32 / v_and_b32_sdwa) combined through v_lshl_or_b32.
802 define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
803 ; GFX9-LABEL: shuffle_v4f16_0456:
805 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
806 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
807 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
808 ; GFX9-NEXT: s_waitcnt vmcnt(0)
809 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
810 ; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
811 ; GFX9-NEXT: v_and_b32_e32 v0, v1, v0
812 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
813 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v4
814 ; GFX9-NEXT: s_setpc_b64 s[30:31]
815 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
816 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
817 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
818 ret <4 x half> %shuffle
; Intrinsics called by @fma_shuffle: the packed-half fused multiply-add and
; the workitem id query. Both declarations use attribute group #0.
821 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
822 declare i32 @llvm.amdgcn.workitem.id.x() #0
; Attribute group shared by the two intrinsic declarations above.
824 attributes #0 = { nounwind readnone speculatable }