1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
; Rebroadcast of element 1 of a <2 x i8> vector loaded from the addrspace(1)
; pointer %arg0. The shuffle mask is <1, 1>, so both result lanes select
; source element 1.
; The assertions below expect, for each of gfx900, gfx1010 and gfx1100, one
; 16-bit load, a right shift of v0 by 8, and a copy of v0 into v1.
; The gfx1100 sequence additionally expects an s_delay_alu before the copy.
5 define <2 x i8> @shuffle_v2i8_rebroadcast(ptr addrspace(1) %arg0) {
6 ; GFX9-LABEL: shuffle_v2i8_rebroadcast:
7 ; GFX9: ; %bb.0: ; %entry
8 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
10 ; GFX9-NEXT: s_waitcnt vmcnt(0)
11 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
12 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
13 ; GFX9-NEXT: s_setpc_b64 s[30:31]
15 ; GFX10-LABEL: shuffle_v2i8_rebroadcast:
16 ; GFX10: ; %bb.0: ; %entry
17 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
19 ; GFX10-NEXT: s_waitcnt vmcnt(0)
20 ; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
21 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
22 ; GFX10-NEXT: s_setpc_b64 s[30:31]
24 ; GFX11-LABEL: shuffle_v2i8_rebroadcast:
25 ; GFX11: ; %bb.0: ; %entry
26 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
28 ; GFX11-NEXT: s_waitcnt vmcnt(0)
29 ; GFX11-NEXT: v_lshrrev_b16 v0, 8, v0
30 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
31 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
32 ; GFX11-NEXT: s_setpc_b64 s[30:31]
34 %val0 = load <2 x i8>, ptr addrspace(1) %arg0
35 %val1 = shufflevector <2 x i8> %val0, <2 x i8> poison, <2 x i32> <i32 1, i32 1>
; Rebroadcast of element 1 of a <4 x i8> vector loaded from the addrspace(1)
; pointer %arg0. Every index in the shuffle mask is 1, so all four result
; lanes select source element 1.
; The assertions below expect, for each of gfx900, gfx1010 and gfx1100, one
; dword load, a right shift of v0 by 8, and copies of v0 into v1..v3.
; The gfx1100 sequence additionally expects an s_delay_alu before the copies.
39 define <4 x i8> @shuffle_v4i8_rebroadcast(ptr addrspace(1) %arg0) {
40 ; GFX9-LABEL: shuffle_v4i8_rebroadcast:
41 ; GFX9: ; %bb.0: ; %entry
42 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
43 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
44 ; GFX9-NEXT: s_waitcnt vmcnt(0)
45 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
46 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
47 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
48 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
49 ; GFX9-NEXT: s_setpc_b64 s[30:31]
51 ; GFX10-LABEL: shuffle_v4i8_rebroadcast:
52 ; GFX10: ; %bb.0: ; %entry
53 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
55 ; GFX10-NEXT: s_waitcnt vmcnt(0)
56 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
57 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
58 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
59 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
60 ; GFX10-NEXT: s_setpc_b64 s[30:31]
62 ; GFX11-LABEL: shuffle_v4i8_rebroadcast:
63 ; GFX11: ; %bb.0: ; %entry
64 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
65 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
66 ; GFX11-NEXT: s_waitcnt vmcnt(0)
67 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
68 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
69 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
70 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
71 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
72 ; GFX11-NEXT: s_setpc_b64 s[30:31]
74 %val0 = load <4 x i8>, ptr addrspace(1) %arg0
75 %val1 = shufflevector <4 x i8> %val0, <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; Rebroadcast of element 1 of a <8 x i8> vector loaded from the addrspace(1)
; pointer %arg0. Every index in the shuffle mask is 1, so all eight result
; lanes select source element 1.
; The assertions below expect, for each of gfx900, gfx1010 and gfx1100, only
; a single dword load (not the whole 8-byte vector), a right shift of v0 by
; 8, and copies of v0 into v1..v7.
; The gfx1100 sequence additionally expects an s_delay_alu before the copies.
79 define <8 x i8> @shuffle_v8i8_rebroadcast(ptr addrspace(1) %arg0) {
80 ; GFX9-LABEL: shuffle_v8i8_rebroadcast:
81 ; GFX9: ; %bb.0: ; %entry
82 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
84 ; GFX9-NEXT: s_waitcnt vmcnt(0)
85 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
86 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
87 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
88 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
89 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
90 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
91 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
92 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
93 ; GFX9-NEXT: s_setpc_b64 s[30:31]
95 ; GFX10-LABEL: shuffle_v8i8_rebroadcast:
96 ; GFX10: ; %bb.0: ; %entry
97 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
99 ; GFX10-NEXT: s_waitcnt vmcnt(0)
100 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
101 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
102 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
103 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
104 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
105 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
106 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
107 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
108 ; GFX10-NEXT: s_setpc_b64 s[30:31]
110 ; GFX11-LABEL: shuffle_v8i8_rebroadcast:
111 ; GFX11: ; %bb.0: ; %entry
112 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
114 ; GFX11-NEXT: s_waitcnt vmcnt(0)
115 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
116 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
117 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
118 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
119 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
120 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
121 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
122 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
123 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
124 ; GFX11-NEXT: s_setpc_b64 s[30:31]
126 %val0 = load <8 x i8>, ptr addrspace(1) %arg0
127 %val1 = shufflevector <8 x i8> %val0, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Rebroadcast of element 1 of a <16 x i8> vector loaded from the addrspace(1)
; pointer %arg0. Every index in the shuffle mask is 1, so all sixteen result
; lanes select source element 1.
; The assertions below expect, for each of gfx900, gfx1010 and gfx1100, only
; a single dword load (not the whole 16-byte vector), a right shift of v0 by
; 8, and copies of v0 into v1..v15.
; The gfx1100 sequence additionally expects an s_delay_alu before the copies.
131 define <16 x i8> @shuffle_v16i8_rebroadcast(ptr addrspace(1) %arg0) {
132 ; GFX9-LABEL: shuffle_v16i8_rebroadcast:
133 ; GFX9: ; %bb.0: ; %entry
134 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
136 ; GFX9-NEXT: s_waitcnt vmcnt(0)
137 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
138 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
139 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
140 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
141 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
142 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
143 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
144 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
145 ; GFX9-NEXT: v_mov_b32_e32 v8, v0
146 ; GFX9-NEXT: v_mov_b32_e32 v9, v0
147 ; GFX9-NEXT: v_mov_b32_e32 v10, v0
148 ; GFX9-NEXT: v_mov_b32_e32 v11, v0
149 ; GFX9-NEXT: v_mov_b32_e32 v12, v0
150 ; GFX9-NEXT: v_mov_b32_e32 v13, v0
151 ; GFX9-NEXT: v_mov_b32_e32 v14, v0
152 ; GFX9-NEXT: v_mov_b32_e32 v15, v0
153 ; GFX9-NEXT: s_setpc_b64 s[30:31]
155 ; GFX10-LABEL: shuffle_v16i8_rebroadcast:
156 ; GFX10: ; %bb.0: ; %entry
157 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
159 ; GFX10-NEXT: s_waitcnt vmcnt(0)
160 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
161 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
162 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
163 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
164 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
165 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
166 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
167 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
168 ; GFX10-NEXT: v_mov_b32_e32 v8, v0
169 ; GFX10-NEXT: v_mov_b32_e32 v9, v0
170 ; GFX10-NEXT: v_mov_b32_e32 v10, v0
171 ; GFX10-NEXT: v_mov_b32_e32 v11, v0
172 ; GFX10-NEXT: v_mov_b32_e32 v12, v0
173 ; GFX10-NEXT: v_mov_b32_e32 v13, v0
174 ; GFX10-NEXT: v_mov_b32_e32 v14, v0
175 ; GFX10-NEXT: v_mov_b32_e32 v15, v0
176 ; GFX10-NEXT: s_setpc_b64 s[30:31]
178 ; GFX11-LABEL: shuffle_v16i8_rebroadcast:
179 ; GFX11: ; %bb.0: ; %entry
180 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
181 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
182 ; GFX11-NEXT: s_waitcnt vmcnt(0)
183 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
184 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
185 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
186 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
187 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
188 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
189 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
190 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
191 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
192 ; GFX11-NEXT: v_mov_b32_e32 v8, v0
193 ; GFX11-NEXT: v_mov_b32_e32 v9, v0
194 ; GFX11-NEXT: v_mov_b32_e32 v10, v0
195 ; GFX11-NEXT: v_mov_b32_e32 v11, v0
196 ; GFX11-NEXT: v_mov_b32_e32 v12, v0
197 ; GFX11-NEXT: v_mov_b32_e32 v13, v0
198 ; GFX11-NEXT: v_mov_b32_e32 v14, v0
199 ; GFX11-NEXT: v_mov_b32_e32 v15, v0
200 ; GFX11-NEXT: s_setpc_b64 s[30:31]
202 %val0 = load <16 x i8>, ptr addrspace(1) %arg0
203 %val1 = shufflevector <16 x i8> %val0, <16 x i8> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Rebroadcast of element 1 of a <32 x i8> vector loaded from the addrspace(1)
; pointer %arg0. Every index in the shuffle mask is 1, so all thirty-two
; result lanes select source element 1.
; The assertions below expect, for each of gfx900, gfx1010 and gfx1100, only
; a single dword load (not the whole 32-byte vector), a right shift of v0 by
; 8, and copies of v0 into v1..v31.
; The gfx1100 sequence additionally expects an s_delay_alu before the copies.
207 define <32 x i8> @shuffle_v32i8_rebroadcast(ptr addrspace(1) %arg0) {
208 ; GFX9-LABEL: shuffle_v32i8_rebroadcast:
209 ; GFX9: ; %bb.0: ; %entry
210 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
211 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
212 ; GFX9-NEXT: s_waitcnt vmcnt(0)
213 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
214 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
215 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
216 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
217 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
218 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
219 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
220 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
221 ; GFX9-NEXT: v_mov_b32_e32 v8, v0
222 ; GFX9-NEXT: v_mov_b32_e32 v9, v0
223 ; GFX9-NEXT: v_mov_b32_e32 v10, v0
224 ; GFX9-NEXT: v_mov_b32_e32 v11, v0
225 ; GFX9-NEXT: v_mov_b32_e32 v12, v0
226 ; GFX9-NEXT: v_mov_b32_e32 v13, v0
227 ; GFX9-NEXT: v_mov_b32_e32 v14, v0
228 ; GFX9-NEXT: v_mov_b32_e32 v15, v0
229 ; GFX9-NEXT: v_mov_b32_e32 v16, v0
230 ; GFX9-NEXT: v_mov_b32_e32 v17, v0
231 ; GFX9-NEXT: v_mov_b32_e32 v18, v0
232 ; GFX9-NEXT: v_mov_b32_e32 v19, v0
233 ; GFX9-NEXT: v_mov_b32_e32 v20, v0
234 ; GFX9-NEXT: v_mov_b32_e32 v21, v0
235 ; GFX9-NEXT: v_mov_b32_e32 v22, v0
236 ; GFX9-NEXT: v_mov_b32_e32 v23, v0
237 ; GFX9-NEXT: v_mov_b32_e32 v24, v0
238 ; GFX9-NEXT: v_mov_b32_e32 v25, v0
239 ; GFX9-NEXT: v_mov_b32_e32 v26, v0
240 ; GFX9-NEXT: v_mov_b32_e32 v27, v0
241 ; GFX9-NEXT: v_mov_b32_e32 v28, v0
242 ; GFX9-NEXT: v_mov_b32_e32 v29, v0
243 ; GFX9-NEXT: v_mov_b32_e32 v30, v0
244 ; GFX9-NEXT: v_mov_b32_e32 v31, v0
245 ; GFX9-NEXT: s_setpc_b64 s[30:31]
247 ; GFX10-LABEL: shuffle_v32i8_rebroadcast:
248 ; GFX10: ; %bb.0: ; %entry
249 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
251 ; GFX10-NEXT: s_waitcnt vmcnt(0)
252 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
253 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
254 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
255 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
256 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
257 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
258 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
259 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
260 ; GFX10-NEXT: v_mov_b32_e32 v8, v0
261 ; GFX10-NEXT: v_mov_b32_e32 v9, v0
262 ; GFX10-NEXT: v_mov_b32_e32 v10, v0
263 ; GFX10-NEXT: v_mov_b32_e32 v11, v0
264 ; GFX10-NEXT: v_mov_b32_e32 v12, v0
265 ; GFX10-NEXT: v_mov_b32_e32 v13, v0
266 ; GFX10-NEXT: v_mov_b32_e32 v14, v0
267 ; GFX10-NEXT: v_mov_b32_e32 v15, v0
268 ; GFX10-NEXT: v_mov_b32_e32 v16, v0
269 ; GFX10-NEXT: v_mov_b32_e32 v17, v0
270 ; GFX10-NEXT: v_mov_b32_e32 v18, v0
271 ; GFX10-NEXT: v_mov_b32_e32 v19, v0
272 ; GFX10-NEXT: v_mov_b32_e32 v20, v0
273 ; GFX10-NEXT: v_mov_b32_e32 v21, v0
274 ; GFX10-NEXT: v_mov_b32_e32 v22, v0
275 ; GFX10-NEXT: v_mov_b32_e32 v23, v0
276 ; GFX10-NEXT: v_mov_b32_e32 v24, v0
277 ; GFX10-NEXT: v_mov_b32_e32 v25, v0
278 ; GFX10-NEXT: v_mov_b32_e32 v26, v0
279 ; GFX10-NEXT: v_mov_b32_e32 v27, v0
280 ; GFX10-NEXT: v_mov_b32_e32 v28, v0
281 ; GFX10-NEXT: v_mov_b32_e32 v29, v0
282 ; GFX10-NEXT: v_mov_b32_e32 v30, v0
283 ; GFX10-NEXT: v_mov_b32_e32 v31, v0
284 ; GFX10-NEXT: s_setpc_b64 s[30:31]
286 ; GFX11-LABEL: shuffle_v32i8_rebroadcast:
287 ; GFX11: ; %bb.0: ; %entry
288 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
290 ; GFX11-NEXT: s_waitcnt vmcnt(0)
291 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
292 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
293 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
294 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
295 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
296 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
297 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
298 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
299 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
300 ; GFX11-NEXT: v_mov_b32_e32 v8, v0
301 ; GFX11-NEXT: v_mov_b32_e32 v9, v0
302 ; GFX11-NEXT: v_mov_b32_e32 v10, v0
303 ; GFX11-NEXT: v_mov_b32_e32 v11, v0
304 ; GFX11-NEXT: v_mov_b32_e32 v12, v0
305 ; GFX11-NEXT: v_mov_b32_e32 v13, v0
306 ; GFX11-NEXT: v_mov_b32_e32 v14, v0
307 ; GFX11-NEXT: v_mov_b32_e32 v15, v0
308 ; GFX11-NEXT: v_mov_b32_e32 v16, v0
309 ; GFX11-NEXT: v_mov_b32_e32 v17, v0
310 ; GFX11-NEXT: v_mov_b32_e32 v18, v0
311 ; GFX11-NEXT: v_mov_b32_e32 v19, v0
312 ; GFX11-NEXT: v_mov_b32_e32 v20, v0
313 ; GFX11-NEXT: v_mov_b32_e32 v21, v0
314 ; GFX11-NEXT: v_mov_b32_e32 v22, v0
315 ; GFX11-NEXT: v_mov_b32_e32 v23, v0
316 ; GFX11-NEXT: v_mov_b32_e32 v24, v0
317 ; GFX11-NEXT: v_mov_b32_e32 v25, v0
318 ; GFX11-NEXT: v_mov_b32_e32 v26, v0
319 ; GFX11-NEXT: v_mov_b32_e32 v27, v0
320 ; GFX11-NEXT: v_mov_b32_e32 v28, v0
321 ; GFX11-NEXT: v_mov_b32_e32 v29, v0
322 ; GFX11-NEXT: v_mov_b32_e32 v30, v0
323 ; GFX11-NEXT: v_mov_b32_e32 v31, v0
324 ; GFX11-NEXT: s_setpc_b64 s[30:31]
326 %val0 = load <32 x i8>, ptr addrspace(1) %arg0
327 %val1 = shufflevector <32 x i8> %val0, <32 x i8> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Rebroadcast of element 1 of a <2 x i16> vector loaded from the addrspace(1)
; pointer %arg0. The shuffle mask is <1, 1>, so both result lanes select
; source element 1.
; The assertions below expect one dword load followed by a v_perm_b32 of v0
; with itself using selector 0x7060302. On gfx900 the selector is first moved
; into s4; on gfx1010 and gfx1100 it appears as a literal operand.
; No register copies are expected because the result occupies v0 alone.
331 define <2 x i16> @shuffle_v2i16_rebroadcast(ptr addrspace(1) %arg0) {
332 ; GFX9-LABEL: shuffle_v2i16_rebroadcast:
333 ; GFX9: ; %bb.0: ; %entry
334 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
336 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
337 ; GFX9-NEXT: s_waitcnt vmcnt(0)
338 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
339 ; GFX9-NEXT: s_setpc_b64 s[30:31]
341 ; GFX10-LABEL: shuffle_v2i16_rebroadcast:
342 ; GFX10: ; %bb.0: ; %entry
343 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
345 ; GFX10-NEXT: s_waitcnt vmcnt(0)
346 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
347 ; GFX10-NEXT: s_setpc_b64 s[30:31]
349 ; GFX11-LABEL: shuffle_v2i16_rebroadcast:
350 ; GFX11: ; %bb.0: ; %entry
351 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
352 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
353 ; GFX11-NEXT: s_waitcnt vmcnt(0)
354 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
355 ; GFX11-NEXT: s_setpc_b64 s[30:31]
357 %val0 = load <2 x i16>, ptr addrspace(1) %arg0
358 %val1 = shufflevector <2 x i16> %val0, <2 x i16> poison, <2 x i32> <i32 1, i32 1>
; Rebroadcast of element 1 of a <4 x i16> vector loaded from the addrspace(1)
; pointer %arg0. Every index in the shuffle mask is 1, so all four result
; lanes select source element 1.
; The assertions below expect only a single dword load (not the whole 8-byte
; vector), a v_perm_b32 of v0 with itself using selector 0x7060302, and a
; copy of v0 into v1. On gfx900 the selector is first moved into s4; on
; gfx1010 and gfx1100 it appears as a literal operand.
; The gfx1100 sequence additionally expects an s_delay_alu before the copy.
362 define <4 x i16> @shuffle_v4i16_rebroadcast(ptr addrspace(1) %arg0) {
363 ; GFX9-LABEL: shuffle_v4i16_rebroadcast:
364 ; GFX9: ; %bb.0: ; %entry
365 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
366 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
367 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
368 ; GFX9-NEXT: s_waitcnt vmcnt(0)
369 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
370 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
371 ; GFX9-NEXT: s_setpc_b64 s[30:31]
373 ; GFX10-LABEL: shuffle_v4i16_rebroadcast:
374 ; GFX10: ; %bb.0: ; %entry
375 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
377 ; GFX10-NEXT: s_waitcnt vmcnt(0)
378 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
379 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
380 ; GFX10-NEXT: s_setpc_b64 s[30:31]
382 ; GFX11-LABEL: shuffle_v4i16_rebroadcast:
383 ; GFX11: ; %bb.0: ; %entry
384 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
385 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
386 ; GFX11-NEXT: s_waitcnt vmcnt(0)
387 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
388 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
389 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
390 ; GFX11-NEXT: s_setpc_b64 s[30:31]
392 %val0 = load <4 x i16>, ptr addrspace(1) %arg0
393 %val1 = shufflevector <4 x i16> %val0, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; Rebroadcast of element 1 of a <8 x i16> vector loaded from the addrspace(1)
; pointer %arg0. Every index in the shuffle mask is 1, so all eight result
; lanes select source element 1.
; The assertions below expect only a single dword load (not the whole 16-byte
; vector), a v_perm_b32 of v0 with itself using selector 0x7060302, and
; copies of v0 into v1..v3. On gfx900 the selector is first moved into s4; on
; gfx1010 and gfx1100 it appears as a literal operand.
; The gfx1100 sequence additionally expects an s_delay_alu before the copies.
397 define <8 x i16> @shuffle_v8i16_rebroadcast(ptr addrspace(1) %arg0) {
398 ; GFX9-LABEL: shuffle_v8i16_rebroadcast:
399 ; GFX9: ; %bb.0: ; %entry
400 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
401 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
402 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
403 ; GFX9-NEXT: s_waitcnt vmcnt(0)
404 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
405 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
406 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
407 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
408 ; GFX9-NEXT: s_setpc_b64 s[30:31]
410 ; GFX10-LABEL: shuffle_v8i16_rebroadcast:
411 ; GFX10: ; %bb.0: ; %entry
412 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
413 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
414 ; GFX10-NEXT: s_waitcnt vmcnt(0)
415 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
416 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
417 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
418 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
419 ; GFX10-NEXT: s_setpc_b64 s[30:31]
421 ; GFX11-LABEL: shuffle_v8i16_rebroadcast:
422 ; GFX11: ; %bb.0: ; %entry
423 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
424 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
425 ; GFX11-NEXT: s_waitcnt vmcnt(0)
426 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
427 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
428 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
429 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
430 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
431 ; GFX11-NEXT: s_setpc_b64 s[30:31]
433 %val0 = load <8 x i16>, ptr addrspace(1) %arg0
434 %val1 = shufflevector <8 x i16> %val0, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Rebroadcast of element 1 of a <16 x i16> vector loaded from the
; addrspace(1) pointer %arg0. Every index in the shuffle mask is 1, so all
; sixteen result lanes select source element 1.
; The assertions below expect only a single dword load (not the whole 32-byte
; vector), a v_perm_b32 of v0 with itself using selector 0x7060302, and
; copies of v0 into v1..v7. On gfx900 the selector is first moved into s4; on
; gfx1010 and gfx1100 it appears as a literal operand.
; The gfx1100 sequence additionally expects an s_delay_alu before the copies.
438 define <16 x i16> @shuffle_v16i16_rebroadcast(ptr addrspace(1) %arg0) {
439 ; GFX9-LABEL: shuffle_v16i16_rebroadcast:
440 ; GFX9: ; %bb.0: ; %entry
441 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
443 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
444 ; GFX9-NEXT: s_waitcnt vmcnt(0)
445 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
446 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
447 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
448 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
449 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
450 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
451 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
452 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
453 ; GFX9-NEXT: s_setpc_b64 s[30:31]
455 ; GFX10-LABEL: shuffle_v16i16_rebroadcast:
456 ; GFX10: ; %bb.0: ; %entry
457 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
459 ; GFX10-NEXT: s_waitcnt vmcnt(0)
460 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
461 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
462 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
463 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
464 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
465 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
466 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
467 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
468 ; GFX10-NEXT: s_setpc_b64 s[30:31]
470 ; GFX11-LABEL: shuffle_v16i16_rebroadcast:
471 ; GFX11: ; %bb.0: ; %entry
472 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
474 ; GFX11-NEXT: s_waitcnt vmcnt(0)
475 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
476 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
477 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
478 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
479 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
480 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
481 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
482 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
483 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
484 ; GFX11-NEXT: s_setpc_b64 s[30:31]
486 %val0 = load <16 x i16>, ptr addrspace(1) %arg0
487 %val1 = shufflevector <16 x i16> %val0, <16 x i16> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Rebroadcast of element 1 of a <32 x i16> vector loaded from the
; addrspace(1) pointer %arg0. Every index in the shuffle mask is 1, so all
; thirty-two result lanes select source element 1.
; The assertions below expect only a single dword load (not the whole 64-byte
; vector), a v_perm_b32 of v0 with itself using selector 0x7060302, and
; copies of v0 into v1..v15. On gfx900 the selector is first moved into s4;
; on gfx1010 and gfx1100 it appears as a literal operand.
; The gfx1100 sequence additionally expects an s_delay_alu before the copies.
491 define <32 x i16> @shuffle_v32i16_rebroadcast(ptr addrspace(1) %arg0) {
492 ; GFX9-LABEL: shuffle_v32i16_rebroadcast:
493 ; GFX9: ; %bb.0: ; %entry
494 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
495 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
496 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
497 ; GFX9-NEXT: s_waitcnt vmcnt(0)
498 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
499 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
500 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
501 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
502 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
503 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
504 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
505 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
506 ; GFX9-NEXT: v_mov_b32_e32 v8, v0
507 ; GFX9-NEXT: v_mov_b32_e32 v9, v0
508 ; GFX9-NEXT: v_mov_b32_e32 v10, v0
509 ; GFX9-NEXT: v_mov_b32_e32 v11, v0
510 ; GFX9-NEXT: v_mov_b32_e32 v12, v0
511 ; GFX9-NEXT: v_mov_b32_e32 v13, v0
512 ; GFX9-NEXT: v_mov_b32_e32 v14, v0
513 ; GFX9-NEXT: v_mov_b32_e32 v15, v0
514 ; GFX9-NEXT: s_setpc_b64 s[30:31]
516 ; GFX10-LABEL: shuffle_v32i16_rebroadcast:
517 ; GFX10: ; %bb.0: ; %entry
518 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
519 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
520 ; GFX10-NEXT: s_waitcnt vmcnt(0)
521 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
522 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
523 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
524 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
525 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
526 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
527 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
528 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
529 ; GFX10-NEXT: v_mov_b32_e32 v8, v0
530 ; GFX10-NEXT: v_mov_b32_e32 v9, v0
531 ; GFX10-NEXT: v_mov_b32_e32 v10, v0
532 ; GFX10-NEXT: v_mov_b32_e32 v11, v0
533 ; GFX10-NEXT: v_mov_b32_e32 v12, v0
534 ; GFX10-NEXT: v_mov_b32_e32 v13, v0
535 ; GFX10-NEXT: v_mov_b32_e32 v14, v0
536 ; GFX10-NEXT: v_mov_b32_e32 v15, v0
537 ; GFX10-NEXT: s_setpc_b64 s[30:31]
539 ; GFX11-LABEL: shuffle_v32i16_rebroadcast:
540 ; GFX11: ; %bb.0: ; %entry
541 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
542 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
543 ; GFX11-NEXT: s_waitcnt vmcnt(0)
544 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
545 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
546 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
547 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
548 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
549 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
550 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
551 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
552 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
553 ; GFX11-NEXT: v_mov_b32_e32 v8, v0
554 ; GFX11-NEXT: v_mov_b32_e32 v9, v0
555 ; GFX11-NEXT: v_mov_b32_e32 v10, v0
556 ; GFX11-NEXT: v_mov_b32_e32 v11, v0
557 ; GFX11-NEXT: v_mov_b32_e32 v12, v0
558 ; GFX11-NEXT: v_mov_b32_e32 v13, v0
559 ; GFX11-NEXT: v_mov_b32_e32 v14, v0
560 ; GFX11-NEXT: v_mov_b32_e32 v15, v0
561 ; GFX11-NEXT: s_setpc_b64 s[30:31]
563 %val0 = load <32 x i16>, ptr addrspace(1) %arg0
564 %val1 = shufflevector <32 x i16> %val0, <32 x i16> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Rebroadcast of element 1 of a <2 x i32> vector loaded from the addrspace(1)
; pointer %arg0. The shuffle mask is <1, 1>, so both result lanes select
; source element 1.
; The assertions below expect, for each of gfx900, gfx1010 and gfx1100, a
; single dword load at byte offset 4 (only element 1 is read) followed by a
; copy of v0 into v1. No shift or permute is expected.
568 define <2 x i32> @shuffle_v2i32_rebroadcast(ptr addrspace(1) %arg0) {
569 ; GFX9-LABEL: shuffle_v2i32_rebroadcast:
570 ; GFX9: ; %bb.0: ; %entry
571 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
572 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
573 ; GFX9-NEXT: s_waitcnt vmcnt(0)
574 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
575 ; GFX9-NEXT: s_setpc_b64 s[30:31]
577 ; GFX10-LABEL: shuffle_v2i32_rebroadcast:
578 ; GFX10: ; %bb.0: ; %entry
579 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
580 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
581 ; GFX10-NEXT: s_waitcnt vmcnt(0)
582 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
583 ; GFX10-NEXT: s_setpc_b64 s[30:31]
585 ; GFX11-LABEL: shuffle_v2i32_rebroadcast:
586 ; GFX11: ; %bb.0: ; %entry
587 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
588 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
589 ; GFX11-NEXT: s_waitcnt vmcnt(0)
590 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
591 ; GFX11-NEXT: s_setpc_b64 s[30:31]
593 %val0 = load <2 x i32>, ptr addrspace(1) %arg0
594 %val1 = shufflevector <2 x i32> %val0, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
; Rebroadcast of element 1 of a <4 x i32> vector loaded from the addrspace(1)
; pointer %arg0. Every index in the shuffle mask is 1, so all four result
; lanes select source element 1.
; The assertions below expect, for each of gfx900, gfx1010 and gfx1100, a
; single dword load at byte offset 4 (only element 1 is read) followed by
; copies of v0 into v1..v3. No shift or permute is expected.
598 define <4 x i32> @shuffle_v4i32_rebroadcast(ptr addrspace(1) %arg0) {
599 ; GFX9-LABEL: shuffle_v4i32_rebroadcast:
600 ; GFX9: ; %bb.0: ; %entry
601 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
602 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
603 ; GFX9-NEXT: s_waitcnt vmcnt(0)
604 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
605 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
606 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
607 ; GFX9-NEXT: s_setpc_b64 s[30:31]
609 ; GFX10-LABEL: shuffle_v4i32_rebroadcast:
610 ; GFX10: ; %bb.0: ; %entry
611 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
612 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
613 ; GFX10-NEXT: s_waitcnt vmcnt(0)
614 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
615 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
616 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
617 ; GFX10-NEXT: s_setpc_b64 s[30:31]
619 ; GFX11-LABEL: shuffle_v4i32_rebroadcast:
620 ; GFX11: ; %bb.0: ; %entry
621 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
622 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
623 ; GFX11-NEXT: s_waitcnt vmcnt(0)
624 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
625 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
626 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
627 ; GFX11-NEXT: s_setpc_b64 s[30:31]
629 %val0 = load <4 x i32>, ptr addrspace(1) %arg0
630 %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; Rebroadcast of element 1 of a <8 x i32> vector loaded from the addrspace(1)
; pointer %arg0. Every index in the shuffle mask is 1, so all eight result
; lanes select source element 1.
; The assertions below expect, for each of gfx900, gfx1010 and gfx1100, a
; single dword load at byte offset 4 (only element 1 is read) followed by
; copies of v0 into v1..v7. No shift or permute is expected.
634 define <8 x i32> @shuffle_v8i32_rebroadcast(ptr addrspace(1) %arg0) {
635 ; GFX9-LABEL: shuffle_v8i32_rebroadcast:
636 ; GFX9: ; %bb.0: ; %entry
637 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
639 ; GFX9-NEXT: s_waitcnt vmcnt(0)
640 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
641 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
642 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
643 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
644 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
645 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
646 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
647 ; GFX9-NEXT: s_setpc_b64 s[30:31]
649 ; GFX10-LABEL: shuffle_v8i32_rebroadcast:
650 ; GFX10: ; %bb.0: ; %entry
651 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
653 ; GFX10-NEXT: s_waitcnt vmcnt(0)
654 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
655 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
656 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
657 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
658 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
659 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
660 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
661 ; GFX10-NEXT: s_setpc_b64 s[30:31]
663 ; GFX11-LABEL: shuffle_v8i32_rebroadcast:
664 ; GFX11: ; %bb.0: ; %entry
665 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
666 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
667 ; GFX11-NEXT: s_waitcnt vmcnt(0)
668 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
669 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
670 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
671 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
672 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
673 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
674 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
675 ; GFX11-NEXT: s_setpc_b64 s[30:31]
677 %val0 = load <8 x i32>, ptr addrspace(1) %arg0
678 %val1 = shufflevector <8 x i32> %val0, <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Rebroadcast of element 1 of a <16 x i32> vector loaded from the
; addrspace(1) pointer %arg0. Every index in the shuffle mask is 1, so all
; sixteen result lanes select source element 1.
; The assertions below expect, for each of gfx900, gfx1010 and gfx1100, a
; single dword load at byte offset 4 (only element 1 is read) followed by
; copies of v0 into v1..v15. No shift or permute is expected.
682 define <16 x i32> @shuffle_v16i32_rebroadcast(ptr addrspace(1) %arg0) {
683 ; GFX9-LABEL: shuffle_v16i32_rebroadcast:
684 ; GFX9: ; %bb.0: ; %entry
685 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
686 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
687 ; GFX9-NEXT: s_waitcnt vmcnt(0)
688 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
689 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
690 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
691 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
692 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
693 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
694 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
695 ; GFX9-NEXT: v_mov_b32_e32 v8, v0
696 ; GFX9-NEXT: v_mov_b32_e32 v9, v0
697 ; GFX9-NEXT: v_mov_b32_e32 v10, v0
698 ; GFX9-NEXT: v_mov_b32_e32 v11, v0
699 ; GFX9-NEXT: v_mov_b32_e32 v12, v0
700 ; GFX9-NEXT: v_mov_b32_e32 v13, v0
701 ; GFX9-NEXT: v_mov_b32_e32 v14, v0
702 ; GFX9-NEXT: v_mov_b32_e32 v15, v0
703 ; GFX9-NEXT: s_setpc_b64 s[30:31]
705 ; GFX10-LABEL: shuffle_v16i32_rebroadcast:
706 ; GFX10: ; %bb.0: ; %entry
707 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
708 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
709 ; GFX10-NEXT: s_waitcnt vmcnt(0)
710 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
711 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
712 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
713 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
714 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
715 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
716 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
717 ; GFX10-NEXT: v_mov_b32_e32 v8, v0
718 ; GFX10-NEXT: v_mov_b32_e32 v9, v0
719 ; GFX10-NEXT: v_mov_b32_e32 v10, v0
720 ; GFX10-NEXT: v_mov_b32_e32 v11, v0
721 ; GFX10-NEXT: v_mov_b32_e32 v12, v0
722 ; GFX10-NEXT: v_mov_b32_e32 v13, v0
723 ; GFX10-NEXT: v_mov_b32_e32 v14, v0
724 ; GFX10-NEXT: v_mov_b32_e32 v15, v0
725 ; GFX10-NEXT: s_setpc_b64 s[30:31]
727 ; GFX11-LABEL: shuffle_v16i32_rebroadcast:
728 ; GFX11: ; %bb.0: ; %entry
729 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
731 ; GFX11-NEXT: s_waitcnt vmcnt(0)
732 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
733 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
734 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
735 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
736 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
737 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
738 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
739 ; GFX11-NEXT: v_mov_b32_e32 v8, v0
740 ; GFX11-NEXT: v_mov_b32_e32 v9, v0
741 ; GFX11-NEXT: v_mov_b32_e32 v10, v0
742 ; GFX11-NEXT: v_mov_b32_e32 v11, v0
743 ; GFX11-NEXT: v_mov_b32_e32 v12, v0
744 ; GFX11-NEXT: v_mov_b32_e32 v13, v0
745 ; GFX11-NEXT: v_mov_b32_e32 v14, v0
746 ; GFX11-NEXT: v_mov_b32_e32 v15, v0
747 ; GFX11-NEXT: s_setpc_b64 s[30:31]
749 %val0 = load <16 x i32>, ptr addrspace(1) %arg0
750 %val1 = shufflevector <16 x i32> %val0, <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
754 define <32 x i32> @shuffle_v32i32_rebroadcast(ptr addrspace(1) %arg0) {
755 ; GFX9-LABEL: shuffle_v32i32_rebroadcast:
756 ; GFX9: ; %bb.0: ; %entry
757 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
758 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
759 ; GFX9-NEXT: s_waitcnt vmcnt(0)
760 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
761 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
762 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
763 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
764 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
765 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
766 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
767 ; GFX9-NEXT: v_mov_b32_e32 v8, v0
768 ; GFX9-NEXT: v_mov_b32_e32 v9, v0
769 ; GFX9-NEXT: v_mov_b32_e32 v10, v0
770 ; GFX9-NEXT: v_mov_b32_e32 v11, v0
771 ; GFX9-NEXT: v_mov_b32_e32 v12, v0
772 ; GFX9-NEXT: v_mov_b32_e32 v13, v0
773 ; GFX9-NEXT: v_mov_b32_e32 v14, v0
774 ; GFX9-NEXT: v_mov_b32_e32 v15, v0
775 ; GFX9-NEXT: v_mov_b32_e32 v16, v0
776 ; GFX9-NEXT: v_mov_b32_e32 v17, v0
777 ; GFX9-NEXT: v_mov_b32_e32 v18, v0
778 ; GFX9-NEXT: v_mov_b32_e32 v19, v0
779 ; GFX9-NEXT: v_mov_b32_e32 v20, v0
780 ; GFX9-NEXT: v_mov_b32_e32 v21, v0
781 ; GFX9-NEXT: v_mov_b32_e32 v22, v0
782 ; GFX9-NEXT: v_mov_b32_e32 v23, v0
783 ; GFX9-NEXT: v_mov_b32_e32 v24, v0
784 ; GFX9-NEXT: v_mov_b32_e32 v25, v0
785 ; GFX9-NEXT: v_mov_b32_e32 v26, v0
786 ; GFX9-NEXT: v_mov_b32_e32 v27, v0
787 ; GFX9-NEXT: v_mov_b32_e32 v28, v0
788 ; GFX9-NEXT: v_mov_b32_e32 v29, v0
789 ; GFX9-NEXT: v_mov_b32_e32 v30, v0
790 ; GFX9-NEXT: v_mov_b32_e32 v31, v0
791 ; GFX9-NEXT: s_setpc_b64 s[30:31]
793 ; GFX10-LABEL: shuffle_v32i32_rebroadcast:
794 ; GFX10: ; %bb.0: ; %entry
795 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
796 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
797 ; GFX10-NEXT: s_waitcnt vmcnt(0)
798 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
799 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
800 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
801 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
802 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
803 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
804 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
805 ; GFX10-NEXT: v_mov_b32_e32 v8, v0
806 ; GFX10-NEXT: v_mov_b32_e32 v9, v0
807 ; GFX10-NEXT: v_mov_b32_e32 v10, v0
808 ; GFX10-NEXT: v_mov_b32_e32 v11, v0
809 ; GFX10-NEXT: v_mov_b32_e32 v12, v0
810 ; GFX10-NEXT: v_mov_b32_e32 v13, v0
811 ; GFX10-NEXT: v_mov_b32_e32 v14, v0
812 ; GFX10-NEXT: v_mov_b32_e32 v15, v0
813 ; GFX10-NEXT: v_mov_b32_e32 v16, v0
814 ; GFX10-NEXT: v_mov_b32_e32 v17, v0
815 ; GFX10-NEXT: v_mov_b32_e32 v18, v0
816 ; GFX10-NEXT: v_mov_b32_e32 v19, v0
817 ; GFX10-NEXT: v_mov_b32_e32 v20, v0
818 ; GFX10-NEXT: v_mov_b32_e32 v21, v0
819 ; GFX10-NEXT: v_mov_b32_e32 v22, v0
820 ; GFX10-NEXT: v_mov_b32_e32 v23, v0
821 ; GFX10-NEXT: v_mov_b32_e32 v24, v0
822 ; GFX10-NEXT: v_mov_b32_e32 v25, v0
823 ; GFX10-NEXT: v_mov_b32_e32 v26, v0
824 ; GFX10-NEXT: v_mov_b32_e32 v27, v0
825 ; GFX10-NEXT: v_mov_b32_e32 v28, v0
826 ; GFX10-NEXT: v_mov_b32_e32 v29, v0
827 ; GFX10-NEXT: v_mov_b32_e32 v30, v0
828 ; GFX10-NEXT: v_mov_b32_e32 v31, v0
829 ; GFX10-NEXT: s_setpc_b64 s[30:31]
831 ; GFX11-LABEL: shuffle_v32i32_rebroadcast:
832 ; GFX11: ; %bb.0: ; %entry
833 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
834 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
835 ; GFX11-NEXT: s_waitcnt vmcnt(0)
836 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
837 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
838 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
839 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
840 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
841 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
842 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
843 ; GFX11-NEXT: v_mov_b32_e32 v8, v0
844 ; GFX11-NEXT: v_mov_b32_e32 v9, v0
845 ; GFX11-NEXT: v_mov_b32_e32 v10, v0
846 ; GFX11-NEXT: v_mov_b32_e32 v11, v0
847 ; GFX11-NEXT: v_mov_b32_e32 v12, v0
848 ; GFX11-NEXT: v_mov_b32_e32 v13, v0
849 ; GFX11-NEXT: v_mov_b32_e32 v14, v0
850 ; GFX11-NEXT: v_mov_b32_e32 v15, v0
851 ; GFX11-NEXT: v_mov_b32_e32 v16, v0
852 ; GFX11-NEXT: v_mov_b32_e32 v17, v0
853 ; GFX11-NEXT: v_mov_b32_e32 v18, v0
854 ; GFX11-NEXT: v_mov_b32_e32 v19, v0
855 ; GFX11-NEXT: v_mov_b32_e32 v20, v0
856 ; GFX11-NEXT: v_mov_b32_e32 v21, v0
857 ; GFX11-NEXT: v_mov_b32_e32 v22, v0
858 ; GFX11-NEXT: v_mov_b32_e32 v23, v0
859 ; GFX11-NEXT: v_mov_b32_e32 v24, v0
860 ; GFX11-NEXT: v_mov_b32_e32 v25, v0
861 ; GFX11-NEXT: v_mov_b32_e32 v26, v0
862 ; GFX11-NEXT: v_mov_b32_e32 v27, v0
863 ; GFX11-NEXT: v_mov_b32_e32 v28, v0
864 ; GFX11-NEXT: v_mov_b32_e32 v29, v0
865 ; GFX11-NEXT: v_mov_b32_e32 v30, v0
866 ; GFX11-NEXT: v_mov_b32_e32 v31, v0
867 ; GFX11-NEXT: s_setpc_b64 s[30:31]
869 %val0 = load <32 x i32>, ptr addrspace(1) %arg0
870 %val1 = shufflevector <32 x i32> %val0, <32 x i32> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Loads a <2 x bfloat> through the addrspace(1) pointer %arg0 and returns a
; vector in which both lanes are element 1 of the loaded value (every shuffle
; mask index is 1).
; The checks below expect one 32-bit global load followed by a single
; v_perm_b32 with selector 0x7060302 (held in s4 on gfx900, inline literal on
; gfx1010 and gfx1100) applied to the loaded dword as both sources.
874 define <2 x bfloat> @shuffle_v2bf16_rebroadcast(ptr addrspace(1) %arg0) {
875 ; GFX9-LABEL: shuffle_v2bf16_rebroadcast:
876 ; GFX9: ; %bb.0: ; %entry
877 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
878 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
879 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
880 ; GFX9-NEXT: s_waitcnt vmcnt(0)
881 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
882 ; GFX9-NEXT: s_setpc_b64 s[30:31]
884 ; GFX10-LABEL: shuffle_v2bf16_rebroadcast:
885 ; GFX10: ; %bb.0: ; %entry
886 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
887 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
888 ; GFX10-NEXT: s_waitcnt vmcnt(0)
889 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
890 ; GFX10-NEXT: s_setpc_b64 s[30:31]
892 ; GFX11-LABEL: shuffle_v2bf16_rebroadcast:
893 ; GFX11: ; %bb.0: ; %entry
894 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
895 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
896 ; GFX11-NEXT: s_waitcnt vmcnt(0)
897 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
898 ; GFX11-NEXT: s_setpc_b64 s[30:31]
900 %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
901 %val1 = shufflevector <2 x bfloat> %val0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
902 ret <2 x bfloat> %val1
; Loads a <3 x bfloat> through the addrspace(1) pointer %arg0 and returns a
; vector in which all three lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect one 32-bit global load into v1, a v_perm_b32 with
; selector 0x7060302 producing v0, and a v_alignbit_b32 by 16 producing v1.
; NOTE(review): on the gfx1010 and gfx1100 check lines the scalar source of
; v_alignbit_b32 (s4 / s0) is not written anywhere in the checked sequence;
; presumably it only feeds the unused upper half of v1 -- confirm.
905 define <3 x bfloat> @shuffle_v3bf16_rebroadcast(ptr addrspace(1) %arg0) {
906 ; GFX9-LABEL: shuffle_v3bf16_rebroadcast:
907 ; GFX9: ; %bb.0: ; %entry
908 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
909 ; GFX9-NEXT: global_load_dword v1, v[0:1], off
910 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
911 ; GFX9-NEXT: s_waitcnt vmcnt(0)
912 ; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4
913 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
914 ; GFX9-NEXT: s_setpc_b64 s[30:31]
916 ; GFX10-LABEL: shuffle_v3bf16_rebroadcast:
917 ; GFX10: ; %bb.0: ; %entry
918 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
919 ; GFX10-NEXT: global_load_dword v1, v[0:1], off
920 ; GFX10-NEXT: s_waitcnt vmcnt(0)
921 ; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302
922 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
923 ; GFX10-NEXT: s_setpc_b64 s[30:31]
925 ; GFX11-LABEL: shuffle_v3bf16_rebroadcast:
926 ; GFX11: ; %bb.0: ; %entry
927 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
928 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off
929 ; GFX11-NEXT: s_waitcnt vmcnt(0)
930 ; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302
931 ; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16
932 ; GFX11-NEXT: s_setpc_b64 s[30:31]
934 %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
935 %val1 = shufflevector <3 x bfloat> %val0, <3 x bfloat> poison, <3 x i32> <i32 1, i32 1, i32 1>
936 ret <3 x bfloat> %val1
; Loads a <4 x bfloat> through the addrspace(1) pointer %arg0 and returns a
; vector in which all four lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect only one 32-bit global load, a v_perm_b32 with
; selector 0x7060302 producing v0, and a copy of v0 into v1.
939 define <4 x bfloat> @shuffle_v4bf16_rebroadcast(ptr addrspace(1) %arg0) {
940 ; GFX9-LABEL: shuffle_v4bf16_rebroadcast:
941 ; GFX9: ; %bb.0: ; %entry
942 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
944 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
945 ; GFX9-NEXT: s_waitcnt vmcnt(0)
946 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
947 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
948 ; GFX9-NEXT: s_setpc_b64 s[30:31]
950 ; GFX10-LABEL: shuffle_v4bf16_rebroadcast:
951 ; GFX10: ; %bb.0: ; %entry
952 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
953 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
954 ; GFX10-NEXT: s_waitcnt vmcnt(0)
955 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
956 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
957 ; GFX10-NEXT: s_setpc_b64 s[30:31]
959 ; GFX11-LABEL: shuffle_v4bf16_rebroadcast:
960 ; GFX11: ; %bb.0: ; %entry
961 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
962 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
963 ; GFX11-NEXT: s_waitcnt vmcnt(0)
964 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
965 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
966 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
967 ; GFX11-NEXT: s_setpc_b64 s[30:31]
969 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
970 %val1 = shufflevector <4 x bfloat> %val0, <4 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
971 ret <4 x bfloat> %val1
; Loads a <6 x bfloat> through the addrspace(1) pointer %arg0 and returns a
; vector in which all six lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect only one 32-bit global load, a v_perm_b32 with
; selector 0x7060302 producing v0, and copies of v0 into v1 and v2.
974 define <6 x bfloat> @shuffle_v6bf16_rebroadcast(ptr addrspace(1) %arg0) {
975 ; GFX9-LABEL: shuffle_v6bf16_rebroadcast:
976 ; GFX9: ; %bb.0: ; %entry
977 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
978 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
979 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
980 ; GFX9-NEXT: s_waitcnt vmcnt(0)
981 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
982 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
983 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
984 ; GFX9-NEXT: s_setpc_b64 s[30:31]
986 ; GFX10-LABEL: shuffle_v6bf16_rebroadcast:
987 ; GFX10: ; %bb.0: ; %entry
988 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
989 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
990 ; GFX10-NEXT: s_waitcnt vmcnt(0)
991 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
992 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
993 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
994 ; GFX10-NEXT: s_setpc_b64 s[30:31]
996 ; GFX11-LABEL: shuffle_v6bf16_rebroadcast:
997 ; GFX11: ; %bb.0: ; %entry
998 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
999 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1000 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1001 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1002 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1003 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1004 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
1005 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1007 %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0
1008 %val1 = shufflevector <6 x bfloat> %val0, <6 x bfloat> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1009 ret <6 x bfloat> %val1
; Loads an <8 x bfloat> through the addrspace(1) pointer %arg0 and returns a
; vector in which all eight lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect only one 32-bit global load, a v_perm_b32 with
; selector 0x7060302 producing v0, and copies of v0 into v1 through v3.
1012 define <8 x bfloat> @shuffle_v8bf16_rebroadcast(ptr addrspace(1) %arg0) {
1013 ; GFX9-LABEL: shuffle_v8bf16_rebroadcast:
1014 ; GFX9: ; %bb.0: ; %entry
1015 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1016 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1017 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1018 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1019 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
1020 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1021 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
1022 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
1023 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1025 ; GFX10-LABEL: shuffle_v8bf16_rebroadcast:
1026 ; GFX10: ; %bb.0: ; %entry
1027 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1028 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1029 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1030 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1031 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1032 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
1033 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
1034 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1036 ; GFX11-LABEL: shuffle_v8bf16_rebroadcast:
1037 ; GFX11: ; %bb.0: ; %entry
1038 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1039 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1040 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1041 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1042 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1043 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1044 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
1045 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
1046 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1048 %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
1049 %val1 = shufflevector <8 x bfloat> %val0, <8 x bfloat> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1050 ret <8 x bfloat> %val1
; Loads a <16 x bfloat> through the addrspace(1) pointer %arg0 and returns a
; vector in which all sixteen lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect only one 32-bit global load, a v_perm_b32 with
; selector 0x7060302 producing v0, and copies of v0 into v1 through v7.
1053 define <16 x bfloat> @shuffle_v16bf16_rebroadcast(ptr addrspace(1) %arg0) {
1054 ; GFX9-LABEL: shuffle_v16bf16_rebroadcast:
1055 ; GFX9: ; %bb.0: ; %entry
1056 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1057 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1058 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1059 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1060 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
1061 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1062 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
1063 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
1064 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
1065 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
1066 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
1067 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
1068 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1070 ; GFX10-LABEL: shuffle_v16bf16_rebroadcast:
1071 ; GFX10: ; %bb.0: ; %entry
1072 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1073 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1074 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1075 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1076 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1077 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
1078 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
1079 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
1080 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1081 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
1082 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
1083 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1085 ; GFX11-LABEL: shuffle_v16bf16_rebroadcast:
1086 ; GFX11: ; %bb.0: ; %entry
1087 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1088 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1089 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1090 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1091 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1092 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1093 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
1094 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
1095 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
1096 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
1097 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
1098 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
1099 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1101 %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0
1102 %val1 = shufflevector <16 x bfloat> %val0, <16 x bfloat> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1103 ret <16 x bfloat> %val1
; Loads a <32 x bfloat> through the addrspace(1) pointer %arg0 and returns a
; vector in which all thirty-two lanes are element 1 of the loaded value
; (every shuffle mask index is 1).
; The checks below expect only one 32-bit global load, a v_perm_b32 with
; selector 0x7060302 producing v0, and copies of v0 into v1 through v15.
1106 define <32 x bfloat> @shuffle_v32bf16_rebroadcast(ptr addrspace(1) %arg0) {
1107 ; GFX9-LABEL: shuffle_v32bf16_rebroadcast:
1108 ; GFX9: ; %bb.0: ; %entry
1109 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1110 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1111 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1112 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1113 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
1114 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1115 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
1116 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
1117 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
1118 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
1119 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
1120 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
1121 ; GFX9-NEXT: v_mov_b32_e32 v8, v0
1122 ; GFX9-NEXT: v_mov_b32_e32 v9, v0
1123 ; GFX9-NEXT: v_mov_b32_e32 v10, v0
1124 ; GFX9-NEXT: v_mov_b32_e32 v11, v0
1125 ; GFX9-NEXT: v_mov_b32_e32 v12, v0
1126 ; GFX9-NEXT: v_mov_b32_e32 v13, v0
1127 ; GFX9-NEXT: v_mov_b32_e32 v14, v0
1128 ; GFX9-NEXT: v_mov_b32_e32 v15, v0
1129 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1131 ; GFX10-LABEL: shuffle_v32bf16_rebroadcast:
1132 ; GFX10: ; %bb.0: ; %entry
1133 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1134 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1135 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1136 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1137 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1138 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
1139 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
1140 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
1141 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1142 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
1143 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
1144 ; GFX10-NEXT: v_mov_b32_e32 v8, v0
1145 ; GFX10-NEXT: v_mov_b32_e32 v9, v0
1146 ; GFX10-NEXT: v_mov_b32_e32 v10, v0
1147 ; GFX10-NEXT: v_mov_b32_e32 v11, v0
1148 ; GFX10-NEXT: v_mov_b32_e32 v12, v0
1149 ; GFX10-NEXT: v_mov_b32_e32 v13, v0
1150 ; GFX10-NEXT: v_mov_b32_e32 v14, v0
1151 ; GFX10-NEXT: v_mov_b32_e32 v15, v0
1152 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1154 ; GFX11-LABEL: shuffle_v32bf16_rebroadcast:
1155 ; GFX11: ; %bb.0: ; %entry
1156 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1157 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1158 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1159 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1160 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1161 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1162 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
1163 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
1164 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
1165 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
1166 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
1167 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
1168 ; GFX11-NEXT: v_mov_b32_e32 v8, v0
1169 ; GFX11-NEXT: v_mov_b32_e32 v9, v0
1170 ; GFX11-NEXT: v_mov_b32_e32 v10, v0
1171 ; GFX11-NEXT: v_mov_b32_e32 v11, v0
1172 ; GFX11-NEXT: v_mov_b32_e32 v12, v0
1173 ; GFX11-NEXT: v_mov_b32_e32 v13, v0
1174 ; GFX11-NEXT: v_mov_b32_e32 v14, v0
1175 ; GFX11-NEXT: v_mov_b32_e32 v15, v0
1176 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1178 %val0 = load <32 x bfloat>, ptr addrspace(1) %arg0
1179 %val1 = shufflevector <32 x bfloat> %val0, <32 x bfloat> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1180 ret <32 x bfloat> %val1
; Loads a <2 x half> through the addrspace(1) pointer %arg0 and returns a
; vector in which both lanes are element 1 of the loaded value (every shuffle
; mask index is 1).
; The checks below expect one 32-bit global load followed by a single
; v_perm_b32 with selector 0x7060302 (held in s4 on gfx900, inline literal on
; gfx1010 and gfx1100) applied to the loaded dword as both sources.
1183 define <2 x half> @shuffle_v2f16_rebroadcast(ptr addrspace(1) %arg0) {
1184 ; GFX9-LABEL: shuffle_v2f16_rebroadcast:
1185 ; GFX9: ; %bb.0: ; %entry
1186 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1187 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1188 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1189 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1190 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
1191 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1193 ; GFX10-LABEL: shuffle_v2f16_rebroadcast:
1194 ; GFX10: ; %bb.0: ; %entry
1195 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1196 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1197 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1198 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1199 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1201 ; GFX11-LABEL: shuffle_v2f16_rebroadcast:
1202 ; GFX11: ; %bb.0: ; %entry
1203 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1204 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1205 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1206 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1207 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1209 %val0 = load <2 x half>, ptr addrspace(1) %arg0
1210 %val1 = shufflevector <2 x half> %val0, <2 x half> poison, <2 x i32> <i32 1, i32 1>
1211 ret <2 x half> %val1
; Loads a <3 x half> through the addrspace(1) pointer %arg0 and returns a
; vector in which all three lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect one 32-bit global load into v1, a v_perm_b32 with
; selector 0x7060302 producing v0, and a v_alignbit_b32 by 16 producing v1.
; NOTE(review): on the gfx1010 and gfx1100 check lines the scalar source of
; v_alignbit_b32 (s4 / s0) is not written anywhere in the checked sequence;
; presumably it only feeds the unused upper half of v1 -- confirm.
1214 define <3 x half> @shuffle_v3f16_rebroadcast(ptr addrspace(1) %arg0) {
1215 ; GFX9-LABEL: shuffle_v3f16_rebroadcast:
1216 ; GFX9: ; %bb.0: ; %entry
1217 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1218 ; GFX9-NEXT: global_load_dword v1, v[0:1], off
1219 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1220 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1221 ; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4
1222 ; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
1223 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1225 ; GFX10-LABEL: shuffle_v3f16_rebroadcast:
1226 ; GFX10: ; %bb.0: ; %entry
1227 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1228 ; GFX10-NEXT: global_load_dword v1, v[0:1], off
1229 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1230 ; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302
1231 ; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
1232 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1234 ; GFX11-LABEL: shuffle_v3f16_rebroadcast:
1235 ; GFX11: ; %bb.0: ; %entry
1236 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1237 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off
1238 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1239 ; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302
1240 ; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16
1241 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1243 %val0 = load <3 x half>, ptr addrspace(1) %arg0
1244 %val1 = shufflevector <3 x half> %val0, <3 x half> poison, <3 x i32> <i32 1, i32 1, i32 1>
1245 ret <3 x half> %val1
; Loads a <4 x half> through the addrspace(1) pointer %arg0 and returns a
; vector in which all four lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect only one 32-bit global load, a v_perm_b32 with
; selector 0x7060302 producing v0, and a copy of v0 into v1.
1248 define <4 x half> @shuffle_v4f16_rebroadcast(ptr addrspace(1) %arg0) {
1249 ; GFX9-LABEL: shuffle_v4f16_rebroadcast:
1250 ; GFX9: ; %bb.0: ; %entry
1251 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1252 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1253 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1254 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1255 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
1256 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1257 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1259 ; GFX10-LABEL: shuffle_v4f16_rebroadcast:
1260 ; GFX10: ; %bb.0: ; %entry
1261 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1262 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1263 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1264 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1265 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1266 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1268 ; GFX11-LABEL: shuffle_v4f16_rebroadcast:
1269 ; GFX11: ; %bb.0: ; %entry
1270 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1271 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1272 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1273 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1274 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1275 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1276 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1278 %val0 = load <4 x half>, ptr addrspace(1) %arg0
1279 %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1280 ret <4 x half> %val1
; Loads a <6 x half> through the addrspace(1) pointer %arg0 and returns a
; vector in which all six lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect only one 32-bit global load, a v_perm_b32 with
; selector 0x7060302 producing v0, and copies of v0 into v1 and v2.
1283 define <6 x half> @shuffle_v6f16_rebroadcast(ptr addrspace(1) %arg0) {
1284 ; GFX9-LABEL: shuffle_v6f16_rebroadcast:
1285 ; GFX9: ; %bb.0: ; %entry
1286 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1287 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1288 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1289 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1290 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
1291 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1292 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
1293 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1295 ; GFX10-LABEL: shuffle_v6f16_rebroadcast:
1296 ; GFX10: ; %bb.0: ; %entry
1297 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1298 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1299 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1300 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1301 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1302 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
1303 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1305 ; GFX11-LABEL: shuffle_v6f16_rebroadcast:
1306 ; GFX11: ; %bb.0: ; %entry
1307 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1308 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1309 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1310 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1311 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1312 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1313 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
1314 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1316 %val0 = load <6 x half>, ptr addrspace(1) %arg0
1317 %val1 = shufflevector <6 x half> %val0, <6 x half> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1318 ret <6 x half> %val1
; Loads an <8 x half> through the addrspace(1) pointer %arg0 and returns a
; vector in which all eight lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect only one 32-bit global load, a v_perm_b32 with
; selector 0x7060302 producing v0, and copies of v0 into v1 through v3.
1321 define <8 x half> @shuffle_v8f16_rebroadcast(ptr addrspace(1) %arg0) {
1322 ; GFX9-LABEL: shuffle_v8f16_rebroadcast:
1323 ; GFX9: ; %bb.0: ; %entry
1324 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1325 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1326 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1327 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1328 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
1329 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1330 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
1331 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
1332 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1334 ; GFX10-LABEL: shuffle_v8f16_rebroadcast:
1335 ; GFX10: ; %bb.0: ; %entry
1336 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1337 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1338 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1339 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1340 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1341 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
1342 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
1343 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1345 ; GFX11-LABEL: shuffle_v8f16_rebroadcast:
1346 ; GFX11: ; %bb.0: ; %entry
1347 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1348 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1349 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1350 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1351 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1352 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1353 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
1354 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
1355 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1357 %val0 = load <8 x half>, ptr addrspace(1) %arg0
1358 %val1 = shufflevector <8 x half> %val0, <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1359 ret <8 x half> %val1
; Loads a <16 x half> through the addrspace(1) pointer %arg0 and returns a
; vector in which all sixteen lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect only one 32-bit global load, a v_perm_b32 with
; selector 0x7060302 producing v0, and copies of v0 into v1 through v7.
1362 define <16 x half> @shuffle_v16f16_rebroadcast(ptr addrspace(1) %arg0) {
1363 ; GFX9-LABEL: shuffle_v16f16_rebroadcast:
1364 ; GFX9: ; %bb.0: ; %entry
1365 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1366 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1367 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1368 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1369 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
1370 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1371 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
1372 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
1373 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
1374 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
1375 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
1376 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
1377 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1379 ; GFX10-LABEL: shuffle_v16f16_rebroadcast:
1380 ; GFX10: ; %bb.0: ; %entry
1381 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1382 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1383 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1384 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1385 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1386 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
1387 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
1388 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
1389 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1390 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
1391 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
1392 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1394 ; GFX11-LABEL: shuffle_v16f16_rebroadcast:
1395 ; GFX11: ; %bb.0: ; %entry
1396 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1397 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1398 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1399 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1400 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1401 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1402 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
1403 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
1404 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
1405 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
1406 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
1407 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
1408 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1410 %val0 = load <16 x half>, ptr addrspace(1) %arg0
1411 %val1 = shufflevector <16 x half> %val0, <16 x half> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1412 ret <16 x half> %val1
; Loads a <32 x half> through the addrspace(1) pointer %arg0 and returns a
; vector in which all thirty-two lanes are element 1 of the loaded value
; (every shuffle mask index is 1).
; The checks below expect only one 32-bit global load, a v_perm_b32 with
; selector 0x7060302 producing v0, and copies of v0 into v1 through v15.
1415 define <32 x half> @shuffle_v32f16_rebroadcast(ptr addrspace(1) %arg0) {
1416 ; GFX9-LABEL: shuffle_v32f16_rebroadcast:
1417 ; GFX9: ; %bb.0: ; %entry
1418 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1419 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1420 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
1421 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1422 ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
1423 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1424 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
1425 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
1426 ; GFX9-NEXT: v_mov_b32_e32 v4, v0
1427 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
1428 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
1429 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
1430 ; GFX9-NEXT: v_mov_b32_e32 v8, v0
1431 ; GFX9-NEXT: v_mov_b32_e32 v9, v0
1432 ; GFX9-NEXT: v_mov_b32_e32 v10, v0
1433 ; GFX9-NEXT: v_mov_b32_e32 v11, v0
1434 ; GFX9-NEXT: v_mov_b32_e32 v12, v0
1435 ; GFX9-NEXT: v_mov_b32_e32 v13, v0
1436 ; GFX9-NEXT: v_mov_b32_e32 v14, v0
1437 ; GFX9-NEXT: v_mov_b32_e32 v15, v0
1438 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1440 ; GFX10-LABEL: shuffle_v32f16_rebroadcast:
1441 ; GFX10: ; %bb.0: ; %entry
1442 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1443 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1444 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1445 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1446 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1447 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
1448 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
1449 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
1450 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1451 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
1452 ; GFX10-NEXT: v_mov_b32_e32 v7, v0
1453 ; GFX10-NEXT: v_mov_b32_e32 v8, v0
1454 ; GFX10-NEXT: v_mov_b32_e32 v9, v0
1455 ; GFX10-NEXT: v_mov_b32_e32 v10, v0
1456 ; GFX10-NEXT: v_mov_b32_e32 v11, v0
1457 ; GFX10-NEXT: v_mov_b32_e32 v12, v0
1458 ; GFX10-NEXT: v_mov_b32_e32 v13, v0
1459 ; GFX10-NEXT: v_mov_b32_e32 v14, v0
1460 ; GFX10-NEXT: v_mov_b32_e32 v15, v0
1461 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1463 ; GFX11-LABEL: shuffle_v32f16_rebroadcast:
1464 ; GFX11: ; %bb.0: ; %entry
1465 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1466 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off
1467 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1468 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
1469 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1470 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1471 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
1472 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
1473 ; GFX11-NEXT: v_mov_b32_e32 v4, v0
1474 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
1475 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
1476 ; GFX11-NEXT: v_mov_b32_e32 v7, v0
1477 ; GFX11-NEXT: v_mov_b32_e32 v8, v0
1478 ; GFX11-NEXT: v_mov_b32_e32 v9, v0
1479 ; GFX11-NEXT: v_mov_b32_e32 v10, v0
1480 ; GFX11-NEXT: v_mov_b32_e32 v11, v0
1481 ; GFX11-NEXT: v_mov_b32_e32 v12, v0
1482 ; GFX11-NEXT: v_mov_b32_e32 v13, v0
1483 ; GFX11-NEXT: v_mov_b32_e32 v14, v0
1484 ; GFX11-NEXT: v_mov_b32_e32 v15, v0
1485 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1487 %val0 = load <32 x half>, ptr addrspace(1) %arg0
1488 %val1 = shufflevector <32 x half> %val0, <32 x half> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1489 ret <32 x half> %val1
; Loads a <2 x float> through the addrspace(1) pointer %arg0 and returns a
; vector in which both lanes are element 1 of the loaded value (every shuffle
; mask index is 1).
; The checks below expect a 64-bit global load into v[0:1] followed by a copy
; of v1 into v0; v1 itself is left as loaded.
1492 define <2 x float> @shuffle_v2f32_rebroadcast(ptr addrspace(1) %arg0) {
1493 ; GFX9-LABEL: shuffle_v2f32_rebroadcast:
1494 ; GFX9: ; %bb.0: ; %entry
1495 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1496 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1497 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1498 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
1499 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1501 ; GFX10-LABEL: shuffle_v2f32_rebroadcast:
1502 ; GFX10: ; %bb.0: ; %entry
1503 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1504 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1505 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1506 ; GFX10-NEXT: v_mov_b32_e32 v0, v1
1507 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1509 ; GFX11-LABEL: shuffle_v2f32_rebroadcast:
1510 ; GFX11: ; %bb.0: ; %entry
1511 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1512 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
1513 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1514 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
1515 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1517 %val0 = load <2 x float>, ptr addrspace(1) %arg0
1518 %val1 = shufflevector <2 x float> %val0, <2 x float> poison, <2 x i32> <i32 1, i32 1>
1519 ret <2 x float> %val1
; Loads a <3 x float> through the addrspace(1) pointer %arg0 and returns a
; vector in which all three lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect a 96-bit global load into v[0:2] followed by copies
; of v1 into v0 and v2; v1 itself is left as loaded.
1522 define <3 x float> @shuffle_v3f32_rebroadcast(ptr addrspace(1) %arg0) {
1523 ; GFX9-LABEL: shuffle_v3f32_rebroadcast:
1524 ; GFX9: ; %bb.0: ; %entry
1525 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1526 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
1527 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1528 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
1529 ; GFX9-NEXT: v_mov_b32_e32 v2, v1
1530 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1532 ; GFX10-LABEL: shuffle_v3f32_rebroadcast:
1533 ; GFX10: ; %bb.0: ; %entry
1534 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1535 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
1536 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1537 ; GFX10-NEXT: v_mov_b32_e32 v0, v1
1538 ; GFX10-NEXT: v_mov_b32_e32 v2, v1
1539 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1541 ; GFX11-LABEL: shuffle_v3f32_rebroadcast:
1542 ; GFX11: ; %bb.0: ; %entry
1543 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1544 ; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
1545 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1546 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
1547 ; GFX11-NEXT: v_mov_b32_e32 v2, v1
1548 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1550 %val0 = load <3 x float>, ptr addrspace(1) %arg0
1551 %val1 = shufflevector <3 x float> %val0, <3 x float> poison, <3 x i32> <i32 1, i32 1, i32 1>
1552 ret <3 x float> %val1
; Loads a <4 x float> through the addrspace(1) pointer %arg0 and returns a
; vector in which all four lanes are element 1 of the loaded value (every
; shuffle mask index is 1).
; The checks below expect a 128-bit global load into v[0:3] followed by copies
; of v1 into v0, v2 and v3; v1 itself is left as loaded.
1555 define <4 x float> @shuffle_v4f32_rebroadcast(ptr addrspace(1) %arg0) {
1556 ; GFX9-LABEL: shuffle_v4f32_rebroadcast:
1557 ; GFX9: ; %bb.0: ; %entry
1558 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1559 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
1560 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1561 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
1562 ; GFX9-NEXT: v_mov_b32_e32 v2, v1
1563 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
1564 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1566 ; GFX10-LABEL: shuffle_v4f32_rebroadcast:
1567 ; GFX10: ; %bb.0: ; %entry
1568 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1569 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
1570 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1571 ; GFX10-NEXT: v_mov_b32_e32 v0, v1
1572 ; GFX10-NEXT: v_mov_b32_e32 v2, v1
1573 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
1574 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1576 ; GFX11-LABEL: shuffle_v4f32_rebroadcast:
1577 ; GFX11: ; %bb.0: ; %entry
1578 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1579 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
1580 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1581 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
1582 ; GFX11-NEXT: v_mov_b32_e32 v2, v1
1583 ; GFX11-NEXT: v_mov_b32_e32 v3, v1
1584 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1586 %val0 = load <4 x float>, ptr addrspace(1) %arg0
1587 %val1 = shufflevector <4 x float> %val0, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1588 ret <4 x float> %val1
; Splat test: load a <6 x float> through the addrspace(1) pointer and broadcast
; element 1 to all six result lanes. The checks below expect only a
; four-register load into v[0:3] even though the IR vector has six elements,
; followed by v_mov_b32 copies of v1 into v0 and v2 through v5.
1591 define <6 x float> @shuffle_v6f32_rebroadcast(ptr addrspace(1) %arg0) {
1592 ; GFX9-LABEL: shuffle_v6f32_rebroadcast:
1593 ; GFX9: ; %bb.0: ; %entry
1594 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1595 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
1596 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1597 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
1598 ; GFX9-NEXT: v_mov_b32_e32 v2, v1
1599 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
1600 ; GFX9-NEXT: v_mov_b32_e32 v4, v1
1601 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
1602 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1604 ; GFX10-LABEL: shuffle_v6f32_rebroadcast:
1605 ; GFX10: ; %bb.0: ; %entry
1606 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1607 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
1608 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1609 ; GFX10-NEXT: v_mov_b32_e32 v0, v1
1610 ; GFX10-NEXT: v_mov_b32_e32 v2, v1
1611 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
1612 ; GFX10-NEXT: v_mov_b32_e32 v4, v1
1613 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
1614 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1616 ; GFX11-LABEL: shuffle_v6f32_rebroadcast:
1617 ; GFX11: ; %bb.0: ; %entry
1618 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1619 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
1620 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1621 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
1622 ; GFX11-NEXT: v_mov_b32_e32 v2, v1
1623 ; GFX11-NEXT: v_mov_b32_e32 v3, v1
1624 ; GFX11-NEXT: v_mov_b32_e32 v4, v1
1625 ; GFX11-NEXT: v_mov_b32_e32 v5, v1
1626 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1628 %val0 = load <6 x float>, ptr addrspace(1) %arg0
; Every mask index is 1, so each result lane takes element 1 of %val0; the
; second shuffle operand is poison and is never selected.
1629 %val1 = shufflevector <6 x float> %val0, <6 x float> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1630 ret <6 x float> %val1
; Splat test: load an <8 x float> through the addrspace(1) pointer and
; broadcast element 1 to all eight result lanes. The checks below expect only a
; four-register load into v[0:3], followed by v_mov_b32 copies of v1 into v0
; and v2 through v7.
1633 define <8 x float> @shuffle_v8f32_rebroadcast(ptr addrspace(1) %arg0) {
1634 ; GFX9-LABEL: shuffle_v8f32_rebroadcast:
1635 ; GFX9: ; %bb.0: ; %entry
1636 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1637 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
1638 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1639 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
1640 ; GFX9-NEXT: v_mov_b32_e32 v2, v1
1641 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
1642 ; GFX9-NEXT: v_mov_b32_e32 v4, v1
1643 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
1644 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
1645 ; GFX9-NEXT: v_mov_b32_e32 v7, v1
1646 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1648 ; GFX10-LABEL: shuffle_v8f32_rebroadcast:
1649 ; GFX10: ; %bb.0: ; %entry
1650 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1651 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
1652 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1653 ; GFX10-NEXT: v_mov_b32_e32 v0, v1
1654 ; GFX10-NEXT: v_mov_b32_e32 v2, v1
1655 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
1656 ; GFX10-NEXT: v_mov_b32_e32 v4, v1
1657 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
1658 ; GFX10-NEXT: v_mov_b32_e32 v6, v1
1659 ; GFX10-NEXT: v_mov_b32_e32 v7, v1
1660 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1662 ; GFX11-LABEL: shuffle_v8f32_rebroadcast:
1663 ; GFX11: ; %bb.0: ; %entry
1664 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1665 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
1666 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1667 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
1668 ; GFX11-NEXT: v_mov_b32_e32 v2, v1
1669 ; GFX11-NEXT: v_mov_b32_e32 v3, v1
1670 ; GFX11-NEXT: v_mov_b32_e32 v4, v1
1671 ; GFX11-NEXT: v_mov_b32_e32 v5, v1
1672 ; GFX11-NEXT: v_mov_b32_e32 v6, v1
1673 ; GFX11-NEXT: v_mov_b32_e32 v7, v1
1674 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1676 %val0 = load <8 x float>, ptr addrspace(1) %arg0
; Every mask index is 1, so each result lane takes element 1 of %val0; the
; second shuffle operand is poison and is never selected.
1677 %val1 = shufflevector <8 x float> %val0, <8 x float> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1678 ret <8 x float> %val1
; Splat test: load a <16 x float> through the addrspace(1) pointer and
; broadcast element 1 to all sixteen result lanes. The checks below expect only
; a four-register load into v[0:3], followed by v_mov_b32 copies of v1 into v0
; and v2 through v15.
1681 define <16 x float> @shuffle_v16f32_rebroadcast(ptr addrspace(1) %arg0) {
1682 ; GFX9-LABEL: shuffle_v16f32_rebroadcast:
1683 ; GFX9: ; %bb.0: ; %entry
1684 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1685 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
1686 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1687 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
1688 ; GFX9-NEXT: v_mov_b32_e32 v2, v1
1689 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
1690 ; GFX9-NEXT: v_mov_b32_e32 v4, v1
1691 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
1692 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
1693 ; GFX9-NEXT: v_mov_b32_e32 v7, v1
1694 ; GFX9-NEXT: v_mov_b32_e32 v8, v1
1695 ; GFX9-NEXT: v_mov_b32_e32 v9, v1
1696 ; GFX9-NEXT: v_mov_b32_e32 v10, v1
1697 ; GFX9-NEXT: v_mov_b32_e32 v11, v1
1698 ; GFX9-NEXT: v_mov_b32_e32 v12, v1
1699 ; GFX9-NEXT: v_mov_b32_e32 v13, v1
1700 ; GFX9-NEXT: v_mov_b32_e32 v14, v1
1701 ; GFX9-NEXT: v_mov_b32_e32 v15, v1
1702 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1704 ; GFX10-LABEL: shuffle_v16f32_rebroadcast:
1705 ; GFX10: ; %bb.0: ; %entry
1706 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1707 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
1708 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1709 ; GFX10-NEXT: v_mov_b32_e32 v0, v1
1710 ; GFX10-NEXT: v_mov_b32_e32 v2, v1
1711 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
1712 ; GFX10-NEXT: v_mov_b32_e32 v4, v1
1713 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
1714 ; GFX10-NEXT: v_mov_b32_e32 v6, v1
1715 ; GFX10-NEXT: v_mov_b32_e32 v7, v1
1716 ; GFX10-NEXT: v_mov_b32_e32 v8, v1
1717 ; GFX10-NEXT: v_mov_b32_e32 v9, v1
1718 ; GFX10-NEXT: v_mov_b32_e32 v10, v1
1719 ; GFX10-NEXT: v_mov_b32_e32 v11, v1
1720 ; GFX10-NEXT: v_mov_b32_e32 v12, v1
1721 ; GFX10-NEXT: v_mov_b32_e32 v13, v1
1722 ; GFX10-NEXT: v_mov_b32_e32 v14, v1
1723 ; GFX10-NEXT: v_mov_b32_e32 v15, v1
1724 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1726 ; GFX11-LABEL: shuffle_v16f32_rebroadcast:
1727 ; GFX11: ; %bb.0: ; %entry
1728 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1729 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
1730 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1731 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
1732 ; GFX11-NEXT: v_mov_b32_e32 v2, v1
1733 ; GFX11-NEXT: v_mov_b32_e32 v3, v1
1734 ; GFX11-NEXT: v_mov_b32_e32 v4, v1
1735 ; GFX11-NEXT: v_mov_b32_e32 v5, v1
1736 ; GFX11-NEXT: v_mov_b32_e32 v6, v1
1737 ; GFX11-NEXT: v_mov_b32_e32 v7, v1
1738 ; GFX11-NEXT: v_mov_b32_e32 v8, v1
1739 ; GFX11-NEXT: v_mov_b32_e32 v9, v1
1740 ; GFX11-NEXT: v_mov_b32_e32 v10, v1
1741 ; GFX11-NEXT: v_mov_b32_e32 v11, v1
1742 ; GFX11-NEXT: v_mov_b32_e32 v12, v1
1743 ; GFX11-NEXT: v_mov_b32_e32 v13, v1
1744 ; GFX11-NEXT: v_mov_b32_e32 v14, v1
1745 ; GFX11-NEXT: v_mov_b32_e32 v15, v1
1746 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1748 %val0 = load <16 x float>, ptr addrspace(1) %arg0
; Every mask index is 1, so each result lane takes element 1 of %val0; the
; second shuffle operand is poison and is never selected.
1749 %val1 = shufflevector <16 x float> %val0, <16 x float> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1750 ret <16 x float> %val1
; Splat test: load a <32 x float> through the addrspace(1) pointer and
; broadcast element 1 to all thirty-two result lanes. The checks below expect
; only a four-register load into v[0:3], followed by v_mov_b32 copies of v1
; into v0 and v2 through v31.
1753 define <32 x float> @shuffle_v32f32_rebroadcast(ptr addrspace(1) %arg0) {
1754 ; GFX9-LABEL: shuffle_v32f32_rebroadcast:
1755 ; GFX9: ; %bb.0: ; %entry
1756 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1757 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
1758 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1759 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
1760 ; GFX9-NEXT: v_mov_b32_e32 v2, v1
1761 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
1762 ; GFX9-NEXT: v_mov_b32_e32 v4, v1
1763 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
1764 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
1765 ; GFX9-NEXT: v_mov_b32_e32 v7, v1
1766 ; GFX9-NEXT: v_mov_b32_e32 v8, v1
1767 ; GFX9-NEXT: v_mov_b32_e32 v9, v1
1768 ; GFX9-NEXT: v_mov_b32_e32 v10, v1
1769 ; GFX9-NEXT: v_mov_b32_e32 v11, v1
1770 ; GFX9-NEXT: v_mov_b32_e32 v12, v1
1771 ; GFX9-NEXT: v_mov_b32_e32 v13, v1
1772 ; GFX9-NEXT: v_mov_b32_e32 v14, v1
1773 ; GFX9-NEXT: v_mov_b32_e32 v15, v1
1774 ; GFX9-NEXT: v_mov_b32_e32 v16, v1
1775 ; GFX9-NEXT: v_mov_b32_e32 v17, v1
1776 ; GFX9-NEXT: v_mov_b32_e32 v18, v1
1777 ; GFX9-NEXT: v_mov_b32_e32 v19, v1
1778 ; GFX9-NEXT: v_mov_b32_e32 v20, v1
1779 ; GFX9-NEXT: v_mov_b32_e32 v21, v1
1780 ; GFX9-NEXT: v_mov_b32_e32 v22, v1
1781 ; GFX9-NEXT: v_mov_b32_e32 v23, v1
1782 ; GFX9-NEXT: v_mov_b32_e32 v24, v1
1783 ; GFX9-NEXT: v_mov_b32_e32 v25, v1
1784 ; GFX9-NEXT: v_mov_b32_e32 v26, v1
1785 ; GFX9-NEXT: v_mov_b32_e32 v27, v1
1786 ; GFX9-NEXT: v_mov_b32_e32 v28, v1
1787 ; GFX9-NEXT: v_mov_b32_e32 v29, v1
1788 ; GFX9-NEXT: v_mov_b32_e32 v30, v1
1789 ; GFX9-NEXT: v_mov_b32_e32 v31, v1
1790 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1792 ; GFX10-LABEL: shuffle_v32f32_rebroadcast:
1793 ; GFX10: ; %bb.0: ; %entry
1794 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1795 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
1796 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1797 ; GFX10-NEXT: v_mov_b32_e32 v0, v1
1798 ; GFX10-NEXT: v_mov_b32_e32 v2, v1
1799 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
1800 ; GFX10-NEXT: v_mov_b32_e32 v4, v1
1801 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
1802 ; GFX10-NEXT: v_mov_b32_e32 v6, v1
1803 ; GFX10-NEXT: v_mov_b32_e32 v7, v1
1804 ; GFX10-NEXT: v_mov_b32_e32 v8, v1
1805 ; GFX10-NEXT: v_mov_b32_e32 v9, v1
1806 ; GFX10-NEXT: v_mov_b32_e32 v10, v1
1807 ; GFX10-NEXT: v_mov_b32_e32 v11, v1
1808 ; GFX10-NEXT: v_mov_b32_e32 v12, v1
1809 ; GFX10-NEXT: v_mov_b32_e32 v13, v1
1810 ; GFX10-NEXT: v_mov_b32_e32 v14, v1
1811 ; GFX10-NEXT: v_mov_b32_e32 v15, v1
1812 ; GFX10-NEXT: v_mov_b32_e32 v16, v1
1813 ; GFX10-NEXT: v_mov_b32_e32 v17, v1
1814 ; GFX10-NEXT: v_mov_b32_e32 v18, v1
1815 ; GFX10-NEXT: v_mov_b32_e32 v19, v1
1816 ; GFX10-NEXT: v_mov_b32_e32 v20, v1
1817 ; GFX10-NEXT: v_mov_b32_e32 v21, v1
1818 ; GFX10-NEXT: v_mov_b32_e32 v22, v1
1819 ; GFX10-NEXT: v_mov_b32_e32 v23, v1
1820 ; GFX10-NEXT: v_mov_b32_e32 v24, v1
1821 ; GFX10-NEXT: v_mov_b32_e32 v25, v1
1822 ; GFX10-NEXT: v_mov_b32_e32 v26, v1
1823 ; GFX10-NEXT: v_mov_b32_e32 v27, v1
1824 ; GFX10-NEXT: v_mov_b32_e32 v28, v1
1825 ; GFX10-NEXT: v_mov_b32_e32 v29, v1
1826 ; GFX10-NEXT: v_mov_b32_e32 v30, v1
1827 ; GFX10-NEXT: v_mov_b32_e32 v31, v1
1828 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1830 ; GFX11-LABEL: shuffle_v32f32_rebroadcast:
1831 ; GFX11: ; %bb.0: ; %entry
1832 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1833 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
1834 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1835 ; GFX11-NEXT: v_mov_b32_e32 v0, v1
1836 ; GFX11-NEXT: v_mov_b32_e32 v2, v1
1837 ; GFX11-NEXT: v_mov_b32_e32 v3, v1
1838 ; GFX11-NEXT: v_mov_b32_e32 v4, v1
1839 ; GFX11-NEXT: v_mov_b32_e32 v5, v1
1840 ; GFX11-NEXT: v_mov_b32_e32 v6, v1
1841 ; GFX11-NEXT: v_mov_b32_e32 v7, v1
1842 ; GFX11-NEXT: v_mov_b32_e32 v8, v1
1843 ; GFX11-NEXT: v_mov_b32_e32 v9, v1
1844 ; GFX11-NEXT: v_mov_b32_e32 v10, v1
1845 ; GFX11-NEXT: v_mov_b32_e32 v11, v1
1846 ; GFX11-NEXT: v_mov_b32_e32 v12, v1
1847 ; GFX11-NEXT: v_mov_b32_e32 v13, v1
1848 ; GFX11-NEXT: v_mov_b32_e32 v14, v1
1849 ; GFX11-NEXT: v_mov_b32_e32 v15, v1
1850 ; GFX11-NEXT: v_mov_b32_e32 v16, v1
1851 ; GFX11-NEXT: v_mov_b32_e32 v17, v1
1852 ; GFX11-NEXT: v_mov_b32_e32 v18, v1
1853 ; GFX11-NEXT: v_mov_b32_e32 v19, v1
1854 ; GFX11-NEXT: v_mov_b32_e32 v20, v1
1855 ; GFX11-NEXT: v_mov_b32_e32 v21, v1
1856 ; GFX11-NEXT: v_mov_b32_e32 v22, v1
1857 ; GFX11-NEXT: v_mov_b32_e32 v23, v1
1858 ; GFX11-NEXT: v_mov_b32_e32 v24, v1
1859 ; GFX11-NEXT: v_mov_b32_e32 v25, v1
1860 ; GFX11-NEXT: v_mov_b32_e32 v26, v1
1861 ; GFX11-NEXT: v_mov_b32_e32 v27, v1
1862 ; GFX11-NEXT: v_mov_b32_e32 v28, v1
1863 ; GFX11-NEXT: v_mov_b32_e32 v29, v1
1864 ; GFX11-NEXT: v_mov_b32_e32 v30, v1
1865 ; GFX11-NEXT: v_mov_b32_e32 v31, v1
1866 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1868 %val0 = load <32 x float>, ptr addrspace(1) %arg0
; Every mask index is 1, so each result lane takes element 1 of %val0; the
; second shuffle operand is poison and is never selected.
1869 %val1 = shufflevector <32 x float> %val0, <32 x float> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1870 ret <32 x float> %val1