1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
; Kernel: build <2 x i16> with constant 0 in element 0 and the kernel
; argument %a in element 1, reinterpret the pair as an i32 and store it to %out.
; Expected lowering on every run line: one scalar shift of the loaded argument
; left by 16 (s_lshl_b32), then a VGPR copy and a dword store.
7 define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
8 ; GCN-LABEL: uniform_vec_0_i16:
10 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
11 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
12 ; GCN-NEXT: s_mov_b32 s3, 0xf000
13 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
14 ; GCN-NEXT: s_lshl_b32 s4, s2, 16
15 ; GCN-NEXT: s_mov_b32 s2, -1
16 ; GCN-NEXT: v_mov_b32_e32 v0, s4
17 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
20 ; GFX9-LABEL: uniform_vec_0_i16:
22 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
23 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
25 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
26 ; GFX9-NEXT: s_lshl_b32 s0, s4, 16
27 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
28 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
31 ; GFX906-LABEL: uniform_vec_0_i16:
33 ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c
34 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
35 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
36 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
37 ; GFX906-NEXT: s_lshl_b32 s0, s4, 16
38 ; GFX906-NEXT: v_mov_b32_e32 v1, s0
39 ; GFX906-NEXT: global_store_dword v0, v1, s[2:3]
40 ; GFX906-NEXT: s_endpgm
42 ; GFX11-LABEL: uniform_vec_0_i16:
44 ; GFX11-NEXT: s_clause 0x1
45 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
46 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
47 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
48 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16
49 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
50 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
51 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
53 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
54 ; GFX11-NEXT: s_endpgm
; IR under test: element 0 = 0, element 1 = %a.
55 %tmp = insertelement <2 x i16> undef, i16 0, i32 0
56 %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
57 %val = bitcast <2 x i16> %vec to i32
58 store i32 %val, ptr addrspace(1) %out, align 4
; Non-kernel variant of uniform_vec_0_i16: the i16 argument %a is placed in
; element 1 of a <2 x i16> whose element 0 is constant 0, and the vector is
; bitcast to i32 (the function's declared return type).
; Expected lowering on every run line: a single vector shift left by 16
; (v_lshlrev_b32_e32) applied to v0.
62 define i32 @divergent_vec_0_i16(i16 %a) {
63 ; GCN-LABEL: divergent_vec_0_i16:
65 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
67 ; GCN-NEXT: s_setpc_b64 s[30:31]
69 ; GFX9-LABEL: divergent_vec_0_i16:
71 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
73 ; GFX9-NEXT: s_setpc_b64 s[30:31]
75 ; GFX906-LABEL: divergent_vec_0_i16:
77 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78 ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0
79 ; GFX906-NEXT: s_setpc_b64 s[30:31]
81 ; GFX11-LABEL: divergent_vec_0_i16:
83 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
84 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
85 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: element 0 = 0, element 1 = %a.
86 %tmp = insertelement <2 x i16> undef, i16 0, i32 0
87 %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
88 %val = bitcast <2 x i16> %vec to i32
; Kernel: build <2 x i16> with the kernel argument %a in element 0 and
; constant 0 in element 1, reinterpret the pair as an i32 and store it to %out.
; Expected lowering on every run line: one scalar AND of the loaded argument
; with 0xffff (s_and_b32), then a VGPR copy and a dword store.
92 define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) {
93 ; GCN-LABEL: uniform_vec_i16_0:
95 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
96 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
97 ; GCN-NEXT: s_mov_b32 s3, 0xf000
98 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
99 ; GCN-NEXT: s_and_b32 s4, s2, 0xffff
100 ; GCN-NEXT: s_mov_b32 s2, -1
101 ; GCN-NEXT: v_mov_b32_e32 v0, s4
102 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
105 ; GFX9-LABEL: uniform_vec_i16_0:
107 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
108 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
109 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
110 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s4
112 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
113 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
114 ; GFX9-NEXT: s_endpgm
116 ; GFX906-LABEL: uniform_vec_i16_0:
118 ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c
119 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
120 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
121 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
122 ; GFX906-NEXT: s_and_b32 s0, 0xffff, s4
123 ; GFX906-NEXT: v_mov_b32_e32 v1, s0
124 ; GFX906-NEXT: global_store_dword v0, v1, s[2:3]
125 ; GFX906-NEXT: s_endpgm
127 ; GFX11-LABEL: uniform_vec_i16_0:
129 ; GFX11-NEXT: s_clause 0x1
130 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
131 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
132 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
133 ; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
134 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
135 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
136 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
137 ; GFX11-NEXT: s_nop 0
138 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
139 ; GFX11-NEXT: s_endpgm
; IR under test: element 0 = %a, element 1 = 0.
140 %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
141 %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
142 %val = bitcast <2 x i16> %vec to i32
143 store i32 %val, ptr addrspace(1) %out, align 4
; Non-kernel variant of uniform_vec_i16_0: the i16 argument %a is placed in
; element 0 of a <2 x i16> whose element 1 is constant 0, and the vector is
; bitcast to i32 (the function's declared return type).
; Expected lowering on every run line: a single vector AND of v0 with 0xffff
; (v_and_b32_e32).
147 define i32 @divergent_vec_i16_0(i16 %a) {
148 ; GCN-LABEL: divergent_vec_i16_0:
150 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
152 ; GCN-NEXT: s_setpc_b64 s[30:31]
154 ; GFX9-LABEL: divergent_vec_i16_0:
156 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
158 ; GFX9-NEXT: s_setpc_b64 s[30:31]
160 ; GFX906-LABEL: divergent_vec_i16_0:
162 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
163 ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
164 ; GFX906-NEXT: s_setpc_b64 s[30:31]
166 ; GFX11-LABEL: divergent_vec_i16_0:
168 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
170 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: element 0 = %a, element 1 = 0.
171 %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
172 %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
173 %val = bitcast <2 x i16> %vec to i32
; Kernel, half-precision counterpart of uniform_vec_i16_0: build <2 x half>
; with the kernel argument %a in element 0 and 0.0 in element 1, reinterpret
; the pair as a float and store it to %out.
; Expected lowering on every run line: one scalar AND of the loaded argument
; with 0xffff (s_and_b32), then a VGPR copy and a dword store.
177 define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) {
178 ; GCN-LABEL: uniform_vec_f16_0:
180 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
181 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
182 ; GCN-NEXT: s_mov_b32 s3, 0xf000
183 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
184 ; GCN-NEXT: s_and_b32 s4, s2, 0xffff
185 ; GCN-NEXT: s_mov_b32 s2, -1
186 ; GCN-NEXT: v_mov_b32_e32 v0, s4
187 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
190 ; GFX9-LABEL: uniform_vec_f16_0:
192 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
193 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
194 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
195 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
196 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s4
197 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
198 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
199 ; GFX9-NEXT: s_endpgm
201 ; GFX906-LABEL: uniform_vec_f16_0:
203 ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c
204 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
205 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
206 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
207 ; GFX906-NEXT: s_and_b32 s0, 0xffff, s4
208 ; GFX906-NEXT: v_mov_b32_e32 v1, s0
209 ; GFX906-NEXT: global_store_dword v0, v1, s[2:3]
210 ; GFX906-NEXT: s_endpgm
212 ; GFX11-LABEL: uniform_vec_f16_0:
214 ; GFX11-NEXT: s_clause 0x1
215 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
216 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
217 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
218 ; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
219 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
220 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
221 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
222 ; GFX11-NEXT: s_nop 0
223 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
224 ; GFX11-NEXT: s_endpgm
; IR under test: element 0 = %a, element 1 = 0.0.
225 %tmp = insertelement <2 x half> undef, half %a, i32 0
226 %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
227 %val = bitcast <2 x half> %vec to float
228 store float %val, ptr addrspace(1) %out, align 4
; Non-kernel variant of uniform_vec_f16_0: the half argument %a is placed in
; element 0 of a <2 x half> whose element 1 is 0.0, and the vector is bitcast
; to float (the function's declared return type).
; Expected lowering differs by run line: the default-CPU run (GCN prefix) only
; converts v0 from f32 to f16 (v_cvt_f16_f32_e32), while the gfx900, gfx906 and
; gfx1100 runs mask v0 with 0xffff (v_and_b32_e32).
232 define float @divergent_vec_f16_0(half %a) {
233 ; GCN-LABEL: divergent_vec_f16_0:
235 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
237 ; GCN-NEXT: s_setpc_b64 s[30:31]
239 ; GFX9-LABEL: divergent_vec_f16_0:
241 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
243 ; GFX9-NEXT: s_setpc_b64 s[30:31]
245 ; GFX906-LABEL: divergent_vec_f16_0:
247 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248 ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
249 ; GFX906-NEXT: s_setpc_b64 s[30:31]
251 ; GFX11-LABEL: divergent_vec_f16_0:
253 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
255 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: element 0 = %a, element 1 = 0.0.
256 %tmp = insertelement <2 x half> undef, half %a, i32 0
257 %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
258 %val = bitcast <2 x half> %vec to float
; Kernel: volatile-load one i32 through each addrspace(4) pointer, truncate
; both to i16, pack them as <2 x i16> (first load in element 0, second in
; element 1), bitcast to i32 and hand the result to inline asm through an
; "s" constraint.
; Expected lowering: the default-CPU run (GCN prefix) combines the halves with
; s_and_b32 / s_lshl_b32 / s_or_b32; the gfx900, gfx906 and gfx1100 runs use a
; single s_pack_ll_b32_b16.
262 define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) {
263 ; GCN-LABEL: uniform_vec_i16_LL:
265 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
266 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
267 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
268 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0
269 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
270 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff
271 ; GCN-NEXT: s_lshl_b32 s1, s1, 16
272 ; GCN-NEXT: s_or_b32 s0, s0, s1
273 ; GCN-NEXT: ;;#ASMSTART
275 ; GCN-NEXT: ;;#ASMEND
278 ; GFX9-LABEL: uniform_vec_i16_LL:
280 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
281 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
282 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
283 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
284 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
285 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
286 ; GFX9-NEXT: ;;#ASMSTART
287 ; GFX9-NEXT: ; use s0
288 ; GFX9-NEXT: ;;#ASMEND
289 ; GFX9-NEXT: s_endpgm
291 ; GFX906-LABEL: uniform_vec_i16_LL:
293 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
294 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
295 ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
296 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
297 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
298 ; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
299 ; GFX906-NEXT: ;;#ASMSTART
300 ; GFX906-NEXT: ; use s0
301 ; GFX906-NEXT: ;;#ASMEND
302 ; GFX906-NEXT: s_endpgm
304 ; GFX11-LABEL: uniform_vec_i16_LL:
306 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
307 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
308 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
309 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
310 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
311 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
312 ; GFX11-NEXT: ;;#ASMSTART
313 ; GFX11-NEXT: ; use s0
314 ; GFX11-NEXT: ;;#ASMEND
315 ; GFX11-NEXT: s_endpgm
; IR under test: low 16 bits of each loaded dword become the two elements.
316 %val0 = load volatile i32, ptr addrspace(4) %in0
317 %val1 = load volatile i32, ptr addrspace(4) %in1
318 %lo = trunc i32 %val0 to i16
319 %hi = trunc i32 %val1 to i16
320 %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
321 %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
322 %vec.i32 = bitcast <2 x i16> %vec.1 to i32
; The "s" constraint on the asm operand asks for the packed value in an SGPR.
323 call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
; Non-kernel function: pack the two i16 arguments into <2 x i16> (%a in
; element 0, %b in element 1) and bitcast to i32 (the declared return type).
; Expected lowering: the default-CPU run (GCN prefix) uses shift / AND / OR on
; VGPRs; the gfx900 and gfx906 runs use v_perm_b32 with the selector 0x5040100
; materialised in s4; the gfx1100 run uses v_perm_b32 with the same selector
; as a literal operand.
327 define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) {
328 ; GCN-LABEL: divergent_vec_i16_LL:
330 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
331 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
332 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
333 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
334 ; GCN-NEXT: s_setpc_b64 s[30:31]
336 ; GFX9-LABEL: divergent_vec_i16_LL:
338 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
339 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
340 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
341 ; GFX9-NEXT: s_setpc_b64 s[30:31]
343 ; GFX906-LABEL: divergent_vec_i16_LL:
345 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346 ; GFX906-NEXT: s_mov_b32 s4, 0x5040100
347 ; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
348 ; GFX906-NEXT: s_setpc_b64 s[30:31]
350 ; GFX11-LABEL: divergent_vec_i16_LL:
352 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
353 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
354 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: element 0 = %a, element 1 = %b.
355 %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
356 %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1
357 %val = bitcast <2 x i16> %vec to i32
; Kernel: pack the i16 argument %a (element 0) with the upper 16 bits of the
; i32 argument %b (element 1, obtained by lshr 16 + trunc), bitcast the
; <2 x i16> to i32 and store it to %out.
; Expected lowering: the default-CPU run (GCN prefix) masks %b with 0xffff0000,
; masks %a with 0xffff and ORs them; the gfx900, gfx906 and gfx1100 runs use a
; single s_pack_lh_b32_b16.
361 define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 %b) {
362 ; GCN-LABEL: uniform_vec_i16_LH:
364 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
365 ; GCN-NEXT: s_mov_b32 s7, 0xf000
366 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
367 ; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000
368 ; GCN-NEXT: s_and_b32 s2, s2, 0xffff
369 ; GCN-NEXT: s_or_b32 s2, s2, s3
370 ; GCN-NEXT: s_mov_b32 s6, -1
371 ; GCN-NEXT: s_mov_b32 s4, s0
372 ; GCN-NEXT: s_mov_b32 s5, s1
373 ; GCN-NEXT: v_mov_b32_e32 v0, s2
374 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
377 ; GFX9-LABEL: uniform_vec_i16_LH:
379 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
380 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
381 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
382 ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3
383 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
384 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
385 ; GFX9-NEXT: s_endpgm
387 ; GFX906-LABEL: uniform_vec_i16_LH:
389 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
390 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
391 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
392 ; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3
393 ; GFX906-NEXT: v_mov_b32_e32 v1, s2
394 ; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
395 ; GFX906-NEXT: s_endpgm
397 ; GFX11-LABEL: uniform_vec_i16_LH:
399 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
400 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
401 ; GFX11-NEXT: s_pack_lh_b32_b16 s2, s2, s3
402 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
403 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
404 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
405 ; GFX11-NEXT: s_nop 0
406 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
407 ; GFX11-NEXT: s_endpgm
; IR under test: %tr is the high half of %b; element 0 = %a, element 1 = %tr.
408 %shift = lshr i32 %b, 16
409 %tr = trunc i32 %shift to i16
410 %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
411 %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
412 %val = bitcast <2 x i16> %vec to i32
413 store i32 %val, ptr addrspace(1) %out, align 4
; Non-kernel variant of uniform_vec_i16_LH: pack the i16 argument %a
; (element 0) with the upper 16 bits of the i32 argument %b (element 1) and
; bitcast the <2 x i16> to i32 (the declared return type).
; Expected lowering on every run line: one v_bfi_b32 with mask 0xffff; the
; mask is materialised in s4 except on the gfx1100 run, where it is a literal
; operand.
417 define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
418 ; GCN-LABEL: divergent_vec_i16_LH:
420 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
421 ; GCN-NEXT: s_mov_b32 s4, 0xffff
422 ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
423 ; GCN-NEXT: s_setpc_b64 s[30:31]
425 ; GFX9-LABEL: divergent_vec_i16_LH:
427 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428 ; GFX9-NEXT: s_mov_b32 s4, 0xffff
429 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
430 ; GFX9-NEXT: s_setpc_b64 s[30:31]
432 ; GFX906-LABEL: divergent_vec_i16_LH:
434 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
435 ; GFX906-NEXT: s_mov_b32 s4, 0xffff
436 ; GFX906-NEXT: v_bfi_b32 v0, s4, v0, v1
437 ; GFX906-NEXT: s_setpc_b64 s[30:31]
439 ; GFX11-LABEL: divergent_vec_i16_LH:
441 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
443 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: %tr is the high half of %b; element 0 = %a, element 1 = %tr.
444 %shift = lshr i32 %b, 16
445 %tr = trunc i32 %shift to i16
446 %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
447 %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
448 %val = bitcast <2 x i16> %vec to i32
; Kernel: take the upper 16 bits of each i32 argument (lshr 16 + trunc), pack
; them as <2 x i16> (%a's high half in element 0, %b's in element 1), bitcast
; to i32 and store to %out.
; Expected lowering: the default-CPU run (GCN prefix) shifts %b right by 16 on
; the scalar unit and combines with %a via v_alignbit_b32; the gfx900, gfx906
; and gfx1100 runs use a single s_pack_hh_b32_b16.
452 define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 %b) {
453 ; GCN-LABEL: uniform_vec_i16_HH:
455 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
456 ; GCN-NEXT: s_mov_b32 s7, 0xf000
457 ; GCN-NEXT: s_mov_b32 s6, -1
458 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
459 ; GCN-NEXT: s_mov_b32 s4, s0
460 ; GCN-NEXT: s_mov_b32 s5, s1
461 ; GCN-NEXT: s_lshr_b32 s0, s3, 16
462 ; GCN-NEXT: v_mov_b32_e32 v0, s2
463 ; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 16
464 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
467 ; GFX9-LABEL: uniform_vec_i16_HH:
469 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
470 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
471 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3
473 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
474 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
475 ; GFX9-NEXT: s_endpgm
477 ; GFX906-LABEL: uniform_vec_i16_HH:
479 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
480 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
481 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
482 ; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3
483 ; GFX906-NEXT: v_mov_b32_e32 v1, s2
484 ; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
485 ; GFX906-NEXT: s_endpgm
487 ; GFX11-LABEL: uniform_vec_i16_HH:
489 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
490 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
491 ; GFX11-NEXT: s_pack_hh_b32_b16 s2, s2, s3
492 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
493 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
494 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
495 ; GFX11-NEXT: s_nop 0
496 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
497 ; GFX11-NEXT: s_endpgm
; IR under test: %tr_a / %tr_b are the high halves of %a / %b.
498 %shift_a = lshr i32 %a, 16
499 %tr_a = trunc i32 %shift_a to i16
500 %shift_b = lshr i32 %b, 16
501 %tr_b = trunc i32 %shift_b to i16
502 %tmp = insertelement <2 x i16> undef, i16 %tr_a, i32 0
503 %vec = insertelement <2 x i16> %tmp, i16 %tr_b, i32 1
504 %val = bitcast <2 x i16> %vec to i32
505 store i32 %val, ptr addrspace(1) %out, align 4
; Non-kernel variant of uniform_vec_i16_HH: pack the upper 16 bits of %a
; (element 0) and of %b (element 1) into <2 x i16> and bitcast to i32 (the
; declared return type).
; Expected lowering: the default-CPU run (GCN prefix) uses v_lshrrev_b32 plus
; v_alignbit_b32; the gfx900 and gfx906 runs use v_perm_b32 with the selector
; 0x7060302 materialised in s4; the gfx1100 run uses v_perm_b32 with the same
; selector as a literal operand.
509 define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
510 ; GCN-LABEL: divergent_vec_i16_HH:
512 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
513 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
514 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
515 ; GCN-NEXT: s_setpc_b64 s[30:31]
517 ; GFX9-LABEL: divergent_vec_i16_HH:
519 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520 ; GFX9-NEXT: s_mov_b32 s4, 0x7060302
521 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
522 ; GFX9-NEXT: s_setpc_b64 s[30:31]
524 ; GFX906-LABEL: divergent_vec_i16_HH:
526 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
527 ; GFX906-NEXT: s_mov_b32 s4, 0x7060302
528 ; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
529 ; GFX906-NEXT: s_setpc_b64 s[30:31]
531 ; GFX11-LABEL: divergent_vec_i16_HH:
533 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
535 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: %tr_a / %tr_b are the high halves of %a / %b.
536 %shift_a = lshr i32 %a, 16
537 %tr_a = trunc i32 %shift_a to i16
538 %shift_b = lshr i32 %b, 16
539 %tr_b = trunc i32 %shift_b to i16
540 %tmp = insertelement <2 x i16> undef, i16 %tr_a, i32 0
541 %vec = insertelement <2 x i16> %tmp, i16 %tr_b, i32 1
542 %val = bitcast <2 x i16> %vec to i32
; Kernel, half-precision counterpart of uniform_vec_i16_LL: volatile-load one
; i32 through each addrspace(4) pointer, truncate both to i16, bitcast each to
; half, pack them as <2 x half> (first load in element 0, second in element 1),
; bitcast to i32 and hand the result to inline asm through an "s" constraint.
; Expected lowering: the default-CPU run (GCN prefix) combines the halves with
; s_and_b32 / s_lshl_b32 / s_or_b32; the gfx900, gfx906 and gfx1100 runs use a
; single s_pack_ll_b32_b16.
546 define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) {
547 ; GCN-LABEL: uniform_vec_f16_LL:
549 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
550 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
551 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
552 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0
553 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
554 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff
555 ; GCN-NEXT: s_lshl_b32 s1, s1, 16
556 ; GCN-NEXT: s_or_b32 s0, s0, s1
557 ; GCN-NEXT: ;;#ASMSTART
559 ; GCN-NEXT: ;;#ASMEND
562 ; GFX9-LABEL: uniform_vec_f16_LL:
564 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
565 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
566 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
567 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
568 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
569 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
570 ; GFX9-NEXT: ;;#ASMSTART
571 ; GFX9-NEXT: ; use s0
572 ; GFX9-NEXT: ;;#ASMEND
573 ; GFX9-NEXT: s_endpgm
575 ; GFX906-LABEL: uniform_vec_f16_LL:
577 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
578 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
579 ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
580 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
581 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
582 ; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
583 ; GFX906-NEXT: ;;#ASMSTART
584 ; GFX906-NEXT: ; use s0
585 ; GFX906-NEXT: ;;#ASMEND
586 ; GFX906-NEXT: s_endpgm
588 ; GFX11-LABEL: uniform_vec_f16_LL:
590 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
591 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
592 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
593 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
594 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
595 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
596 ; GFX11-NEXT: ;;#ASMSTART
597 ; GFX11-NEXT: ; use s0
598 ; GFX11-NEXT: ;;#ASMEND
599 ; GFX11-NEXT: s_endpgm
; IR under test: low 16 bits of each loaded dword, reinterpreted as half.
600 %val0 = load volatile i32, ptr addrspace(4) %in0
601 %val1 = load volatile i32, ptr addrspace(4) %in1
602 %lo.i = trunc i32 %val0 to i16
603 %hi.i = trunc i32 %val1 to i16
604 %lo = bitcast i16 %lo.i to half
605 %hi = bitcast i16 %hi.i to half
606 %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
607 %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
608 %vec.i32 = bitcast <2 x half> %vec.1 to i32
; The "s" constraint on the asm operand asks for the packed value in an SGPR.
610 call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
; Non-kernel function: pack the two half arguments into <2 x half> (%a in
; element 0, %b in element 1) and bitcast to float (the declared return type).
; Expected lowering: the default-CPU run (GCN prefix) first converts both
; inputs from f32 to f16 (v_cvt_f16_f32_e32), then shifts and ORs them; the
; gfx900 and gfx906 runs use v_perm_b32 with the selector 0x5040100
; materialised in s4; the gfx1100 run uses v_perm_b32 with the same selector
; as a literal operand.
614 define float @divergent_vec_f16_LL(half %a, half %b) {
615 ; GCN-LABEL: divergent_vec_f16_LL:
617 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
618 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
619 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
620 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
621 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
622 ; GCN-NEXT: s_setpc_b64 s[30:31]
624 ; GFX9-LABEL: divergent_vec_f16_LL:
626 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
627 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
628 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
629 ; GFX9-NEXT: s_setpc_b64 s[30:31]
631 ; GFX906-LABEL: divergent_vec_f16_LL:
633 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
634 ; GFX906-NEXT: s_mov_b32 s4, 0x5040100
635 ; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4
636 ; GFX906-NEXT: s_setpc_b64 s[30:31]
638 ; GFX11-LABEL: divergent_vec_f16_LL:
640 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
641 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
642 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: element 0 = %a, element 1 = %b.
643 %tmp = insertelement <2 x half> undef, half %a, i32 0
644 %vec = insertelement <2 x half> %tmp, half %b, i32 1
645 %val = bitcast <2 x half> %vec to float
; Non-kernel function: load an i16 through the addrspace(3) pointer %in and
; insert it into element 0 of an otherwise-undef <2 x i16>; element 1 is never
; written. The declared return type is <2 x i16>.
; Expected lowering: a single 16-bit DS load. The default-CPU run (GCN prefix)
; first sets m0 to -1 and uses ds_read_u16; the gfx900 run uses
; ds_read_u16_d16; the gfx906 run uses ds_read_u16; the gfx1100 run uses
; ds_load_u16_d16.
649 define <2 x i16> @build_vec_v2i16_undeflo_divergent(ptr addrspace(3) %in) #0 {
650 ; GCN-LABEL: build_vec_v2i16_undeflo_divergent:
651 ; GCN: ; %bb.0: ; %entry
652 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
653 ; GCN-NEXT: s_mov_b32 m0, -1
654 ; GCN-NEXT: ds_read_u16 v0, v0
655 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
656 ; GCN-NEXT: s_setpc_b64 s[30:31]
658 ; GFX9-LABEL: build_vec_v2i16_undeflo_divergent:
659 ; GFX9: ; %bb.0: ; %entry
660 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
661 ; GFX9-NEXT: ds_read_u16_d16 v0, v0
662 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
663 ; GFX9-NEXT: s_setpc_b64 s[30:31]
665 ; GFX906-LABEL: build_vec_v2i16_undeflo_divergent:
666 ; GFX906: ; %bb.0: ; %entry
667 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
668 ; GFX906-NEXT: ds_read_u16 v0, v0
669 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
670 ; GFX906-NEXT: s_setpc_b64 s[30:31]
672 ; GFX11-LABEL: build_vec_v2i16_undeflo_divergent:
673 ; GFX11: ; %bb.0: ; %entry
674 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
675 ; GFX11-NEXT: ds_load_u16_d16 v0, v0
676 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
677 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: element 0 = loaded value, element 1 left undef.
679 %load = load i16, ptr addrspace(3) %in
680 %build = insertelement <2 x i16> undef, i16 %load, i32 0
; Kernel variant of build_vec_v2i16_undeflo_divergent: load an i16 through the
; addrspace(3) pointer %in, insert it into element 0 of an otherwise-undef
; <2 x i16>, bitcast the vector to i32 and store it to %out.
; Expected lowering: the pointer argument is copied from an SGPR into v0, a
; single 16-bit DS load is issued (ds_read_u16 on the default-CPU and gfx906
; runs, ds_read_u16_d16 on gfx900, ds_load_u16_d16 on gfx1100), and the loaded
; register is stored as a dword.
684 define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
685 ; GCN-LABEL: build_vec_v2i16_undeflo_uniform:
686 ; GCN: ; %bb.0: ; %entry
687 ; GCN-NEXT: s_load_dword s2, s[0:1], 0x9
688 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
689 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
690 ; GCN-NEXT: v_mov_b32_e32 v0, s2
691 ; GCN-NEXT: s_mov_b32 m0, -1
692 ; GCN-NEXT: ds_read_u16 v0, v0
693 ; GCN-NEXT: s_mov_b32 s3, 0xf000
694 ; GCN-NEXT: s_mov_b32 s2, -1
695 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
696 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
699 ; GFX9-LABEL: build_vec_v2i16_undeflo_uniform:
700 ; GFX9: ; %bb.0: ; %entry
701 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
702 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
703 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
704 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
705 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
706 ; GFX9-NEXT: ds_read_u16_d16 v0, v0
707 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
708 ; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
709 ; GFX9-NEXT: s_endpgm
711 ; GFX906-LABEL: build_vec_v2i16_undeflo_uniform:
712 ; GFX906: ; %bb.0: ; %entry
713 ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x24
714 ; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
715 ; GFX906-NEXT: v_mov_b32_e32 v1, 0
716 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
717 ; GFX906-NEXT: v_mov_b32_e32 v0, s4
718 ; GFX906-NEXT: ds_read_u16 v0, v0
719 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
720 ; GFX906-NEXT: global_store_dword v1, v0, s[2:3]
721 ; GFX906-NEXT: s_endpgm
723 ; GFX11-LABEL: build_vec_v2i16_undeflo_uniform:
724 ; GFX11: ; %bb.0: ; %entry
725 ; GFX11-NEXT: s_clause 0x1
726 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x24
727 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c
728 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
729 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
730 ; GFX11-NEXT: ds_load_u16_d16 v0, v0
731 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
732 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
733 ; GFX11-NEXT: s_nop 0
734 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
735 ; GFX11-NEXT: s_endpgm
; IR under test: element 0 = loaded value, element 1 left undef.
737 %load = load i16, ptr addrspace(3) %in
738 %build = insertelement <2 x i16> undef, i16 %load, i32 0
739 %result = bitcast <2 x i16> %build to i32
740 store i32 %result, ptr addrspace(1) %out