1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
4 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
6 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
; s_shl_v2i16: both shl operands (%lhs, %rhs) are <2 x i16> kernel arguments;
; the result is stored directly to %out.
; Per the check lines below, GFX9/GFX10/GFX11 are expected to use a single
; v_pk_lshlrev_b16, while VI and CI are expected to shift the low and high
; halves separately with s_lshl_b32 and recombine them with s_and_b32/s_or_b32.
8 define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
9 ; GFX9-LABEL: s_shl_v2i16:
11 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
12 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
13 ; GFX9-NEXT: s_mov_b32 s6, -1
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
16 ; GFX9-NEXT: s_mov_b32 s4, s0
17 ; GFX9-NEXT: s_mov_b32 s5, s1
18 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0
19 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
22 ; VI-LABEL: s_shl_v2i16:
24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
25 ; VI-NEXT: s_mov_b32 s7, 0xf000
26 ; VI-NEXT: s_mov_b32 s6, -1
27 ; VI-NEXT: s_waitcnt lgkmcnt(0)
28 ; VI-NEXT: s_mov_b32 s4, s0
29 ; VI-NEXT: s_mov_b32 s5, s1
30 ; VI-NEXT: s_lshr_b32 s0, s2, 16
31 ; VI-NEXT: s_lshr_b32 s1, s3, 16
32 ; VI-NEXT: s_lshl_b32 s0, s0, s1
33 ; VI-NEXT: s_lshl_b32 s1, s2, s3
34 ; VI-NEXT: s_lshl_b32 s0, s0, 16
35 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
36 ; VI-NEXT: s_or_b32 s0, s1, s0
37 ; VI-NEXT: v_mov_b32_e32 v0, s0
38 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
41 ; CI-LABEL: s_shl_v2i16:
43 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
44 ; CI-NEXT: s_mov_b32 s7, 0xf000
45 ; CI-NEXT: s_mov_b32 s6, -1
46 ; CI-NEXT: s_waitcnt lgkmcnt(0)
47 ; CI-NEXT: s_mov_b32 s4, s0
48 ; CI-NEXT: s_mov_b32 s5, s1
49 ; CI-NEXT: s_lshr_b32 s0, s2, 16
50 ; CI-NEXT: s_lshr_b32 s1, s3, 16
51 ; CI-NEXT: s_lshl_b32 s0, s0, s1
52 ; CI-NEXT: s_lshl_b32 s1, s2, s3
53 ; CI-NEXT: s_lshl_b32 s0, s0, 16
54 ; CI-NEXT: s_and_b32 s1, s1, 0xffff
55 ; CI-NEXT: s_or_b32 s0, s1, s0
56 ; CI-NEXT: v_mov_b32_e32 v0, s0
57 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
60 ; GFX10-LABEL: s_shl_v2i16:
62 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
63 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
64 ; GFX10-NEXT: s_mov_b32 s6, -1
65 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
66 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2
67 ; GFX10-NEXT: s_mov_b32 s4, s0
68 ; GFX10-NEXT: s_mov_b32 s5, s1
69 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
70 ; GFX10-NEXT: s_endpgm
72 ; GFX11-LABEL: s_shl_v2i16:
74 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
75 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
76 ; GFX11-NEXT: s_mov_b32 s6, -1
77 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
78 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, s3, s2
79 ; GFX11-NEXT: s_mov_b32 s4, s0
80 ; GFX11-NEXT: s_mov_b32 s5, s1
81 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
83 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
84 ; GFX11-NEXT: s_endpgm
85 %result = shl <2 x i16> %lhs, %rhs
86 store <2 x i16> %result, ptr addrspace(1) %out
; v_shl_v2i16: both shl operands are <2 x i16> values loaded from %in, indexed
; by the workitem id; the result is stored to the matching slot of %out.
; Per the check lines below, GFX9/GFX10/GFX11 are expected to use one
; v_pk_lshlrev_b16; VI is expected to use v_lshlrev_b16 plus an SDWA form for
; the high half; CI is expected to use 32-bit shifts followed by mask-and-or.
90 define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
91 ; GFX9-LABEL: v_shl_v2i16:
93 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
94 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
95 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
97 ; GFX9-NEXT: s_waitcnt vmcnt(0)
98 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v1, v0
99 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
100 ; GFX9-NEXT: s_endpgm
102 ; VI-LABEL: v_shl_v2i16:
104 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
105 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
106 ; VI-NEXT: s_waitcnt lgkmcnt(0)
107 ; VI-NEXT: v_mov_b32_e32 v1, s3
108 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
109 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
110 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
111 ; VI-NEXT: v_mov_b32_e32 v3, s1
112 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
113 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
114 ; VI-NEXT: s_waitcnt vmcnt(0)
115 ; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0
116 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
117 ; VI-NEXT: v_or_b32_e32 v0, v4, v0
118 ; VI-NEXT: flat_store_dword v[2:3], v0
121 ; CI-LABEL: v_shl_v2i16:
123 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
124 ; CI-NEXT: s_mov_b32 s7, 0xf000
125 ; CI-NEXT: s_mov_b32 s6, 0
126 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
127 ; CI-NEXT: v_mov_b32_e32 v1, 0
128 ; CI-NEXT: s_waitcnt lgkmcnt(0)
129 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
130 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
131 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
132 ; CI-NEXT: s_waitcnt vmcnt(0)
133 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
134 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
135 ; CI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
136 ; CI-NEXT: v_lshlrev_b32_e32 v3, v5, v4
137 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
138 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
139 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
140 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
143 ; GFX10-LABEL: v_shl_v2i16:
145 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
146 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
147 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
148 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
149 ; GFX10-NEXT: s_waitcnt vmcnt(0)
150 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v1, v0
151 ; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
152 ; GFX10-NEXT: s_endpgm
154 ; GFX11-LABEL: v_shl_v2i16:
156 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
157 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
158 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
159 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
160 ; GFX11-NEXT: s_waitcnt vmcnt(0)
161 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v1, v0
162 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
163 ; GFX11-NEXT: s_nop 0
164 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
165 ; GFX11-NEXT: s_endpgm
166 %tid = call i32 @llvm.amdgcn.workitem.id.x()
167 %tid.ext = sext i32 %tid to i64
168 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
169 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
; %b_ptr addresses the <2 x i16> element immediately after %in.gep, so %a and
; %b are adjacent in memory.
170 %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in.gep, i32 1
171 %a = load <2 x i16>, ptr addrspace(1) %in.gep
172 %b = load <2 x i16>, ptr addrspace(1) %b_ptr
173 %result = shl <2 x i16> %a, %b
174 store <2 x i16> %result, ptr addrspace(1) %out.gep
; shl_v_s_v2i16: the shifted value (%vgpr) is loaded from %in at the workitem
; id; the shift amount (%sgpr) is a <2 x i16> kernel argument.
; Per the check lines below, GFX9/GFX10/GFX11 are expected to use
; v_pk_lshlrev_b16 with the SGPR as the first source operand; VI is expected
; to use v_lshlrev_b16 plus an SDWA form; CI is expected to use 32-bit shifts
; followed by mask-and-or.
178 define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
179 ; GFX9-LABEL: shl_v_s_v2i16:
181 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
182 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
183 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
184 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
185 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
186 ; GFX9-NEXT: s_waitcnt vmcnt(0)
187 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1
188 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
189 ; GFX9-NEXT: s_endpgm
191 ; VI-LABEL: shl_v_s_v2i16:
193 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
194 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
195 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
196 ; VI-NEXT: s_waitcnt lgkmcnt(0)
197 ; VI-NEXT: v_mov_b32_e32 v1, s7
198 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
199 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
200 ; VI-NEXT: flat_load_dword v3, v[0:1]
201 ; VI-NEXT: s_lshr_b32 s1, s0, 16
202 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
203 ; VI-NEXT: v_mov_b32_e32 v2, s1
204 ; VI-NEXT: v_mov_b32_e32 v1, s5
205 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
206 ; VI-NEXT: s_waitcnt vmcnt(0)
207 ; VI-NEXT: v_lshlrev_b16_e32 v4, s0, v3
208 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
209 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
210 ; VI-NEXT: flat_store_dword v[0:1], v2
213 ; CI-LABEL: shl_v_s_v2i16:
215 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
216 ; CI-NEXT: s_load_dword s8, s[0:1], 0xd
217 ; CI-NEXT: s_mov_b32 s3, 0xf000
218 ; CI-NEXT: s_mov_b32 s2, 0
219 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
220 ; CI-NEXT: s_waitcnt lgkmcnt(0)
221 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
222 ; CI-NEXT: v_mov_b32_e32 v1, 0
223 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
224 ; CI-NEXT: s_lshr_b32 s0, s8, 16
225 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
226 ; CI-NEXT: s_waitcnt vmcnt(0)
227 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
228 ; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2
229 ; CI-NEXT: v_lshlrev_b32_e32 v3, s0, v3
230 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
231 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
232 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
233 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
236 ; GFX10-LABEL: shl_v_s_v2i16:
238 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
239 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
240 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
241 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
242 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
243 ; GFX10-NEXT: s_waitcnt vmcnt(0)
244 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, s0, v1
245 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
246 ; GFX10-NEXT: s_endpgm
248 ; GFX11-LABEL: shl_v_s_v2i16:
250 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
251 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
252 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
253 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
254 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
255 ; GFX11-NEXT: s_waitcnt vmcnt(0)
256 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, s0, v1
257 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
258 ; GFX11-NEXT: s_nop 0
259 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
260 ; GFX11-NEXT: s_endpgm
261 %tid = call i32 @llvm.amdgcn.workitem.id.x()
262 %tid.ext = sext i32 %tid to i64
263 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
264 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
265 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
; Loaded value is the shifted operand; the kernel argument is the shift amount.
266 %result = shl <2 x i16> %vgpr, %sgpr
267 store <2 x i16> %result, ptr addrspace(1) %out.gep
; shl_s_v_v2i16: the shifted value (%sgpr) is a <2 x i16> kernel argument; the
; shift amount (%vgpr) is loaded from %in at the workitem id.
; Per the check lines below, GFX9/GFX10/GFX11 are expected to use
; v_pk_lshlrev_b16 with the SGPR as the second source operand; VI is expected
; to use the e64 form of v_lshlrev_b16 plus an SDWA form; CI is expected to use
; v_lshl_b32 (non-reversed operand order) followed by mask-and-or.
271 define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
272 ; GFX9-LABEL: shl_s_v_v2i16:
274 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
275 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
276 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
277 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
278 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
279 ; GFX9-NEXT: s_waitcnt vmcnt(0)
280 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2
281 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
282 ; GFX9-NEXT: s_endpgm
284 ; VI-LABEL: shl_s_v_v2i16:
286 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
287 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
288 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
289 ; VI-NEXT: s_waitcnt lgkmcnt(0)
290 ; VI-NEXT: v_mov_b32_e32 v1, s7
291 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
292 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
293 ; VI-NEXT: flat_load_dword v3, v[0:1]
294 ; VI-NEXT: s_lshr_b32 s1, s0, 16
295 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
296 ; VI-NEXT: v_mov_b32_e32 v2, s1
297 ; VI-NEXT: v_mov_b32_e32 v1, s5
298 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
299 ; VI-NEXT: s_waitcnt vmcnt(0)
300 ; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s0
301 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
302 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
303 ; VI-NEXT: flat_store_dword v[0:1], v2
306 ; CI-LABEL: shl_s_v_v2i16:
308 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
309 ; CI-NEXT: s_load_dword s8, s[0:1], 0xd
310 ; CI-NEXT: s_mov_b32 s3, 0xf000
311 ; CI-NEXT: s_mov_b32 s2, 0
312 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
313 ; CI-NEXT: s_waitcnt lgkmcnt(0)
314 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
315 ; CI-NEXT: v_mov_b32_e32 v1, 0
316 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
317 ; CI-NEXT: s_lshr_b32 s0, s8, 16
318 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
319 ; CI-NEXT: s_waitcnt vmcnt(0)
320 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
321 ; CI-NEXT: v_lshl_b32_e32 v2, s8, v2
322 ; CI-NEXT: v_lshl_b32_e32 v3, s0, v3
323 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
324 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
325 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
326 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
329 ; GFX10-LABEL: shl_s_v_v2i16:
331 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
332 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
333 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
334 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
335 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
336 ; GFX10-NEXT: s_waitcnt vmcnt(0)
337 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0
338 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
339 ; GFX10-NEXT: s_endpgm
341 ; GFX11-LABEL: shl_s_v_v2i16:
343 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
344 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
345 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
346 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
348 ; GFX11-NEXT: s_waitcnt vmcnt(0)
349 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, s0
350 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
351 ; GFX11-NEXT: s_nop 0
352 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
353 ; GFX11-NEXT: s_endpgm
354 %tid = call i32 @llvm.amdgcn.workitem.id.x()
355 %tid.ext = sext i32 %tid to i64
356 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
357 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
358 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
; Kernel argument is the shifted operand; the loaded value is the shift amount.
359 %result = shl <2 x i16> %sgpr, %vgpr
360 store <2 x i16> %result, ptr addrspace(1) %out.gep
; shl_imm_v_v2i16: the shifted value is the constant vector <8, 8>; the shift
; amount (%vgpr) is loaded from %in at the workitem id.
; Per the check lines below, GFX9/GFX10/GFX11 are expected to use
; v_pk_lshlrev_b16 with an inline constant 8 and op_sel_hi:[1,0]; VI is
; expected to use v_lshlrev_b16 (e64 and SDWA forms); CI is expected to use
; v_lshl_b32 with a 0xfff8 mask on the low half.
364 define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
365 ; GFX9-LABEL: shl_imm_v_v2i16:
367 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
368 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
369 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
370 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
371 ; GFX9-NEXT: s_waitcnt vmcnt(0)
372 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
373 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
374 ; GFX9-NEXT: s_endpgm
376 ; VI-LABEL: shl_imm_v_v2i16:
378 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
379 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
380 ; VI-NEXT: v_mov_b32_e32 v4, 8
381 ; VI-NEXT: s_waitcnt lgkmcnt(0)
382 ; VI-NEXT: v_mov_b32_e32 v1, s3
383 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
384 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
385 ; VI-NEXT: flat_load_dword v3, v[0:1]
386 ; VI-NEXT: v_mov_b32_e32 v1, s1
387 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
388 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
389 ; VI-NEXT: s_waitcnt vmcnt(0)
390 ; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8
391 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
392 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
393 ; VI-NEXT: flat_store_dword v[0:1], v2
396 ; CI-LABEL: shl_imm_v_v2i16:
398 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
399 ; CI-NEXT: s_mov_b32 s7, 0xf000
400 ; CI-NEXT: s_mov_b32 s6, 0
401 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
402 ; CI-NEXT: v_mov_b32_e32 v1, 0
403 ; CI-NEXT: s_waitcnt lgkmcnt(0)
404 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
405 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
406 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
407 ; CI-NEXT: s_waitcnt vmcnt(0)
408 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
409 ; CI-NEXT: v_lshl_b32_e32 v2, 8, v2
410 ; CI-NEXT: v_lshl_b32_e32 v3, 8, v3
411 ; CI-NEXT: v_and_b32_e32 v2, 0xfff8, v2
412 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
413 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
414 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
417 ; GFX10-LABEL: shl_imm_v_v2i16:
419 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
420 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
421 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
422 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
423 ; GFX10-NEXT: s_waitcnt vmcnt(0)
424 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
425 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
426 ; GFX10-NEXT: s_endpgm
428 ; GFX11-LABEL: shl_imm_v_v2i16:
430 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
431 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
432 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
433 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
434 ; GFX11-NEXT: s_waitcnt vmcnt(0)
435 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
436 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
437 ; GFX11-NEXT: s_nop 0
438 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
439 ; GFX11-NEXT: s_endpgm
440 %tid = call i32 @llvm.amdgcn.workitem.id.x()
441 %tid.ext = sext i32 %tid to i64
442 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
443 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
444 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
; Constant is the shifted operand; the loaded value is the shift amount.
445 %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
446 store <2 x i16> %result, ptr addrspace(1) %out.gep
; shl_v_imm_v2i16: the shifted value (%vgpr) is loaded from %in at the workitem
; id; the shift amount is the constant vector <8, 8>.
; Per the check lines below, GFX9/GFX10/GFX11 are expected to use
; v_pk_lshlrev_b16 with an inline constant 8 and op_sel_hi:[0,1]; VI is
; expected to combine a 32-bit shift masked with 0xff000000 and a 16-bit shift;
; CI is expected to use a single 32-bit shift masked with 0xff00ff00.
450 define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
451 ; GFX9-LABEL: shl_v_imm_v2i16:
453 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
454 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
455 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
456 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
457 ; GFX9-NEXT: s_waitcnt vmcnt(0)
458 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
459 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
460 ; GFX9-NEXT: s_endpgm
462 ; VI-LABEL: shl_v_imm_v2i16:
464 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
465 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
466 ; VI-NEXT: s_waitcnt lgkmcnt(0)
467 ; VI-NEXT: v_mov_b32_e32 v1, s3
468 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
469 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
470 ; VI-NEXT: flat_load_dword v3, v[0:1]
471 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
472 ; VI-NEXT: v_mov_b32_e32 v1, s1
473 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
474 ; VI-NEXT: s_waitcnt vmcnt(0)
475 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
476 ; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2
477 ; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
478 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
479 ; VI-NEXT: flat_store_dword v[0:1], v2
482 ; CI-LABEL: shl_v_imm_v2i16:
484 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
485 ; CI-NEXT: s_mov_b32 s7, 0xf000
486 ; CI-NEXT: s_mov_b32 s6, 0
487 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
488 ; CI-NEXT: v_mov_b32_e32 v1, 0
489 ; CI-NEXT: s_waitcnt lgkmcnt(0)
490 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
491 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
492 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
493 ; CI-NEXT: s_waitcnt vmcnt(0)
494 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
495 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
496 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
499 ; GFX10-LABEL: shl_v_imm_v2i16:
501 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
502 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
503 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
504 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
505 ; GFX10-NEXT: s_waitcnt vmcnt(0)
506 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
507 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
508 ; GFX10-NEXT: s_endpgm
510 ; GFX11-LABEL: shl_v_imm_v2i16:
512 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
513 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
514 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
515 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
516 ; GFX11-NEXT: s_waitcnt vmcnt(0)
517 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
518 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
519 ; GFX11-NEXT: s_nop 0
520 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
521 ; GFX11-NEXT: s_endpgm
522 %tid = call i32 @llvm.amdgcn.workitem.id.x()
523 %tid.ext = sext i32 %tid to i64
524 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
525 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
526 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
; Loaded value is the shifted operand; the constant is the shift amount.
527 %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
528 store <2 x i16> %result, ptr addrspace(1) %out.gep
; v_shl_v4i16: <4 x i16> variant with both shl operands loaded from %in,
; indexed by the workitem id; the result is stored to the matching slot of
; %out.
; Per the check lines below, GFX9/GFX10/GFX11 are expected to use two
; v_pk_lshlrev_b16 instructions (one per dword); VI is expected to use a
; v_lshlrev_b16 / SDWA pair per dword; CI is expected to use 32-bit shifts
; followed by mask-and-or for each dword.
532 define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
533 ; GFX9-LABEL: v_shl_v4i16:
535 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
536 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
537 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
538 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
539 ; GFX9-NEXT: s_waitcnt vmcnt(0)
540 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1
541 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0
542 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
543 ; GFX9-NEXT: s_endpgm
545 ; VI-LABEL: v_shl_v4i16:
547 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
548 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
549 ; VI-NEXT: s_waitcnt lgkmcnt(0)
550 ; VI-NEXT: v_mov_b32_e32 v1, s3
551 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
552 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
553 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
554 ; VI-NEXT: v_mov_b32_e32 v5, s1
555 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
556 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
557 ; VI-NEXT: s_waitcnt vmcnt(0)
558 ; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
559 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
560 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0
561 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
562 ; VI-NEXT: v_or_b32_e32 v1, v6, v1
563 ; VI-NEXT: v_or_b32_e32 v0, v3, v0
564 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
567 ; CI-LABEL: v_shl_v4i16:
569 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
570 ; CI-NEXT: s_mov_b32 s7, 0xf000
571 ; CI-NEXT: s_mov_b32 s6, 0
572 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
573 ; CI-NEXT: v_mov_b32_e32 v5, 0
574 ; CI-NEXT: s_waitcnt lgkmcnt(0)
575 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
576 ; CI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
577 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
578 ; CI-NEXT: s_waitcnt vmcnt(0)
579 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
580 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
581 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
582 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3
583 ; CI-NEXT: v_lshlrev_b32_e32 v1, v3, v1
584 ; CI-NEXT: v_lshlrev_b32_e32 v0, v2, v0
585 ; CI-NEXT: v_lshlrev_b32_e32 v2, v9, v7
586 ; CI-NEXT: v_lshlrev_b32_e32 v3, v8, v6
587 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
588 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
589 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
590 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
591 ; CI-NEXT: v_or_b32_e32 v1, v1, v2
592 ; CI-NEXT: v_or_b32_e32 v0, v0, v3
593 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
596 ; GFX10-LABEL: v_shl_v4i16:
598 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
599 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
600 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
601 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
602 ; GFX10-NEXT: s_waitcnt vmcnt(0)
603 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1
604 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0
605 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
606 ; GFX10-NEXT: s_endpgm
608 ; GFX11-LABEL: v_shl_v4i16:
610 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
611 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
612 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
613 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
614 ; GFX11-NEXT: s_waitcnt vmcnt(0)
615 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v3, v1
616 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v2, v0
617 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
618 ; GFX11-NEXT: s_nop 0
619 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
620 ; GFX11-NEXT: s_endpgm
621 %tid = call i32 @llvm.amdgcn.workitem.id.x()
622 %tid.ext = sext i32 %tid to i64
623 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
624 %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
; %b_ptr addresses the <4 x i16> element immediately after %in.gep, so %a and
; %b are adjacent in memory.
625 %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in.gep, i32 1
626 %a = load <4 x i16>, ptr addrspace(1) %in.gep
627 %b = load <4 x i16>, ptr addrspace(1) %b_ptr
628 %result = shl <4 x i16> %a, %b
629 store <4 x i16> %result, ptr addrspace(1) %out.gep
; shl_v_imm_v4i16: <4 x i16> variant where the shifted value (%vgpr) is loaded
; from %in at the workitem id and the shift amount is the constant vector
; <8, 8, 8, 8>.
; Per the check lines below, GFX9/GFX10/GFX11 are expected to use two
; v_pk_lshlrev_b16 instructions with an inline constant 8 and op_sel_hi:[0,1];
; VI and CI are expected to use shift-and-mask sequences (masks 0xff000000 on
; VI; 0xff00 and 0xff00ff00 on CI).
633 define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
634 ; GFX9-LABEL: shl_v_imm_v4i16:
636 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
637 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
638 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
639 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
640 ; GFX9-NEXT: s_waitcnt vmcnt(0)
641 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
642 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
643 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
644 ; GFX9-NEXT: s_endpgm
646 ; VI-LABEL: shl_v_imm_v4i16:
648 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
649 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
650 ; VI-NEXT: s_waitcnt lgkmcnt(0)
651 ; VI-NEXT: v_mov_b32_e32 v1, s3
652 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
653 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
654 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
655 ; VI-NEXT: v_mov_b32_e32 v3, s1
656 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
657 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
658 ; VI-NEXT: s_waitcnt vmcnt(0)
659 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
660 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0
661 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
662 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
663 ; VI-NEXT: v_and_b32_e32 v4, 0xff000000, v4
664 ; VI-NEXT: v_and_b32_e32 v0, 0xff000000, v0
665 ; VI-NEXT: v_or_b32_e32 v1, v1, v4
666 ; VI-NEXT: v_or_b32_e32 v0, v5, v0
667 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
670 ; CI-LABEL: shl_v_imm_v4i16:
672 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
673 ; CI-NEXT: s_mov_b32 s7, 0xf000
674 ; CI-NEXT: s_mov_b32 s6, 0
675 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
676 ; CI-NEXT: v_mov_b32_e32 v1, 0
677 ; CI-NEXT: s_waitcnt lgkmcnt(0)
678 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
679 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
680 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
681 ; CI-NEXT: s_waitcnt vmcnt(0)
682 ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v3
683 ; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
684 ; CI-NEXT: v_and_b32_e32 v3, 0xff00, v3
685 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
686 ; CI-NEXT: v_and_b32_e32 v4, 0xff00, v4
687 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
688 ; CI-NEXT: v_or_b32_e32 v3, v4, v3
689 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
690 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
693 ; GFX10-LABEL: shl_v_imm_v4i16:
695 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
696 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
697 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
698 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
699 ; GFX10-NEXT: s_waitcnt vmcnt(0)
700 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
701 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
702 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
703 ; GFX10-NEXT: s_endpgm
705 ; GFX11-LABEL: shl_v_imm_v4i16:
707 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
708 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
709 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
710 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
711 ; GFX11-NEXT: s_waitcnt vmcnt(0)
712 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
713 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
714 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
715 ; GFX11-NEXT: s_nop 0
716 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
717 ; GFX11-NEXT: s_endpgm
718 %tid = call i32 @llvm.amdgcn.workitem.id.x()
719 %tid.ext = sext i32 %tid to i64
720 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
721 %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
722 %vgpr = load <4 x i16>, ptr addrspace(1) %in.gep
; Loaded value is the shifted operand; the constant is the shift amount.
723 %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
724 store <4 x i16> %result, ptr addrspace(1) %out.gep
; Intrinsic called by the kernels in this file that load operands from memory;
; its result is sign-extended and used to index %in and %out.
728 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is referenced by the kernel definitions, #1 by the
; intrinsic declaration.
730 attributes #0 = { nounwind }
731 attributes #1 = { nounwind readnone }