1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
; s_shl_v2i16: both shift operands are <2 x i16> kernel arguments.
; The result of the lane-wise shl is stored to %out.
; Expected output for gfx900, gfx1010 and gfx1100 uses one v_pk_lshlrev_b16;
; tonga and bonaire shift the two halves separately with 32-bit scalar
; shifts and recombine them with and/or.
8 define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
9 ; GFX9-LABEL: s_shl_v2i16:
11 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
12 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
13 ; GFX9-NEXT: s_mov_b32 s6, -1
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
16 ; GFX9-NEXT: s_mov_b32 s4, s0
17 ; GFX9-NEXT: s_mov_b32 s5, s1
18 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0
19 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
22 ; VI-LABEL: s_shl_v2i16:
24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
25 ; VI-NEXT: s_mov_b32 s7, 0xf000
26 ; VI-NEXT: s_mov_b32 s6, -1
27 ; VI-NEXT: s_waitcnt lgkmcnt(0)
28 ; VI-NEXT: s_mov_b32 s4, s0
29 ; VI-NEXT: s_mov_b32 s5, s1
30 ; VI-NEXT: s_lshr_b32 s0, s2, 16
31 ; VI-NEXT: s_lshr_b32 s1, s3, 16
32 ; VI-NEXT: s_lshl_b32 s0, s0, s1
33 ; VI-NEXT: s_lshl_b32 s1, s2, s3
34 ; VI-NEXT: s_lshl_b32 s0, s0, 16
35 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
36 ; VI-NEXT: s_or_b32 s0, s1, s0
37 ; VI-NEXT: v_mov_b32_e32 v0, s0
38 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
41 ; CI-LABEL: s_shl_v2i16:
43 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
44 ; CI-NEXT: s_mov_b32 s7, 0xf000
45 ; CI-NEXT: s_mov_b32 s6, -1
46 ; CI-NEXT: s_waitcnt lgkmcnt(0)
47 ; CI-NEXT: s_mov_b32 s4, s0
48 ; CI-NEXT: s_mov_b32 s5, s1
49 ; CI-NEXT: s_lshr_b32 s0, s2, 16
50 ; CI-NEXT: s_lshr_b32 s1, s3, 16
51 ; CI-NEXT: s_lshl_b32 s0, s0, s1
52 ; CI-NEXT: s_lshl_b32 s1, s2, s3
53 ; CI-NEXT: s_lshl_b32 s0, s0, 16
54 ; CI-NEXT: s_and_b32 s1, s1, 0xffff
55 ; CI-NEXT: s_or_b32 s0, s1, s0
56 ; CI-NEXT: v_mov_b32_e32 v0, s0
57 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
60 ; GFX10-LABEL: s_shl_v2i16:
62 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
63 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
64 ; GFX10-NEXT: s_mov_b32 s6, -1
65 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
66 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2
67 ; GFX10-NEXT: s_mov_b32 s4, s0
68 ; GFX10-NEXT: s_mov_b32 s5, s1
69 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
70 ; GFX10-NEXT: s_endpgm
72 ; GFX11-LABEL: s_shl_v2i16:
74 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
75 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
76 ; GFX11-NEXT: s_mov_b32 s6, -1
77 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
78 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, s3, s2
79 ; GFX11-NEXT: s_mov_b32 s4, s0
80 ; GFX11-NEXT: s_mov_b32 s5, s1
81 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
82 ; GFX11-NEXT: s_endpgm
; IR under test: %lhs shifted left by %rhs, result written to %out.
83 %result = shl <2 x i16> %lhs, %rhs
84 store <2 x i16> %result, ptr addrspace(1) %out
; v_shl_v2i16: both shift operands are loaded from memory.
; Each work-item reads two adjacent <2 x i16> elements of %in (index tid and
; the element after it), shifts the first left by the second, and stores the
; result to element tid of %out.
; The two loads are expected to be combined into a single 64-bit load on
; every tested target.
88 define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
89 ; GFX9-LABEL: v_shl_v2i16:
91 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
92 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
93 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
94 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
95 ; GFX9-NEXT: s_waitcnt vmcnt(0)
96 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v1, v0
97 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
100 ; VI-LABEL: v_shl_v2i16:
102 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
103 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
104 ; VI-NEXT: s_waitcnt lgkmcnt(0)
105 ; VI-NEXT: v_mov_b32_e32 v1, s3
106 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
107 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
108 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
109 ; VI-NEXT: v_mov_b32_e32 v3, s1
110 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
111 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
112 ; VI-NEXT: s_waitcnt vmcnt(0)
113 ; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0
114 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
115 ; VI-NEXT: v_or_b32_e32 v0, v4, v0
116 ; VI-NEXT: flat_store_dword v[2:3], v0
119 ; CI-LABEL: v_shl_v2i16:
121 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
122 ; CI-NEXT: s_mov_b32 s7, 0xf000
123 ; CI-NEXT: s_mov_b32 s6, 0
124 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
125 ; CI-NEXT: v_mov_b32_e32 v1, 0
126 ; CI-NEXT: s_waitcnt lgkmcnt(0)
127 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
128 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
129 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
130 ; CI-NEXT: s_waitcnt vmcnt(0)
131 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
132 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
133 ; CI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
134 ; CI-NEXT: v_lshlrev_b32_e32 v3, v5, v4
135 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
136 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
137 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
138 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
141 ; GFX10-LABEL: v_shl_v2i16:
143 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
144 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
145 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
146 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
147 ; GFX10-NEXT: s_waitcnt vmcnt(0)
148 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v1, v0
149 ; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
150 ; GFX10-NEXT: s_endpgm
152 ; GFX11-LABEL: v_shl_v2i16:
154 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
155 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
156 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
157 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
158 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
159 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
160 ; GFX11-NEXT: s_waitcnt vmcnt(0)
161 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v1, v0
162 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
163 ; GFX11-NEXT: s_endpgm
; IR under test: %a is element tid of %in, %b is the following element;
; the shifted value goes to element tid of %out.
164 %tid = call i32 @llvm.amdgcn.workitem.id.x()
165 %tid.ext = sext i32 %tid to i64
166 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
167 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
168 %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in.gep, i32 1
169 %a = load <2 x i16>, ptr addrspace(1) %in.gep
170 %b = load <2 x i16>, ptr addrspace(1) %b_ptr
171 %result = shl <2 x i16> %a, %b
172 store <2 x i16> %result, ptr addrspace(1) %out.gep
; shl_v_s_v2i16: the shifted value is loaded from memory, the shift amount
; is the <2 x i16> kernel argument %sgpr.
; Each work-item reads element tid of %in, shifts it left by %sgpr, and
; stores the result to element tid of %out.
; In the expected packed instruction the kernel-argument register is the
; first source operand and the loaded value is the second.
176 define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
177 ; GFX9-LABEL: shl_v_s_v2i16:
179 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
180 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34
181 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
182 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
184 ; GFX9-NEXT: s_waitcnt vmcnt(0)
185 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, s6, v1
186 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
187 ; GFX9-NEXT: s_endpgm
189 ; VI-LABEL: shl_v_s_v2i16:
191 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
192 ; VI-NEXT: s_load_dword s4, s[4:5], 0x34
193 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
194 ; VI-NEXT: s_waitcnt lgkmcnt(0)
195 ; VI-NEXT: v_mov_b32_e32 v1, s3
196 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
197 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
198 ; VI-NEXT: flat_load_dword v3, v[0:1]
199 ; VI-NEXT: v_mov_b32_e32 v1, s1
200 ; VI-NEXT: s_lshr_b32 s1, s4, 16
201 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
202 ; VI-NEXT: v_mov_b32_e32 v2, s1
203 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
204 ; VI-NEXT: s_waitcnt vmcnt(0)
205 ; VI-NEXT: v_lshlrev_b16_e32 v4, s4, v3
206 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
207 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
208 ; VI-NEXT: flat_store_dword v[0:1], v2
211 ; CI-LABEL: shl_v_s_v2i16:
213 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
214 ; CI-NEXT: s_load_dword s8, s[4:5], 0xd
215 ; CI-NEXT: s_mov_b32 s7, 0xf000
216 ; CI-NEXT: s_mov_b32 s6, 0
217 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
218 ; CI-NEXT: s_waitcnt lgkmcnt(0)
219 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
220 ; CI-NEXT: v_mov_b32_e32 v1, 0
221 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
222 ; CI-NEXT: s_lshr_b32 s4, s8, 16
223 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
224 ; CI-NEXT: s_waitcnt vmcnt(0)
225 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
226 ; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2
227 ; CI-NEXT: v_lshlrev_b32_e32 v3, s4, v3
228 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
229 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
230 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
231 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
234 ; GFX10-LABEL: shl_v_s_v2i16:
236 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
237 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
238 ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34
239 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
240 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
241 ; GFX10-NEXT: s_waitcnt vmcnt(0)
242 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, s4, v1
243 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
244 ; GFX10-NEXT: s_endpgm
246 ; GFX11-LABEL: shl_v_s_v2i16:
248 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
249 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
250 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
251 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
252 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
253 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
254 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
255 ; GFX11-NEXT: s_waitcnt vmcnt(0)
256 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, s4, v1
257 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
258 ; GFX11-NEXT: s_endpgm
; IR under test: loaded %vgpr is the value, argument %sgpr is the amount.
259 %tid = call i32 @llvm.amdgcn.workitem.id.x()
260 %tid.ext = sext i32 %tid to i64
261 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
262 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
263 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
264 %result = shl <2 x i16> %vgpr, %sgpr
265 store <2 x i16> %result, ptr addrspace(1) %out.gep
; shl_s_v_v2i16: mirror of shl_v_s_v2i16 with the operands swapped.
; The shifted value is the <2 x i16> kernel argument %sgpr and the shift
; amount is element tid of %in; the result is stored to element tid of %out.
; In the expected packed instruction the loaded value is the first source
; operand and the kernel-argument register is the second.
269 define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
270 ; GFX9-LABEL: shl_s_v_v2i16:
272 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
273 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34
274 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
275 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
276 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
277 ; GFX9-NEXT: s_waitcnt vmcnt(0)
278 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s6
279 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
280 ; GFX9-NEXT: s_endpgm
282 ; VI-LABEL: shl_s_v_v2i16:
284 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
285 ; VI-NEXT: s_load_dword s4, s[4:5], 0x34
286 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
287 ; VI-NEXT: s_waitcnt lgkmcnt(0)
288 ; VI-NEXT: v_mov_b32_e32 v1, s3
289 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
290 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
291 ; VI-NEXT: flat_load_dword v3, v[0:1]
292 ; VI-NEXT: v_mov_b32_e32 v1, s1
293 ; VI-NEXT: s_lshr_b32 s1, s4, 16
294 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
295 ; VI-NEXT: v_mov_b32_e32 v2, s1
296 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
297 ; VI-NEXT: s_waitcnt vmcnt(0)
298 ; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s4
299 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
300 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
301 ; VI-NEXT: flat_store_dword v[0:1], v2
304 ; CI-LABEL: shl_s_v_v2i16:
306 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
307 ; CI-NEXT: s_load_dword s8, s[4:5], 0xd
308 ; CI-NEXT: s_mov_b32 s7, 0xf000
309 ; CI-NEXT: s_mov_b32 s6, 0
310 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
311 ; CI-NEXT: s_waitcnt lgkmcnt(0)
312 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
313 ; CI-NEXT: v_mov_b32_e32 v1, 0
314 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
315 ; CI-NEXT: s_lshr_b32 s4, s8, 16
316 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
317 ; CI-NEXT: s_waitcnt vmcnt(0)
318 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
319 ; CI-NEXT: v_lshl_b32_e32 v2, s8, v2
320 ; CI-NEXT: v_lshl_b32_e32 v3, s4, v3
321 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
322 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
323 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
324 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
327 ; GFX10-LABEL: shl_s_v_v2i16:
329 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
330 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
331 ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34
332 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
333 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
334 ; GFX10-NEXT: s_waitcnt vmcnt(0)
335 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s4
336 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
337 ; GFX10-NEXT: s_endpgm
339 ; GFX11-LABEL: shl_s_v_v2i16:
341 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
342 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
343 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
344 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
345 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
346 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
348 ; GFX11-NEXT: s_waitcnt vmcnt(0)
349 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, s4
350 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
351 ; GFX11-NEXT: s_endpgm
; IR under test: argument %sgpr is the value, loaded %vgpr is the amount.
352 %tid = call i32 @llvm.amdgcn.workitem.id.x()
353 %tid.ext = sext i32 %tid to i64
354 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
355 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
356 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
357 %result = shl <2 x i16> %sgpr, %vgpr
358 store <2 x i16> %result, ptr addrspace(1) %out.gep
; shl_imm_v_v2i16: the shifted value is the constant splat <8, 8> and the
; shift amount is element tid of %in; the result is stored to element tid
; of %out.
; On the packed-shift targets the constant is expected as an inline
; immediate with op_sel_hi:[1,0]. The bonaire checks mask the low half
; with 0xfff8 instead of 0xffff.
362 define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
363 ; GFX9-LABEL: shl_imm_v_v2i16:
365 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
366 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
367 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
368 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
369 ; GFX9-NEXT: s_waitcnt vmcnt(0)
370 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
371 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
372 ; GFX9-NEXT: s_endpgm
374 ; VI-LABEL: shl_imm_v_v2i16:
376 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
377 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
378 ; VI-NEXT: v_mov_b32_e32 v4, 8
379 ; VI-NEXT: s_waitcnt lgkmcnt(0)
380 ; VI-NEXT: v_mov_b32_e32 v1, s3
381 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
382 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
383 ; VI-NEXT: flat_load_dword v3, v[0:1]
384 ; VI-NEXT: v_mov_b32_e32 v1, s1
385 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
386 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
387 ; VI-NEXT: s_waitcnt vmcnt(0)
388 ; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8
389 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
390 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
391 ; VI-NEXT: flat_store_dword v[0:1], v2
394 ; CI-LABEL: shl_imm_v_v2i16:
396 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
397 ; CI-NEXT: s_mov_b32 s7, 0xf000
398 ; CI-NEXT: s_mov_b32 s6, 0
399 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
400 ; CI-NEXT: v_mov_b32_e32 v1, 0
401 ; CI-NEXT: s_waitcnt lgkmcnt(0)
402 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
403 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
404 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
405 ; CI-NEXT: s_waitcnt vmcnt(0)
406 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
407 ; CI-NEXT: v_lshl_b32_e32 v2, 8, v2
408 ; CI-NEXT: v_lshl_b32_e32 v3, 8, v3
409 ; CI-NEXT: v_and_b32_e32 v2, 0xfff8, v2
410 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
411 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
412 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
415 ; GFX10-LABEL: shl_imm_v_v2i16:
417 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
418 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
419 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
420 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
421 ; GFX10-NEXT: s_waitcnt vmcnt(0)
422 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
423 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
424 ; GFX10-NEXT: s_endpgm
426 ; GFX11-LABEL: shl_imm_v_v2i16:
428 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
429 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
430 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
431 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
432 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
433 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
434 ; GFX11-NEXT: s_waitcnt vmcnt(0)
435 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
436 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
437 ; GFX11-NEXT: s_endpgm
; IR under test: constant <8, 8> is the value, loaded %vgpr is the amount.
438 %tid = call i32 @llvm.amdgcn.workitem.id.x()
439 %tid.ext = sext i32 %tid to i64
440 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
441 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
442 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
443 %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
444 store <2 x i16> %result, ptr addrspace(1) %out.gep
; shl_v_imm_v2i16: element tid of %in is shifted left by the constant
; splat <8, 8> and stored to element tid of %out.
; On the packed-shift targets the constant is expected as an inline
; immediate with op_sel_hi:[0,1]. The bonaire checks expect a single
; 32-bit shift by 8 followed by a 0xff00ff00 mask.
448 define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
449 ; GFX9-LABEL: shl_v_imm_v2i16:
451 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
452 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
453 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
454 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
455 ; GFX9-NEXT: s_waitcnt vmcnt(0)
456 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
457 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
458 ; GFX9-NEXT: s_endpgm
460 ; VI-LABEL: shl_v_imm_v2i16:
462 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
463 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
464 ; VI-NEXT: s_waitcnt lgkmcnt(0)
465 ; VI-NEXT: v_mov_b32_e32 v1, s3
466 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
467 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
468 ; VI-NEXT: flat_load_dword v3, v[0:1]
469 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
470 ; VI-NEXT: v_mov_b32_e32 v1, s1
471 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
472 ; VI-NEXT: s_waitcnt vmcnt(0)
473 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
474 ; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2
475 ; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
476 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
477 ; VI-NEXT: flat_store_dword v[0:1], v2
480 ; CI-LABEL: shl_v_imm_v2i16:
482 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
483 ; CI-NEXT: s_mov_b32 s7, 0xf000
484 ; CI-NEXT: s_mov_b32 s6, 0
485 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
486 ; CI-NEXT: v_mov_b32_e32 v1, 0
487 ; CI-NEXT: s_waitcnt lgkmcnt(0)
488 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
489 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
490 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
491 ; CI-NEXT: s_waitcnt vmcnt(0)
492 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
493 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
494 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
497 ; GFX10-LABEL: shl_v_imm_v2i16:
499 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
500 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
501 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
502 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
503 ; GFX10-NEXT: s_waitcnt vmcnt(0)
504 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
505 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
506 ; GFX10-NEXT: s_endpgm
508 ; GFX11-LABEL: shl_v_imm_v2i16:
510 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
511 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
512 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
513 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
514 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
515 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
516 ; GFX11-NEXT: s_waitcnt vmcnt(0)
517 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
518 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
519 ; GFX11-NEXT: s_endpgm
; IR under test: loaded %vgpr is the value, constant <8, 8> is the amount.
520 %tid = call i32 @llvm.amdgcn.workitem.id.x()
521 %tid.ext = sext i32 %tid to i64
522 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
523 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
524 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
525 %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
526 store <2 x i16> %result, ptr addrspace(1) %out.gep
; v_shl_v4i16: <4 x i16> variant of v_shl_v2i16.
; Each work-item reads two adjacent <4 x i16> elements of %in (index tid and
; the element after it), shifts the first left by the second, and stores the
; result to element tid of %out.
; The packed-shift targets are expected to use two v_pk_lshlrev_b16, one per
; 32-bit half of the vector.
530 define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
531 ; GFX9-LABEL: v_shl_v4i16:
533 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
534 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
535 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
536 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
537 ; GFX9-NEXT: s_waitcnt vmcnt(0)
538 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1
539 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0
540 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
541 ; GFX9-NEXT: s_endpgm
543 ; VI-LABEL: v_shl_v4i16:
545 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
546 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
547 ; VI-NEXT: s_waitcnt lgkmcnt(0)
548 ; VI-NEXT: v_mov_b32_e32 v1, s3
549 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
550 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
551 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
552 ; VI-NEXT: v_mov_b32_e32 v5, s1
553 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
554 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
555 ; VI-NEXT: s_waitcnt vmcnt(0)
556 ; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
557 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
558 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0
559 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
560 ; VI-NEXT: v_or_b32_e32 v1, v6, v1
561 ; VI-NEXT: v_or_b32_e32 v0, v3, v0
562 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
565 ; CI-LABEL: v_shl_v4i16:
567 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
568 ; CI-NEXT: s_mov_b32 s7, 0xf000
569 ; CI-NEXT: s_mov_b32 s6, 0
570 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
571 ; CI-NEXT: v_mov_b32_e32 v5, 0
572 ; CI-NEXT: s_waitcnt lgkmcnt(0)
573 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
574 ; CI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
575 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
576 ; CI-NEXT: s_waitcnt vmcnt(0)
577 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
578 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
579 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
580 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3
581 ; CI-NEXT: v_lshlrev_b32_e32 v1, v3, v1
582 ; CI-NEXT: v_lshlrev_b32_e32 v0, v2, v0
583 ; CI-NEXT: v_lshlrev_b32_e32 v2, v9, v7
584 ; CI-NEXT: v_lshlrev_b32_e32 v3, v8, v6
585 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
586 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
587 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
588 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
589 ; CI-NEXT: v_or_b32_e32 v1, v1, v2
590 ; CI-NEXT: v_or_b32_e32 v0, v0, v3
591 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
594 ; GFX10-LABEL: v_shl_v4i16:
596 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
597 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
598 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
599 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
600 ; GFX10-NEXT: s_waitcnt vmcnt(0)
601 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1
602 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0
603 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
604 ; GFX10-NEXT: s_endpgm
606 ; GFX11-LABEL: v_shl_v4i16:
608 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
609 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
610 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
611 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
612 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
613 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
614 ; GFX11-NEXT: s_waitcnt vmcnt(0)
615 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v3, v1
616 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v2, v0
617 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
618 ; GFX11-NEXT: s_endpgm
; IR under test: %a is element tid of %in, %b is the following element;
; the shifted value goes to element tid of %out.
619 %tid = call i32 @llvm.amdgcn.workitem.id.x()
620 %tid.ext = sext i32 %tid to i64
621 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
622 %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
623 %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in.gep, i32 1
624 %a = load <4 x i16>, ptr addrspace(1) %in.gep
625 %b = load <4 x i16>, ptr addrspace(1) %b_ptr
626 %result = shl <4 x i16> %a, %b
627 store <4 x i16> %result, ptr addrspace(1) %out.gep
; shl_v_imm_v4i16: <4 x i16> variant of shl_v_imm_v2i16.
; Element tid of %in is shifted left by the constant splat of 8 and stored
; to element tid of %out.
; The packed-shift targets are expected to use two v_pk_lshlrev_b16 with an
; inline immediate 8 and op_sel_hi:[0,1], one per 32-bit half of the vector.
631 define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
632 ; GFX9-LABEL: shl_v_imm_v4i16:
634 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
635 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
636 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
637 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
638 ; GFX9-NEXT: s_waitcnt vmcnt(0)
639 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
640 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
641 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
642 ; GFX9-NEXT: s_endpgm
644 ; VI-LABEL: shl_v_imm_v4i16:
646 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
647 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
648 ; VI-NEXT: s_waitcnt lgkmcnt(0)
649 ; VI-NEXT: v_mov_b32_e32 v1, s3
650 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
651 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
652 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
653 ; VI-NEXT: v_mov_b32_e32 v3, s1
654 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
655 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
656 ; VI-NEXT: s_waitcnt vmcnt(0)
657 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
658 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0
659 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
660 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
661 ; VI-NEXT: v_and_b32_e32 v4, 0xff000000, v4
662 ; VI-NEXT: v_and_b32_e32 v0, 0xff000000, v0
663 ; VI-NEXT: v_or_b32_e32 v1, v1, v4
664 ; VI-NEXT: v_or_b32_e32 v0, v5, v0
665 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
668 ; CI-LABEL: shl_v_imm_v4i16:
670 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
671 ; CI-NEXT: s_mov_b32 s7, 0xf000
672 ; CI-NEXT: s_mov_b32 s6, 0
673 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
674 ; CI-NEXT: v_mov_b32_e32 v1, 0
675 ; CI-NEXT: s_waitcnt lgkmcnt(0)
676 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
677 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
678 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
679 ; CI-NEXT: s_waitcnt vmcnt(0)
680 ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v3
681 ; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
682 ; CI-NEXT: v_and_b32_e32 v3, 0xff00, v3
683 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
684 ; CI-NEXT: v_and_b32_e32 v4, 0xff00, v4
685 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
686 ; CI-NEXT: v_or_b32_e32 v3, v4, v3
687 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
688 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
691 ; GFX10-LABEL: shl_v_imm_v4i16:
693 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
694 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
695 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
696 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
697 ; GFX10-NEXT: s_waitcnt vmcnt(0)
698 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
699 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
700 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
701 ; GFX10-NEXT: s_endpgm
703 ; GFX11-LABEL: shl_v_imm_v4i16:
705 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
706 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
707 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
708 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
709 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
710 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
711 ; GFX11-NEXT: s_waitcnt vmcnt(0)
712 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
713 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
714 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
715 ; GFX11-NEXT: s_endpgm
; IR under test: loaded %vgpr is the value, constant splat of 8 is the amount.
716 %tid = call i32 @llvm.amdgcn.workitem.id.x()
717 %tid.ext = sext i32 %tid to i64
718 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
719 %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
720 %vgpr = load <4 x i16>, ptr addrspace(1) %in.gep
721 %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
722 store <4 x i16> %result, ptr addrspace(1) %out.gep
; Work-item id intrinsic; the memory-operand kernels in this file call it to
; compute the element index used for %in and %out.
726 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group 0 is attached to every kernel definition in this file;
; attribute group 1 is attached to the intrinsic declaration.
728 attributes #0 = { nounwind }
729 attributes #1 = { nounwind readnone }