1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
; Kernel under test: lshr of two <2 x i16> values that are both passed directly
; as kernel arguments (%lhs, %rhs); the result is stored to %out.
; Expected code: the GFX9, GFX10 and GFX11 runs check for a single packed
; v_pk_lshrrev_b16, while the VI and CI runs check for two 32-bit s_lshr_b32
; shifts (one per 16-bit half) recombined with s_lshl_b32 / s_or_b32.
; The check lines below are autogenerated (see the note at the top of the
; file); regenerate them with the script rather than editing them by hand.
8 define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
9 ; GFX9-LABEL: s_lshr_v2i16:
11 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
13 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
15 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, s3, v1
16 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
19 ; VI-LABEL: s_lshr_v2i16:
21 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
22 ; VI-NEXT: s_waitcnt lgkmcnt(0)
23 ; VI-NEXT: s_and_b32 s4, s2, 0xffff
24 ; VI-NEXT: s_lshr_b32 s2, s2, 16
25 ; VI-NEXT: s_lshr_b32 s5, s3, 16
26 ; VI-NEXT: s_lshr_b32 s2, s2, s5
27 ; VI-NEXT: s_lshr_b32 s3, s4, s3
28 ; VI-NEXT: s_lshl_b32 s2, s2, 16
29 ; VI-NEXT: s_or_b32 s2, s3, s2
30 ; VI-NEXT: v_mov_b32_e32 v0, s0
31 ; VI-NEXT: v_mov_b32_e32 v1, s1
32 ; VI-NEXT: v_mov_b32_e32 v2, s2
33 ; VI-NEXT: flat_store_dword v[0:1], v2
36 ; CI-LABEL: s_lshr_v2i16:
38 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
39 ; CI-NEXT: s_mov_b32 s7, 0xf000
40 ; CI-NEXT: s_mov_b32 s6, -1
41 ; CI-NEXT: s_waitcnt lgkmcnt(0)
42 ; CI-NEXT: s_mov_b32 s4, s0
43 ; CI-NEXT: s_mov_b32 s5, s1
44 ; CI-NEXT: s_and_b32 s0, s2, 0xffff
45 ; CI-NEXT: s_lshr_b32 s1, s2, 16
46 ; CI-NEXT: s_lshr_b32 s2, s3, 16
47 ; CI-NEXT: s_lshr_b32 s1, s1, s2
48 ; CI-NEXT: s_lshl_b32 s1, s1, 16
49 ; CI-NEXT: s_lshr_b32 s0, s0, s3
50 ; CI-NEXT: s_or_b32 s0, s0, s1
51 ; CI-NEXT: v_mov_b32_e32 v0, s0
52 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
55 ; GFX10-LABEL: s_lshr_v2i16:
57 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
58 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
59 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
60 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, s3, s2
61 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
62 ; GFX10-NEXT: s_endpgm
64 ; GFX11-LABEL: s_lshr_v2i16:
66 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
67 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
68 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2
70 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
72 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
73 ; GFX11-NEXT: s_endpgm
74 %result = lshr <2 x i16> %lhs, %rhs
75 store <2 x i16> %result, ptr addrspace(1) %out
; Kernel under test: lshr of two <2 x i16> values that are both loaded from
; memory. %a is the element of %in selected by the workitem id and %b is the
; element right after it; the result is stored to the matching element of %out.
; Expected code: the GFX9, GFX10 and GFX11 runs check for one 64-bit load that
; fetches both operands followed by a single v_pk_lshrrev_b16. The VI run
; checks for v_lshrrev_b16 on the low half plus an SDWA v_lshrrev_b16 on the
; high half, and the CI run checks for per-half 32-bit v_lshrrev_b32 shifts
; that are recombined with a shift-left by 16 and an or.
79 define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
80 ; GFX9-LABEL: v_lshr_v2i16:
82 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
83 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
84 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
85 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
86 ; GFX9-NEXT: s_waitcnt vmcnt(0)
87 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v1, v0
88 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
91 ; VI-LABEL: v_lshr_v2i16:
93 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
94 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
95 ; VI-NEXT: s_waitcnt lgkmcnt(0)
96 ; VI-NEXT: v_mov_b32_e32 v1, s3
97 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
98 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
99 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
100 ; VI-NEXT: v_mov_b32_e32 v3, s1
101 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
102 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
103 ; VI-NEXT: s_waitcnt vmcnt(0)
104 ; VI-NEXT: v_lshrrev_b16_e32 v4, v1, v0
105 ; VI-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
106 ; VI-NEXT: v_or_b32_e32 v0, v4, v0
107 ; VI-NEXT: flat_store_dword v[2:3], v0
110 ; CI-LABEL: v_lshr_v2i16:
112 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
113 ; CI-NEXT: s_mov_b32 s7, 0xf000
114 ; CI-NEXT: s_mov_b32 s6, 0
115 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
116 ; CI-NEXT: v_mov_b32_e32 v1, 0
117 ; CI-NEXT: s_waitcnt lgkmcnt(0)
118 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
119 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
120 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
121 ; CI-NEXT: s_waitcnt vmcnt(0)
122 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
123 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
124 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
125 ; CI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
126 ; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v4
127 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
128 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
129 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
132 ; GFX10-LABEL: v_lshr_v2i16:
134 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
135 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
136 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
137 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
138 ; GFX10-NEXT: s_waitcnt vmcnt(0)
139 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v1, v0
140 ; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
141 ; GFX10-NEXT: s_endpgm
143 ; GFX11-LABEL: v_lshr_v2i16:
145 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
146 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
147 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
148 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
149 ; GFX11-NEXT: s_waitcnt vmcnt(0)
150 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, v1, v0
151 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
152 ; GFX11-NEXT: s_nop 0
153 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
154 ; GFX11-NEXT: s_endpgm
155 %tid = call i32 @llvm.amdgcn.workitem.id.x()
156 %tid.ext = sext i32 %tid to i64
157 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
158 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
; The shift amount %b lives one <2 x i16> element past %a.
159 %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in.gep, i32 1
160 %a = load <2 x i16>, ptr addrspace(1) %in.gep
161 %b = load <2 x i16>, ptr addrspace(1) %b_ptr
162 %result = lshr <2 x i16> %a, %b
163 store <2 x i16> %result, ptr addrspace(1) %out.gep
; Kernel under test: lshr of a <2 x i16> value loaded from memory (%vgpr) by a
; shift amount passed as a kernel argument (%sgpr); the result is stored to the
; element of %out selected by the workitem id.
; Expected code: the GFX9, GFX10 and GFX11 runs check for v_pk_lshrrev_b16 with
; the scalar register as the first source and the loaded value as the second.
; The VI run checks for a 16-bit shift per half (the high half via SDWA, with
; the high 16 bits of the scalar first moved into a VGPR), and the CI run
; checks for per-half 32-bit v_lshrrev_b32 shifts.
167 define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
168 ; GFX9-LABEL: lshr_v_s_v2i16:
170 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
171 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
172 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
173 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
174 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
175 ; GFX9-NEXT: s_waitcnt vmcnt(0)
176 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, s2, v1
177 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
178 ; GFX9-NEXT: s_endpgm
180 ; VI-LABEL: lshr_v_s_v2i16:
182 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
183 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
184 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
185 ; VI-NEXT: s_waitcnt lgkmcnt(0)
186 ; VI-NEXT: v_mov_b32_e32 v1, s7
187 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
188 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
189 ; VI-NEXT: flat_load_dword v3, v[0:1]
190 ; VI-NEXT: s_lshr_b32 s1, s0, 16
191 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
192 ; VI-NEXT: v_mov_b32_e32 v2, s1
193 ; VI-NEXT: v_mov_b32_e32 v1, s5
194 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
195 ; VI-NEXT: s_waitcnt vmcnt(0)
196 ; VI-NEXT: v_lshrrev_b16_e32 v4, s0, v3
197 ; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
198 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
199 ; VI-NEXT: flat_store_dword v[0:1], v2
202 ; CI-LABEL: lshr_v_s_v2i16:
204 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
205 ; CI-NEXT: s_load_dword s8, s[0:1], 0xd
206 ; CI-NEXT: s_mov_b32 s3, 0xf000
207 ; CI-NEXT: s_mov_b32 s2, 0
208 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
209 ; CI-NEXT: s_waitcnt lgkmcnt(0)
210 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
211 ; CI-NEXT: v_mov_b32_e32 v1, 0
212 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
213 ; CI-NEXT: s_lshr_b32 s0, s8, 16
214 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
215 ; CI-NEXT: s_waitcnt vmcnt(0)
216 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
217 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
218 ; CI-NEXT: v_lshrrev_b32_e32 v3, s0, v3
219 ; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2
220 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
221 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
222 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
225 ; GFX10-LABEL: lshr_v_s_v2i16:
227 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
228 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
229 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
230 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
231 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
232 ; GFX10-NEXT: s_waitcnt vmcnt(0)
233 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, s0, v1
234 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
235 ; GFX10-NEXT: s_endpgm
237 ; GFX11-LABEL: lshr_v_s_v2i16:
239 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
240 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
241 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
242 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
243 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
244 ; GFX11-NEXT: s_waitcnt vmcnt(0)
245 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s0, v1
246 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
247 ; GFX11-NEXT: s_nop 0
248 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
249 ; GFX11-NEXT: s_endpgm
250 %tid = call i32 @llvm.amdgcn.workitem.id.x()
251 %tid.ext = sext i32 %tid to i64
252 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
253 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
254 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
255 %result = lshr <2 x i16> %vgpr, %sgpr
256 store <2 x i16> %result, ptr addrspace(1) %out.gep
; Kernel under test: lshr of a <2 x i16> kernel argument (%sgpr) by a shift
; amount loaded from memory (%vgpr); the result is stored to the element of
; %out selected by the workitem id.
; Expected code: the GFX9, GFX10 and GFX11 runs check for v_pk_lshrrev_b16 with
; the loaded value as the first source and the scalar register as the second.
; The VI run checks for the e64 encoding of v_lshrrev_b16 on the low half plus
; an SDWA shift on the high half, and the CI run checks for the non-reversed
; v_lshr_b32 with the scalar halves as the first source.
260 define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
261 ; GFX9-LABEL: lshr_s_v_v2i16:
263 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
264 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
265 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
266 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
267 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
268 ; GFX9-NEXT: s_waitcnt vmcnt(0)
269 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s2
270 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
271 ; GFX9-NEXT: s_endpgm
273 ; VI-LABEL: lshr_s_v_v2i16:
275 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
276 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
277 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
278 ; VI-NEXT: s_waitcnt lgkmcnt(0)
279 ; VI-NEXT: v_mov_b32_e32 v1, s7
280 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
281 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
282 ; VI-NEXT: flat_load_dword v3, v[0:1]
283 ; VI-NEXT: s_lshr_b32 s1, s0, 16
284 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
285 ; VI-NEXT: v_mov_b32_e32 v2, s1
286 ; VI-NEXT: v_mov_b32_e32 v1, s5
287 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
288 ; VI-NEXT: s_waitcnt vmcnt(0)
289 ; VI-NEXT: v_lshrrev_b16_e64 v4, v3, s0
290 ; VI-NEXT: v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
291 ; VI-NEXT: v_or_b32_e32 v2, v4, v2
292 ; VI-NEXT: flat_store_dword v[0:1], v2
295 ; CI-LABEL: lshr_s_v_v2i16:
297 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
298 ; CI-NEXT: s_load_dword s8, s[0:1], 0xd
299 ; CI-NEXT: s_mov_b32 s3, 0xf000
300 ; CI-NEXT: s_mov_b32 s2, 0
301 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
302 ; CI-NEXT: s_waitcnt lgkmcnt(0)
303 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
304 ; CI-NEXT: v_mov_b32_e32 v1, 0
305 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
306 ; CI-NEXT: s_lshr_b32 s0, s8, 16
307 ; CI-NEXT: s_and_b32 s1, s8, 0xffff
308 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
309 ; CI-NEXT: s_waitcnt vmcnt(0)
310 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
311 ; CI-NEXT: v_lshr_b32_e32 v3, s0, v3
312 ; CI-NEXT: v_lshr_b32_e32 v2, s1, v2
313 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
314 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
315 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
318 ; GFX10-LABEL: lshr_s_v_v2i16:
320 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
321 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
322 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
323 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
324 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
325 ; GFX10-NEXT: s_waitcnt vmcnt(0)
326 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, s0
327 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
328 ; GFX10-NEXT: s_endpgm
330 ; GFX11-LABEL: lshr_s_v_v2i16:
332 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
333 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
334 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
335 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
336 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
337 ; GFX11-NEXT: s_waitcnt vmcnt(0)
338 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, s0
339 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
340 ; GFX11-NEXT: s_nop 0
341 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
342 ; GFX11-NEXT: s_endpgm
343 %tid = call i32 @llvm.amdgcn.workitem.id.x()
344 %tid.ext = sext i32 %tid to i64
345 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
346 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
347 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
348 %result = lshr <2 x i16> %sgpr, %vgpr
349 store <2 x i16> %result, ptr addrspace(1) %out.gep
; Kernel under test: lshr of the constant vector <8, 8> by a <2 x i16> shift
; amount loaded from memory (%vgpr); the result is stored to the element of
; %out selected by the workitem id.
; Expected code: the GFX9, GFX10 and GFX11 runs check for v_pk_lshrrev_b16 with
; the literal 8 as the second source and op_sel_hi:[1,0]. The VI run checks
; that 8 is also moved into a VGPR (v4) for the SDWA shift of the high half,
; and the CI run checks for the non-reversed v_lshr_b32 with 8 as the first
; source for each half.
353 define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
354 ; GFX9-LABEL: lshr_imm_v_v2i16:
356 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
357 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
358 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
359 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
360 ; GFX9-NEXT: s_waitcnt vmcnt(0)
361 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
362 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
363 ; GFX9-NEXT: s_endpgm
365 ; VI-LABEL: lshr_imm_v_v2i16:
367 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
368 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
369 ; VI-NEXT: v_mov_b32_e32 v4, 8
370 ; VI-NEXT: s_waitcnt lgkmcnt(0)
371 ; VI-NEXT: v_mov_b32_e32 v1, s3
372 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
373 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
374 ; VI-NEXT: flat_load_dword v3, v[0:1]
375 ; VI-NEXT: v_mov_b32_e32 v1, s1
376 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
377 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
378 ; VI-NEXT: s_waitcnt vmcnt(0)
379 ; VI-NEXT: v_lshrrev_b16_e64 v2, v3, 8
380 ; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
381 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
382 ; VI-NEXT: flat_store_dword v[0:1], v2
385 ; CI-LABEL: lshr_imm_v_v2i16:
387 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
388 ; CI-NEXT: s_mov_b32 s7, 0xf000
389 ; CI-NEXT: s_mov_b32 s6, 0
390 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
391 ; CI-NEXT: v_mov_b32_e32 v1, 0
392 ; CI-NEXT: s_waitcnt lgkmcnt(0)
393 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
394 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
395 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
396 ; CI-NEXT: s_waitcnt vmcnt(0)
397 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
398 ; CI-NEXT: v_lshr_b32_e32 v3, 8, v3
399 ; CI-NEXT: v_lshr_b32_e32 v2, 8, v2
400 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
401 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
402 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
405 ; GFX10-LABEL: lshr_imm_v_v2i16:
407 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
408 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
409 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
410 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
411 ; GFX10-NEXT: s_waitcnt vmcnt(0)
412 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
413 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
414 ; GFX10-NEXT: s_endpgm
416 ; GFX11-LABEL: lshr_imm_v_v2i16:
418 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
419 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
420 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
421 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
422 ; GFX11-NEXT: s_waitcnt vmcnt(0)
423 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
424 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
425 ; GFX11-NEXT: s_nop 0
426 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
427 ; GFX11-NEXT: s_endpgm
428 %tid = call i32 @llvm.amdgcn.workitem.id.x()
429 %tid.ext = sext i32 %tid to i64
430 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
431 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
432 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
433 %result = lshr <2 x i16> <i16 8, i16 8>, %vgpr
434 store <2 x i16> %result, ptr addrspace(1) %out.gep
; Kernel under test: lshr of a <2 x i16> value loaded from memory (%vgpr) by
; the constant vector <8, 8>; the result is stored to the element of %out
; selected by the workitem id.
; Expected code: the GFX9, GFX10 and GFX11 runs check for v_pk_lshrrev_b16 with
; the literal 8 as the first source and op_sel_hi:[0,1]. The VI run checks for
; no 16-bit shift at all; it expects a 32-bit shift right by 24, a shift left
; by 16 and an SDWA or that selects BYTE_1 of the loaded dword. The CI run
; checks for one 32-bit shift right by 8 followed by an and with 0xff00ff.
438 define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
439 ; GFX9-LABEL: lshr_v_imm_v2i16:
441 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
442 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
443 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
444 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
445 ; GFX9-NEXT: s_waitcnt vmcnt(0)
446 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
447 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
448 ; GFX9-NEXT: s_endpgm
450 ; VI-LABEL: lshr_v_imm_v2i16:
452 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
453 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
454 ; VI-NEXT: s_waitcnt lgkmcnt(0)
455 ; VI-NEXT: v_mov_b32_e32 v1, s3
456 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
457 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
458 ; VI-NEXT: flat_load_dword v3, v[0:1]
459 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
460 ; VI-NEXT: v_mov_b32_e32 v1, s1
461 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
462 ; VI-NEXT: s_waitcnt vmcnt(0)
463 ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3
464 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
465 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
466 ; VI-NEXT: flat_store_dword v[0:1], v2
469 ; CI-LABEL: lshr_v_imm_v2i16:
471 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
472 ; CI-NEXT: s_mov_b32 s7, 0xf000
473 ; CI-NEXT: s_mov_b32 s6, 0
474 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
475 ; CI-NEXT: v_mov_b32_e32 v1, 0
476 ; CI-NEXT: s_waitcnt lgkmcnt(0)
477 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
478 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
479 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
480 ; CI-NEXT: s_waitcnt vmcnt(0)
481 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
482 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2
483 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
486 ; GFX10-LABEL: lshr_v_imm_v2i16:
488 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
489 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
490 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
491 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
492 ; GFX10-NEXT: s_waitcnt vmcnt(0)
493 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
494 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
495 ; GFX10-NEXT: s_endpgm
497 ; GFX11-LABEL: lshr_v_imm_v2i16:
499 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
500 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
501 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
502 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
503 ; GFX11-NEXT: s_waitcnt vmcnt(0)
504 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
505 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
506 ; GFX11-NEXT: s_nop 0
507 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
508 ; GFX11-NEXT: s_endpgm
509 %tid = call i32 @llvm.amdgcn.workitem.id.x()
510 %tid.ext = sext i32 %tid to i64
511 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
512 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
513 %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
514 %result = lshr <2 x i16> %vgpr, <i16 8, i16 8>
515 store <2 x i16> %result, ptr addrspace(1) %out.gep
; Kernel under test: lshr of two <4 x i16> values that are both loaded from
; memory. %a is the element of %in selected by the workitem id and %b is the
; element right after it; the result is stored to the matching element of %out.
; Expected code: the GFX9, GFX10 and GFX11 runs check for one 128-bit load that
; fetches both operands followed by two v_pk_lshrrev_b16 (one per dword). The
; VI run checks for four 16-bit shifts (two plain, two SDWA) merged with two
; ors, and the CI run checks for four 32-bit v_lshrrev_b32 shifts on the split
; halves that are recombined with shift-left by 16 and or.
519 define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
520 ; GFX9-LABEL: v_lshr_v4i16:
522 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
523 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
524 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
525 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
526 ; GFX9-NEXT: s_waitcnt vmcnt(0)
527 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1
528 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v2, v0
529 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
530 ; GFX9-NEXT: s_endpgm
532 ; VI-LABEL: v_lshr_v4i16:
534 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
535 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
536 ; VI-NEXT: s_waitcnt lgkmcnt(0)
537 ; VI-NEXT: v_mov_b32_e32 v1, s3
538 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
539 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
540 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
541 ; VI-NEXT: v_mov_b32_e32 v5, s1
542 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
543 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
544 ; VI-NEXT: s_waitcnt vmcnt(0)
545 ; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1
546 ; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
547 ; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0
548 ; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
549 ; VI-NEXT: v_or_b32_e32 v1, v6, v1
550 ; VI-NEXT: v_or_b32_e32 v0, v3, v0
551 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
554 ; CI-LABEL: v_lshr_v4i16:
556 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
557 ; CI-NEXT: s_mov_b32 s7, 0xf000
558 ; CI-NEXT: s_mov_b32 s6, 0
559 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
560 ; CI-NEXT: v_mov_b32_e32 v5, 0
561 ; CI-NEXT: s_waitcnt lgkmcnt(0)
562 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
563 ; CI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
564 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
565 ; CI-NEXT: s_waitcnt vmcnt(0)
566 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
567 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
568 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
569 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
570 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
571 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3
572 ; CI-NEXT: v_lshrrev_b32_e32 v1, v3, v1
573 ; CI-NEXT: v_lshrrev_b32_e32 v3, v9, v7
574 ; CI-NEXT: v_lshrrev_b32_e32 v0, v2, v0
575 ; CI-NEXT: v_lshrrev_b32_e32 v2, v8, v6
576 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
577 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
578 ; CI-NEXT: v_or_b32_e32 v1, v1, v3
579 ; CI-NEXT: v_or_b32_e32 v0, v0, v2
580 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
583 ; GFX10-LABEL: v_lshr_v4i16:
585 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
586 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
587 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
588 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
589 ; GFX10-NEXT: s_waitcnt vmcnt(0)
590 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1
591 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0
592 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
593 ; GFX10-NEXT: s_endpgm
595 ; GFX11-LABEL: v_lshr_v4i16:
597 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
598 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
599 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
600 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
601 ; GFX11-NEXT: s_waitcnt vmcnt(0)
602 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, v3, v1
603 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, v2, v0
604 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
605 ; GFX11-NEXT: s_nop 0
606 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
607 ; GFX11-NEXT: s_endpgm
608 %tid = call i32 @llvm.amdgcn.workitem.id.x()
609 %tid.ext = sext i32 %tid to i64
610 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
611 %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
; The shift amount %b lives one <4 x i16> element past %a.
612 %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in.gep, i32 1
613 %a = load <4 x i16>, ptr addrspace(1) %in.gep
614 %b = load <4 x i16>, ptr addrspace(1) %b_ptr
615 %result = lshr <4 x i16> %a, %b
616 store <4 x i16> %result, ptr addrspace(1) %out.gep
; Kernel under test: lshr of a <4 x i16> value loaded from memory (%vgpr) by
; the constant vector <8, 8, 8, 8>; the result is stored to the element of %out
; selected by the workitem id.
; Expected code: the GFX9, GFX10 and GFX11 runs check for two v_pk_lshrrev_b16
; with the literal 8 as the first source and op_sel_hi:[0,1] (one per dword).
; The VI run checks, per dword, for a 32-bit shift right by 24, a shift left by
; 16 and an SDWA or that selects BYTE_1. The CI run checks, per dword, for one
; 32-bit shift right by 8 followed by an and with 0xff00ff.
620 define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
621 ; GFX9-LABEL: lshr_v_imm_v4i16:
623 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
624 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
625 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
626 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
627 ; GFX9-NEXT: s_waitcnt vmcnt(0)
628 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
629 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
630 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
631 ; GFX9-NEXT: s_endpgm
633 ; VI-LABEL: lshr_v_imm_v4i16:
635 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
636 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
637 ; VI-NEXT: s_waitcnt lgkmcnt(0)
638 ; VI-NEXT: v_mov_b32_e32 v1, s3
639 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
640 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
641 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
642 ; VI-NEXT: v_mov_b32_e32 v3, s1
643 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
644 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
645 ; VI-NEXT: s_waitcnt vmcnt(0)
646 ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v1
647 ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v0
648 ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
649 ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
650 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
651 ; VI-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
652 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
655 ; CI-LABEL: lshr_v_imm_v4i16:
657 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
658 ; CI-NEXT: s_mov_b32 s7, 0xf000
659 ; CI-NEXT: s_mov_b32 s6, 0
660 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
661 ; CI-NEXT: v_mov_b32_e32 v1, 0
662 ; CI-NEXT: s_waitcnt lgkmcnt(0)
663 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
664 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
665 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
666 ; CI-NEXT: s_waitcnt vmcnt(0)
667 ; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
668 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
669 ; CI-NEXT: v_and_b32_e32 v3, 0xff00ff, v3
670 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2
671 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
674 ; GFX10-LABEL: lshr_v_imm_v4i16:
676 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
677 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
678 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
679 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
680 ; GFX10-NEXT: s_waitcnt vmcnt(0)
681 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
682 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
683 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
684 ; GFX10-NEXT: s_endpgm
686 ; GFX11-LABEL: lshr_v_imm_v4i16:
688 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
689 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
690 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
691 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
692 ; GFX11-NEXT: s_waitcnt vmcnt(0)
693 ; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
694 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
695 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
696 ; GFX11-NEXT: s_nop 0
697 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
698 ; GFX11-NEXT: s_endpgm
699 %tid = call i32 @llvm.amdgcn.workitem.id.x()
700 %tid.ext = sext i32 %tid to i64
701 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
702 %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
703 %vgpr = load <4 x i16>, ptr addrspace(1) %in.gep
704 %result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
705 store <4 x i16> %result, ptr addrspace(1) %out.gep
; Intrinsic that the kernels in this file call to obtain the per-workitem
; index used for addressing %in and %out.
709 declare i32 @llvm.amdgcn.workitem.id.x() #1

; Attribute group #0 is attached to every kernel definition visible in this
; file; group #1 is attached to the intrinsic declaration above.
711 attributes #0 = { nounwind }
712 attributes #1 = { nounwind readnone }