; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
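
; Covers lowering of <2 x i16> and <4 x i16> lshr on CI, VI, GFX9 and GFX10.

; Both shift operands are uniform (SGPR) kernel arguments.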
define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_lshr_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s5, s[0:1], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_pk_lshrrev_b16 v1, s5, v1
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_lshr_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
; VI-NEXT: s_load_dword s0, s[0:1], 0x30
; VI-NEXT: s_mov_b32 s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_lshr_b32 s1, s5, 16
; VI-NEXT: s_lshr_b32 s6, s0, 16
; VI-NEXT: s_lshr_b32 s1, s1, s6
; VI-NEXT: s_and_b32 s5, s5, s4
; VI-NEXT: s_and_b32 s0, s0, s4
; VI-NEXT: s_lshr_b32 s0, s5, s0
; VI-NEXT: s_lshl_b32 s1, s1, 16
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_lshr_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_load_dword s2, s[0:1], 0xb
; CI-NEXT: s_load_dword s0, s[0:1], 0xc
; CI-NEXT: s_mov_b32 s3, 0xffff
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s2, 16
; CI-NEXT: s_lshr_b32 s8, s0, 16
; CI-NEXT: s_lshr_b32 s1, s1, s8
; CI-NEXT: s_and_b32 s2, s2, s3
; CI-NEXT: s_and_b32 s0, s0, s3
; CI-NEXT: s_lshr_b32 s0, s2, s0
; CI-NEXT: s_lshl_b32 s1, s1, 16
; CI-NEXT: s_or_b32 s0, s0, s1
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: s_lshr_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_load_dword s5, s[0:1], 0x30
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, s5, s4
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
  %result = lshr <2 x i16> %lhs, %rhs
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}
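
; Both operands are loaded per lane, so the shift is fully divergent (VGPR shifted by VGPR).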
define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_lshr_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5
; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_lshr_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_lshr_b32_e32 v2, v2, v3
; CI-NEXT: v_lshr_b32_e32 v3, v4, v5
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: v_lshr_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = lshr <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}
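
; A loaded (VGPR) value shifted by a uniform (SGPR) amount.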
define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: lshr_v_s_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_s_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v4, s0, v3
; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_v_s_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_and_b32 s8, s8, s0
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_lshrrev_b32_e32 v3, s1, v3
; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: lshr_v_s_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, s0, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> %vgpr, %sgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}
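
; A uniform (SGPR) value shifted by a loaded (VGPR) amount.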
define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: lshr_s_v_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_s_v_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v4, v3, s0
; VI-NEXT: v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_s_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_and_b32 s8, s8, s0
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_lshr_b32_e32 v3, s1, v3
; CI-NEXT: v_lshr_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: lshr_s_v_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> %sgpr, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}
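
; The splat immediate <8, 8> is the value being shifted; the shift amount is loaded per lane.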
define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: lshr_imm_v_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_imm_v_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v2, v3, 8
; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_imm_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT: v_lshr_b32_e32 v3, 8, v3
; CI-NEXT: v_lshr_b32_e32 v2, 8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: lshr_imm_v_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> <i16 8, i16 8>, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}
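
; A loaded (VGPR) value shifted by the splat immediate amount 8.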
define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: lshr_v_imm_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_imm_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_v_imm_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: lshr_v_imm_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> %vgpr, <i16 8, i16 8>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}
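
; <4 x i16> case with both operands loaded; GFX9/GFX10 handle it as two <2 x i16> halves
; with a packed v_pk_lshrrev_b16 per half.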
define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_lshr_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v2, v0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1
; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0
; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_lshr_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_and_b32_e32 v4, s0, v4
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_and_b32_e32 v5, s0, v5
; CI-NEXT: v_lshr_b32_e32 v3, v3, v5
; CI-NEXT: v_lshr_b32_e32 v5, v7, v9
; CI-NEXT: v_lshr_b32_e32 v2, v2, v4
; CI-NEXT: v_lshr_b32_e32 v4, v6, v8
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v3, v5
; CI-NEXT: v_or_b32_e32 v2, v2, v4
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: v_lshr_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = lshr <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}
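
; Each element of a loaded <4 x i16> is shifted right by the immediate 8.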
define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: lshr_v_imm_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_imm_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_v_imm_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xff00ff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: lshr_v_imm_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }