; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s

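; Shift of uniform (SGPR) <2 x i16> operands: GFX9 selects the packed
; v_pk_lshlrev_b16, while VI and CI scalarize into s_lshl_b32 pairs with
; 16-bit masking.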
define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_shl_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x30
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s0, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
; VI-NEXT:    s_mov_b32 s3, 0xffff
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s2, 16
; VI-NEXT:    s_lshr_b32 s8, s0, 16
; VI-NEXT:    s_and_b32 s2, s2, s3
; VI-NEXT:    s_and_b32 s0, s0, s3
; VI-NEXT:    s_lshl_b32 s0, s2, s0
; VI-NEXT:    s_lshl_b32 s1, s1, s8
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_and_b32 s0, s0, s3
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_shl_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT:    s_load_dword s2, s[0:1], 0xb
; CI-NEXT:    s_load_dword s0, s[0:1], 0xc
; CI-NEXT:    s_mov_b32 s3, 0xffff
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s1, s2, 16
; CI-NEXT:    s_and_b32 s8, s0, s3
; CI-NEXT:    s_lshr_b32 s0, s0, 16
; CI-NEXT:    s_lshl_b32 s0, s1, s0
; CI-NEXT:    s_lshl_b32 s1, s2, s8
; CI-NEXT:    s_lshl_b32 s0, s0, 16
; CI-NEXT:    s_and_b32 s1, s1, s3
; CI-NEXT:    s_or_b32 s0, s1, s0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
  %result = shl <2 x i16> %lhs, %rhs
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

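; Both shift operands are loaded per thread: GFX9 keeps the operation packed,
; VI uses v_lshlrev_b16 plus an SDWA form for the high half, and CI expands to
; 32-bit shifts with masking.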
define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_shl_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v3, v[0:1], off
; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v2, v4, v3
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_shl_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v5
; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_shl_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4
; CI-NEXT:    s_mov_b32 s8, 0xffff
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v5, s8, v3
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_lshlrev_b32_e32 v3, v3, v4
; CI-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_and_b32_e32 v2, s8, v2
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = shl <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

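; A loaded (VGPR) value shifted by a uniform (SGPR) amount.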
define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_v_s_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v3, v[0:1], off
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v2, s0, v3
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_s_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v2, s0, v3
; VI-NEXT:    v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_s_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s9, 0xffff
; CI-NEXT:    s_lshr_b32 s10, s8, 16
; CI-NEXT:    s_and_b32 s8, s8, s9
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_lshlrev_b32_e32 v2, s8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, s10, v3
; CI-NEXT:    v_and_b32_e32 v2, s9, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %vgpr, %sgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

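; A uniform (SGPR) value shifted by a loaded (VGPR) amount.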
define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_s_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v3, v[0:1], off
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v2
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v2, v3, s0
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_s_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e64 v2, v3, s0
; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_s_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_lshr_b32 s1, s8, 16
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v3, s0, v2
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_lshl_b32_e32 v2, s1, v2
; CI-NEXT:    v_lshl_b32_e32 v3, s8, v3
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_or_b32_e32 v2, v3, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %sgpr, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

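; Splat immediate <8, 8> shifted by a loaded amount; on GFX9 the immediate is
; folded into v_pk_lshlrev_b16 via op_sel_hi.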
define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_imm_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v3, v[0:1], off
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v2, v3, 8 op_sel_hi:[1,0]
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_imm_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    v_mov_b32_e32 v3, 8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e64 v2, v4, 8
; VI-NEXT:    v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_imm_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b32 s4, 0xffff
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v3, s4, v2
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_lshl_b32_e32 v2, 8, v2
; CI-NEXT:    v_lshl_b32_e32 v3, 8, v3
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_and_b32_e32 v3, s4, v3
; CI-NEXT:    v_or_b32_e32 v2, v3, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

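; A loaded value shifted left by the splat immediate 8; CI folds the masking of
; both halves into a single 0xff00ff00 constant.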
define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_v_imm_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v3, v[0:1], off
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_imm_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
; VI-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_imm_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

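; <4 x i16> version of the variable shift: GFX9 emits one v_pk_lshlrev_b16 per
; 32-bit half of the vector.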
define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_shl_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off offset:8
; GFX9-NEXT:    v_mov_b32_e32 v5, s1
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v4
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, v3
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v0, v2
; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v6, v3, v1
; VI-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v0
; VI-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v1, v6, v1
; VI-NEXT:    v_or_b32_e32 v0, v3, v0
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_shl_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
; CI-NEXT:    s_mov_b32 s8, 0xffff
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v8, s8, v4
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
; CI-NEXT:    v_and_b32_e32 v9, s8, v5
; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
; CI-NEXT:    v_lshlrev_b32_e32 v5, v5, v7
; CI-NEXT:    v_lshlrev_b32_e32 v3, v9, v3
; CI-NEXT:    v_lshlrev_b32_e32 v4, v4, v6
; CI-NEXT:    v_lshlrev_b32_e32 v2, v8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT:    v_and_b32_e32 v3, s8, v3
; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT:    v_and_b32_e32 v2, s8, v2
; CI-NEXT:    v_or_b32_e32 v3, v3, v5
; CI-NEXT:    v_or_b32_e32 v2, v2, v4
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = shl <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

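; <4 x i16> shifted left by the splat immediate 8.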
define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_v_imm_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_imm_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_mov_b32 s4, 0xff000000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
; VI-NEXT:    v_lshlrev_b16_e32 v5, 8, v0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_and_b32_e32 v0, s4, v0
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_and_b32_e32 v4, s4, v4
; VI-NEXT:    v_or_b32_e32 v1, v1, v4
; VI-NEXT:    v_or_b32_e32 v0, v5, v0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_imm_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b32 s8, 0xff00
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
; CI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
; CI-NEXT:    v_and_b32_e32 v4, s8, v4
; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v3, s8, v3
; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT:    v_or_b32_e32 v3, v3, v4
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }