1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX803 %s
4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
; Packs the low 16 bits of two i32 values, read with volatile loads through the
; addrspace(4) pointers %in0 and %in1, into a <2 x i16> (value from %in0 in
; element 0, value from %in1 in element 1). The vector is bitcast to i32 and
; handed to inline asm through an "s" constraint.
; Expected code per the checks below: a single s_pack_ll_b32_b16 for the gfx900
; run line, and an s_and_b32 / s_lshl_b32 / s_or_b32 sequence for the fiji and
; kaveri run lines.
7 define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 {
8 ; GFX9-LABEL: s_pack_v2i16:
10 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
11 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
12 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
13 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
16 ; GFX9-NEXT: ;;#ASMSTART
18 ; GFX9-NEXT: ;;#ASMEND
21 ; GFX803-LABEL: s_pack_v2i16:
23 ; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
24 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
25 ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0
26 ; GFX803-NEXT: s_load_dword s1, s[2:3], 0x0
27 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
28 ; GFX803-NEXT: s_and_b32 s0, s0, 0xffff
29 ; GFX803-NEXT: s_lshl_b32 s1, s1, 16
30 ; GFX803-NEXT: s_or_b32 s0, s0, s1
31 ; GFX803-NEXT: ;;#ASMSTART
32 ; GFX803-NEXT: ; use s0
33 ; GFX803-NEXT: ;;#ASMEND
34 ; GFX803-NEXT: s_endpgm
36 ; GFX7-LABEL: s_pack_v2i16:
38 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
39 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
41 ; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0
42 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
43 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
44 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
45 ; GFX7-NEXT: s_or_b32 s0, s0, s1
46 ; GFX7-NEXT: ;;#ASMSTART
48 ; GFX7-NEXT: ;;#ASMEND
50 %val0 = load volatile i32, ptr addrspace(4) %in0
51 %val1 = load volatile i32, ptr addrspace(4) %in1
; Only the low 16 bits of each loaded value end up in the packed vector.
52 %lo = trunc i32 %val0 to i16
53 %hi = trunc i32 %val1 to i16
54 %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
55 %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
56 %vec.i32 = bitcast <2 x i16> %vec.1 to i32
; The side-effecting asm keeps the packed value live in a register matching "s".
58 call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
; Builds a <2 x i16> whose element 0 is the constant 456 (0x1c8 in the checks)
; and whose element 1 is the low 16 bits of an i32 loaded (non-volatile) through
; the addrspace(4) pointer %in1. The vector is bitcast to i32 and handed to
; inline asm through an "s" constraint.
; Expected code per the checks below: s_pack_ll_b32_b16 with 0x1c8 as its first
; source for the gfx900 run line; s_lshl_b32 by 16 followed by s_or_b32 with
; 0x1c8 for the fiji and kaveri run lines.
62 define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 {
63 ; GFX9-LABEL: s_pack_v2i16_imm_lo:
65 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
66 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
67 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
68 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, 0x1c8, s0
70 ; GFX9-NEXT: ;;#ASMSTART
72 ; GFX9-NEXT: ;;#ASMEND
75 ; GFX803-LABEL: s_pack_v2i16_imm_lo:
77 ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
78 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
79 ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0
80 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
81 ; GFX803-NEXT: s_lshl_b32 s0, s0, 16
82 ; GFX803-NEXT: s_or_b32 s0, s0, 0x1c8
83 ; GFX803-NEXT: ;;#ASMSTART
84 ; GFX803-NEXT: ; use s0
85 ; GFX803-NEXT: ;;#ASMEND
86 ; GFX803-NEXT: s_endpgm
88 ; GFX7-LABEL: s_pack_v2i16_imm_lo:
90 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
91 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
92 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
93 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
94 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16
95 ; GFX7-NEXT: s_or_b32 s0, s0, 0x1c8
96 ; GFX7-NEXT: ;;#ASMSTART
98 ; GFX7-NEXT: ;;#ASMEND
100 %val1 = load i32, ptr addrspace(4) %in1
101 %hi = trunc i32 %val1 to i16
; Element 0 is the constant; only element 1 depends on the loaded value.
102 %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0
103 %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
104 %vec.i32 = bitcast <2 x i16> %vec.1 to i32
106 call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
; Builds a <2 x i16> whose element 0 is the low 16 bits of an i32 loaded
; (non-volatile) through the addrspace(4) pointer %in0 and whose element 1 is
; the constant 456. The vector is bitcast to i32 and handed to inline asm
; through an "s" constraint.
; Expected code per the checks below: s_pack_ll_b32_b16 with 0x1c8 as its second
; source for the gfx900 run line; s_and_b32 with 0xffff followed by s_or_b32
; with 0x1c80000 (456 shifted left by 16) for the fiji and kaveri run lines.
110 define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 {
111 ; GFX9-LABEL: s_pack_v2i16_imm_hi:
113 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
114 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
115 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
116 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
117 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x1c8
118 ; GFX9-NEXT: ;;#ASMSTART
119 ; GFX9-NEXT: ; use s0
120 ; GFX9-NEXT: ;;#ASMEND
121 ; GFX9-NEXT: s_endpgm
123 ; GFX803-LABEL: s_pack_v2i16_imm_hi:
125 ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
126 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
127 ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0
128 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
129 ; GFX803-NEXT: s_and_b32 s0, s0, 0xffff
130 ; GFX803-NEXT: s_or_b32 s0, s0, 0x1c80000
131 ; GFX803-NEXT: ;;#ASMSTART
132 ; GFX803-NEXT: ; use s0
133 ; GFX803-NEXT: ;;#ASMEND
134 ; GFX803-NEXT: s_endpgm
136 ; GFX7-LABEL: s_pack_v2i16_imm_hi:
138 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
139 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
140 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
141 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
142 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
143 ; GFX7-NEXT: s_or_b32 s0, s0, 0x1c80000
144 ; GFX7-NEXT: ;;#ASMSTART
145 ; GFX7-NEXT: ; use s0
146 ; GFX7-NEXT: ;;#ASMEND
147 ; GFX7-NEXT: s_endpgm
148 %val0 = load i32, ptr addrspace(4) %in0
149 %lo = trunc i32 %val0 to i16
150 %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
; Element 1 is the constant; only element 0 depends on the loaded value.
151 %vec.1 = insertelement <2 x i16> %vec.0, i16 456, i32 1
152 %vec.i32 = bitcast <2 x i16> %vec.1 to i32
154 call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
; Packs the low 16 bits of two i32 values into a <2 x i16>, where each value is
; read with a volatile load from an addrspace(1) pointer (%in0, %in1) indexed by
; the sign-extended result of llvm.amdgcn.workitem.id.x. The vector is bitcast
; to i32 and handed to inline asm through a "v" constraint.
; Expected code per the checks below: v_perm_b32 with the constant 0x5040100 in
; s0 for the gfx900 run line; v_perm_b32 with the constant 0x1000504 (and the
; two data operands in the opposite order) for the fiji run line; and a
; v_and_b32 / v_lshlrev_b32 / v_or_b32 sequence for the kaveri run line.
158 define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
159 ; GFX9-LABEL: v_pack_v2i16:
161 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
162 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
163 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
164 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
165 ; GFX9-NEXT: s_waitcnt vmcnt(0)
166 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
167 ; GFX9-NEXT: s_waitcnt vmcnt(0)
168 ; GFX9-NEXT: s_mov_b32 s0, 0x5040100
169 ; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0
170 ; GFX9-NEXT: ;;#ASMSTART
171 ; GFX9-NEXT: ; use v0
172 ; GFX9-NEXT: ;;#ASMEND
173 ; GFX9-NEXT: s_endpgm
175 ; GFX803-LABEL: v_pack_v2i16:
177 ; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
178 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0
179 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
180 ; GFX803-NEXT: v_mov_b32_e32 v1, s1
181 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2
182 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
183 ; GFX803-NEXT: v_mov_b32_e32 v3, s3
184 ; GFX803-NEXT: v_add_u32_e32 v2, vcc, s2, v2
185 ; GFX803-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
186 ; GFX803-NEXT: flat_load_dword v0, v[0:1] glc
187 ; GFX803-NEXT: s_waitcnt vmcnt(0)
188 ; GFX803-NEXT: flat_load_dword v1, v[2:3] glc
189 ; GFX803-NEXT: s_waitcnt vmcnt(0)
190 ; GFX803-NEXT: s_mov_b32 s0, 0x1000504
191 ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s0
192 ; GFX803-NEXT: ;;#ASMSTART
193 ; GFX803-NEXT: ; use v0
194 ; GFX803-NEXT: ;;#ASMEND
195 ; GFX803-NEXT: s_endpgm
197 ; GFX7-LABEL: v_pack_v2i16:
199 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
200 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000
201 ; GFX7-NEXT: s_mov_b32 s6, 0
202 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
203 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
204 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
205 ; GFX7-NEXT: s_mov_b64 s[4:5], s[0:1]
206 ; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
207 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
208 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
209 ; GFX7-NEXT: s_waitcnt vmcnt(0)
210 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
211 ; GFX7-NEXT: s_waitcnt vmcnt(0)
212 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
213 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
214 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
215 ; GFX7-NEXT: ;;#ASMSTART
216 ; GFX7-NEXT: ; use v0
217 ; GFX7-NEXT: ;;#ASMEND
218 ; GFX7-NEXT: s_endpgm
; Both inputs are indexed by the same sign-extended workitem id.
219 %tid = call i32 @llvm.amdgcn.workitem.id.x()
220 %tid.ext = sext i32 %tid to i64
221 %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
222 %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
223 %val0 = load volatile i32, ptr addrspace(1) %in0.gep
224 %val1 = load volatile i32, ptr addrspace(1) %in1.gep
225 %lo = trunc i32 %val0 to i16
226 %hi = trunc i32 %val1 to i16
227 %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
228 %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
229 %vec.i32 = bitcast <2 x i16> %vec.1 to i32
; The side-effecting asm keeps the packed value live in a register matching "v".
230 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Same packing as @v_pack_v2i16 (two volatile addrspace(1) loads indexed by the
; workitem id, truncated to i16 and inserted as elements 0 and 1), but the
; packed i32 is consumed by ordinary IR instead of inline asm: 9 is added to it
; and the sum is written with a volatile store to an undef addrspace(1) pointer.
; Expected code per the checks below: the same pack sequences as in
; @v_pack_v2i16, followed by an add of 9 (v_add_u32_e32 for the gfx900 and fiji
; run lines, v_add_i32_e32 for the kaveri run line) and a buffer_store_dword.
234 define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
235 ; GFX9-LABEL: v_pack_v2i16_user:
237 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
238 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
239 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
240 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
241 ; GFX9-NEXT: s_waitcnt vmcnt(0)
242 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
243 ; GFX9-NEXT: s_waitcnt vmcnt(0)
244 ; GFX9-NEXT: s_mov_b32 s0, 0x5040100
245 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
246 ; GFX9-NEXT: s_mov_b32 s2, -1
247 ; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0
248 ; GFX9-NEXT: v_add_u32_e32 v0, 9, v0
249 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
250 ; GFX9-NEXT: s_waitcnt vmcnt(0)
251 ; GFX9-NEXT: s_endpgm
253 ; GFX803-LABEL: v_pack_v2i16_user:
255 ; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
256 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0
257 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
258 ; GFX803-NEXT: v_mov_b32_e32 v1, s1
259 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2
260 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
261 ; GFX803-NEXT: v_mov_b32_e32 v3, s3
262 ; GFX803-NEXT: v_add_u32_e32 v2, vcc, s2, v2
263 ; GFX803-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
264 ; GFX803-NEXT: flat_load_dword v0, v[0:1] glc
265 ; GFX803-NEXT: s_waitcnt vmcnt(0)
266 ; GFX803-NEXT: flat_load_dword v1, v[2:3] glc
267 ; GFX803-NEXT: s_waitcnt vmcnt(0)
268 ; GFX803-NEXT: s_mov_b32 s0, 0x1000504
269 ; GFX803-NEXT: s_mov_b32 s3, 0x1100f000
270 ; GFX803-NEXT: s_mov_b32 s2, -1
271 ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s0
272 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 9, v0
273 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0
274 ; GFX803-NEXT: s_waitcnt vmcnt(0)
275 ; GFX803-NEXT: s_endpgm
277 ; GFX7-LABEL: v_pack_v2i16_user:
279 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
280 ; GFX7-NEXT: s_mov_b32 s6, 0
281 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000
282 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
283 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
284 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
285 ; GFX7-NEXT: s_mov_b64 s[4:5], s[0:1]
286 ; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
287 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
288 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
289 ; GFX7-NEXT: s_waitcnt vmcnt(0)
290 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
291 ; GFX7-NEXT: s_waitcnt vmcnt(0)
292 ; GFX7-NEXT: s_mov_b32 s6, -1
293 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
294 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
295 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
296 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 9, v0
297 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
298 ; GFX7-NEXT: s_waitcnt vmcnt(0)
299 ; GFX7-NEXT: s_endpgm
300 %tid = call i32 @llvm.amdgcn.workitem.id.x()
301 %tid.ext = sext i32 %tid to i64
302 %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
303 %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
304 %val0 = load volatile i32, ptr addrspace(1) %in0.gep
305 %val1 = load volatile i32, ptr addrspace(1) %in1.gep
306 %lo = trunc i32 %val0 to i16
307 %hi = trunc i32 %val1 to i16
308 %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
309 %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
310 %vec.i32 = bitcast <2 x i16> %vec.1 to i32
; A regular arithmetic user of the packed value; the volatile store keeps it
; from being removed even though the destination pointer is undef.
311 %foo = add i32 %vec.i32, 9
312 store volatile i32 %foo, ptr addrspace(1) undef
; Builds a <2 x i16> whose element 0 is the constant 123 (0x7b in the checks)
; and whose element 1 is the low 16 bits of an i32 read with a volatile load
; from the addrspace(1) pointer %in1 indexed by the workitem id. The vector is
; bitcast to i32 and handed to inline asm through a "v" constraint.
; Expected code per the checks below: s_movk_i32 of 0x7b feeding v_perm_b32 for
; the gfx900 run line; v_lshlrev_b32 by 16 followed by v_or_b32 with 0x7b for
; the fiji and kaveri run lines.
316 define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
317 ; GFX9-LABEL: v_pack_v2i16_imm_lo:
319 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
320 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
321 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100
322 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
323 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
324 ; GFX9-NEXT: s_waitcnt vmcnt(0)
325 ; GFX9-NEXT: s_movk_i32 s0, 0x7b
326 ; GFX9-NEXT: v_perm_b32 v0, v0, s0, v1
327 ; GFX9-NEXT: ;;#ASMSTART
328 ; GFX9-NEXT: ; use v0
329 ; GFX9-NEXT: ;;#ASMEND
330 ; GFX9-NEXT: s_endpgm
332 ; GFX803-LABEL: v_pack_v2i16_imm_lo:
334 ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
335 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
336 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
337 ; GFX803-NEXT: v_mov_b32_e32 v1, s1
338 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
339 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
340 ; GFX803-NEXT: flat_load_dword v0, v[0:1] glc
341 ; GFX803-NEXT: s_waitcnt vmcnt(0)
342 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
343 ; GFX803-NEXT: v_or_b32_e32 v0, 0x7b, v0
344 ; GFX803-NEXT: ;;#ASMSTART
345 ; GFX803-NEXT: ; use v0
346 ; GFX803-NEXT: ;;#ASMEND
347 ; GFX803-NEXT: s_endpgm
349 ; GFX7-LABEL: v_pack_v2i16_imm_lo:
351 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
352 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000
353 ; GFX7-NEXT: s_mov_b32 s2, 0
354 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
355 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
356 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
357 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
358 ; GFX7-NEXT: s_waitcnt vmcnt(0)
359 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
360 ; GFX7-NEXT: v_or_b32_e32 v0, 0x7b, v0
361 ; GFX7-NEXT: ;;#ASMSTART
362 ; GFX7-NEXT: ; use v0
363 ; GFX7-NEXT: ;;#ASMEND
364 ; GFX7-NEXT: s_endpgm
365 %tid = call i32 @llvm.amdgcn.workitem.id.x()
366 %tid.ext = sext i32 %tid to i64
367 %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
368 %val1 = load volatile i32, ptr addrspace(1) %in1.gep
369 %hi = trunc i32 %val1 to i16
; Element 0 is the constant; only element 1 depends on the loaded value.
370 %vec.0 = insertelement <2 x i16> undef, i16 123, i32 0
371 %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
372 %vec.i32 = bitcast <2 x i16> %vec.1 to i32
373 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Variant of @v_pack_v2i16_imm_lo with the constant 64 in element 0 instead of
; 123; element 1 is again the low 16 bits of an i32 read with a volatile load
; from the addrspace(1) pointer %in1 indexed by the workitem id. The vector is
; bitcast to i32 and handed to inline asm through a "v" constraint.
; Expected code per the checks below: 64 appears directly as an operand of
; v_perm_b32 for the gfx900 run line (no separate s_movk_i32), and directly as
; an operand of v_or_b32 after a v_lshlrev_b32 by 16 for the fiji and kaveri
; run lines.
377 define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 {
378 ; GFX9-LABEL: v_pack_v2i16_inline_imm_lo:
380 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
381 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
382 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100
383 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
384 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
385 ; GFX9-NEXT: s_waitcnt vmcnt(0)
386 ; GFX9-NEXT: v_perm_b32 v0, v0, 64, v1
387 ; GFX9-NEXT: ;;#ASMSTART
388 ; GFX9-NEXT: ; use v0
389 ; GFX9-NEXT: ;;#ASMEND
390 ; GFX9-NEXT: s_endpgm
392 ; GFX803-LABEL: v_pack_v2i16_inline_imm_lo:
394 ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
395 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
396 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
397 ; GFX803-NEXT: v_mov_b32_e32 v1, s1
398 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
399 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
400 ; GFX803-NEXT: flat_load_dword v0, v[0:1] glc
401 ; GFX803-NEXT: s_waitcnt vmcnt(0)
402 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
403 ; GFX803-NEXT: v_or_b32_e32 v0, 64, v0
404 ; GFX803-NEXT: ;;#ASMSTART
405 ; GFX803-NEXT: ; use v0
406 ; GFX803-NEXT: ;;#ASMEND
407 ; GFX803-NEXT: s_endpgm
409 ; GFX7-LABEL: v_pack_v2i16_inline_imm_lo:
411 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
412 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000
413 ; GFX7-NEXT: s_mov_b32 s2, 0
414 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
415 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
416 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
417 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
418 ; GFX7-NEXT: s_waitcnt vmcnt(0)
419 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
420 ; GFX7-NEXT: v_or_b32_e32 v0, 64, v0
421 ; GFX7-NEXT: ;;#ASMSTART
422 ; GFX7-NEXT: ; use v0
423 ; GFX7-NEXT: ;;#ASMEND
424 ; GFX7-NEXT: s_endpgm
425 %tid = call i32 @llvm.amdgcn.workitem.id.x()
426 %tid.ext = sext i32 %tid to i64
427 %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
428 %val1 = load volatile i32, ptr addrspace(1) %in1.gep
429 %hi = trunc i32 %val1 to i16
; Element 0 is the constant 64; only element 1 depends on the loaded value.
430 %vec.0 = insertelement <2 x i16> undef, i16 64, i32 0
431 %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
432 %vec.i32 = bitcast <2 x i16> %vec.1 to i32
433 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Builds a <2 x i16> whose element 0 is the low 16 bits of an i32 read with a
; volatile load from the addrspace(1) pointer %in0 indexed by the workitem id,
; and whose element 1 is the constant 123 (0x7b). The vector is bitcast to i32
; and handed to inline asm through a "v" constraint.
; Expected code per the checks below: s_movk_i32 of 0x7b feeding v_perm_b32 for
; the gfx900 run line; v_mov_b32 of 0x7b0000 (123 shifted left by 16) feeding a
; v_or_b32_sdwa with src0_sel:WORD_0 for the fiji run line; v_and_b32 with
; 0xffff followed by v_or_b32 with 0x7b0000 for the kaveri run line.
437 define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
438 ; GFX9-LABEL: v_pack_v2i16_imm_hi:
440 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
441 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
442 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100
443 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
444 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
445 ; GFX9-NEXT: s_waitcnt vmcnt(0)
446 ; GFX9-NEXT: s_movk_i32 s0, 0x7b
447 ; GFX9-NEXT: v_perm_b32 v0, s0, v0, v1
448 ; GFX9-NEXT: ;;#ASMSTART
449 ; GFX9-NEXT: ; use v0
450 ; GFX9-NEXT: ;;#ASMEND
451 ; GFX9-NEXT: s_endpgm
453 ; GFX803-LABEL: v_pack_v2i16_imm_hi:
455 ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
456 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
457 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
458 ; GFX803-NEXT: v_mov_b32_e32 v1, s1
459 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
460 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
461 ; GFX803-NEXT: flat_load_dword v0, v[0:1] glc
462 ; GFX803-NEXT: s_waitcnt vmcnt(0)
463 ; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b0000
464 ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
465 ; GFX803-NEXT: ;;#ASMSTART
466 ; GFX803-NEXT: ; use v0
467 ; GFX803-NEXT: ;;#ASMEND
468 ; GFX803-NEXT: s_endpgm
470 ; GFX7-LABEL: v_pack_v2i16_imm_hi:
472 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
473 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000
474 ; GFX7-NEXT: s_mov_b32 s2, 0
475 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
476 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
477 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
478 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
479 ; GFX7-NEXT: s_waitcnt vmcnt(0)
480 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
481 ; GFX7-NEXT: v_or_b32_e32 v0, 0x7b0000, v0
482 ; GFX7-NEXT: ;;#ASMSTART
483 ; GFX7-NEXT: ; use v0
484 ; GFX7-NEXT: ;;#ASMEND
485 ; GFX7-NEXT: s_endpgm
486 %tid = call i32 @llvm.amdgcn.workitem.id.x()
487 %tid.ext = sext i32 %tid to i64
488 %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
489 %val0 = load volatile i32, ptr addrspace(1) %in0.gep
490 %lo = trunc i32 %val0 to i16
491 %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
; Element 1 is the constant; only element 0 depends on the loaded value.
492 %vec.1 = insertelement <2 x i16> %vec.0, i16 123, i32 1
493 %vec.i32 = bitcast <2 x i16> %vec.1 to i32
494 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Variant of @v_pack_v2i16_imm_hi with the constant 7 in element 1 instead of
; 123; element 0 is again the low 16 bits of an i32 read with a volatile load
; from the addrspace(1) pointer %in0 indexed by the workitem id. The vector is
; bitcast to i32 and handed to inline asm through a "v" constraint.
; Expected code per the checks below: 7 appears directly as an operand of
; v_perm_b32 for the gfx900 run line (no separate s_movk_i32); v_mov_b32 of
; 0x70000 (7 shifted left by 16) feeding a v_or_b32_sdwa with src0_sel:WORD_0
; for the fiji run line; v_and_b32 with 0xffff followed by v_or_b32 with
; 0x70000 for the kaveri run line.
498 define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 {
499 ; GFX9-LABEL: v_pack_v2i16_inline_imm_hi:
501 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
502 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
503 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100
504 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
505 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
506 ; GFX9-NEXT: s_waitcnt vmcnt(0)
507 ; GFX9-NEXT: v_perm_b32 v0, 7, v0, v1
508 ; GFX9-NEXT: ;;#ASMSTART
509 ; GFX9-NEXT: ; use v0
510 ; GFX9-NEXT: ;;#ASMEND
511 ; GFX9-NEXT: s_endpgm
513 ; GFX803-LABEL: v_pack_v2i16_inline_imm_hi:
515 ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
516 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
517 ; GFX803-NEXT: s_waitcnt lgkmcnt(0)
518 ; GFX803-NEXT: v_mov_b32_e32 v1, s1
519 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
520 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
521 ; GFX803-NEXT: flat_load_dword v0, v[0:1] glc
522 ; GFX803-NEXT: s_waitcnt vmcnt(0)
523 ; GFX803-NEXT: v_mov_b32_e32 v1, 0x70000
524 ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
525 ; GFX803-NEXT: ;;#ASMSTART
526 ; GFX803-NEXT: ; use v0
527 ; GFX803-NEXT: ;;#ASMEND
528 ; GFX803-NEXT: s_endpgm
530 ; GFX7-LABEL: v_pack_v2i16_inline_imm_hi:
532 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
533 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000
534 ; GFX7-NEXT: s_mov_b32 s2, 0
535 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
536 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
537 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
538 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
539 ; GFX7-NEXT: s_waitcnt vmcnt(0)
540 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
541 ; GFX7-NEXT: v_or_b32_e32 v0, 0x70000, v0
542 ; GFX7-NEXT: ;;#ASMSTART
543 ; GFX7-NEXT: ; use v0
544 ; GFX7-NEXT: ;;#ASMEND
545 ; GFX7-NEXT: s_endpgm
546 %tid = call i32 @llvm.amdgcn.workitem.id.x()
547 %tid.ext = sext i32 %tid to i64
548 %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
549 %val0 = load volatile i32, ptr addrspace(1) %in0.gep
550 %lo = trunc i32 %val0 to i16
551 %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
; Element 1 is the constant 7; only element 0 depends on the loaded value.
552 %vec.1 = insertelement <2 x i16> %vec.0, i16 7, i32 1
553 %vec.i32 = bitcast <2 x i16> %vec.1 to i32
554 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Intrinsic called by the v_pack_* kernels above; its i32 result is
; sign-extended and used as the element index into their input pointers.
558 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is attached to every kernel definition and inline-asm call
; visible above; group #1 is attached only to the intrinsic declaration.
560 attributes #0 = { nounwind }
561 attributes #1 = { nounwind readnone }