1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
; Scalar pack of two loaded values into a <2 x half>.
; Both inputs are volatile i32 loads through the addrspace(4) pointers %in0 and
; %in1. Each is truncated to i16, bitcast to half and inserted into the vector
; (%lo at index 0, %hi at index 1); the vector is then bitcast to i32 and passed
; to inline asm through an "s" constraint.
; The GFX9 checks expect one s_pack_ll_b32_b16, while the GFX8 and GFX7 checks
; expect the s_and_b32 / s_lshl_b32 / s_or_b32 sequence.
7 define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 {
8 ; GFX9-LABEL: s_pack_v2f16:
10 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
11 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
12 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
13 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
16 ; GFX9-NEXT: ;;#ASMSTART
18 ; GFX9-NEXT: ;;#ASMEND
21 ; GFX8-LABEL: s_pack_v2f16:
23 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
25 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0
26 ; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0
27 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
28 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
29 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16
30 ; GFX8-NEXT: s_or_b32 s0, s0, s1
31 ; GFX8-NEXT: ;;#ASMSTART
33 ; GFX8-NEXT: ;;#ASMEND
36 ; GFX7-LABEL: s_pack_v2f16:
38 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
39 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
41 ; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0
42 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
43 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
44 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
45 ; GFX7-NEXT: s_or_b32 s0, s0, s1
46 ; GFX7-NEXT: ;;#ASMSTART
48 ; GFX7-NEXT: ;;#ASMEND
50 %val0 = load volatile i32, ptr addrspace(4) %in0
51 %val1 = load volatile i32, ptr addrspace(4) %in1
52 %lo.i = trunc i32 %val0 to i16
53 %hi.i = trunc i32 %val1 to i16
54 %lo = bitcast i16 %lo.i to half
55 %hi = bitcast i16 %hi.i to half
56 %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
57 %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
58 %vec.i32 = bitcast <2 x half> %vec.1 to i32
60 call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
; Scalar pack with a constant low element.
; Element 0 is the literal half 0xH1234; element 1 comes from a (non-volatile)
; i32 load through %in1, truncated to i16 and bitcast to half. The packed
; vector is bitcast to i32 and passed to inline asm through an "s" constraint.
; The GFX9 checks expect s_pack_ll_b32_b16 with 0x1234 as the first source; the
; GFX8 and GFX7 checks expect a shift left by 16 followed by an OR with 0x1234.
64 define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 {
65 ; GFX9-LABEL: s_pack_v2f16_imm_lo:
67 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
68 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
70 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
71 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, 0x1234, s0
72 ; GFX9-NEXT: ;;#ASMSTART
74 ; GFX9-NEXT: ;;#ASMEND
77 ; GFX8-LABEL: s_pack_v2f16_imm_lo:
79 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
80 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
81 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0
82 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
83 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16
84 ; GFX8-NEXT: s_or_b32 s0, s0, 0x1234
85 ; GFX8-NEXT: ;;#ASMSTART
87 ; GFX8-NEXT: ;;#ASMEND
90 ; GFX7-LABEL: s_pack_v2f16_imm_lo:
92 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
93 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
94 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
95 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16
97 ; GFX7-NEXT: s_or_b32 s0, s0, 0x1234
98 ; GFX7-NEXT: ;;#ASMSTART
100 ; GFX7-NEXT: ;;#ASMEND
101 ; GFX7-NEXT: s_endpgm
102 %val1 = load i32, ptr addrspace(4) %in1
103 %hi.i = trunc i32 %val1 to i16
104 %hi = bitcast i16 %hi.i to half
105 %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
106 %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
107 %vec.i32 = bitcast <2 x half> %vec.1 to i32
109 call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
; Scalar pack with a constant high element.
; Element 0 comes from a (non-volatile) i32 load through %in0, truncated to i16
; and bitcast to half; element 1 is the literal half 0xH1234. The packed vector
; is bitcast to i32 and passed to inline asm through an "s" constraint.
; The GFX9 checks expect s_pack_ll_b32_b16 with 0x1234 as the second source;
; the GFX8 and GFX7 checks expect a mask with 0xffff followed by an OR with
; 0x12340000 (the constant already placed in the upper 16 bits).
113 define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 {
114 ; GFX9-LABEL: s_pack_v2f16_imm_hi:
116 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
117 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
118 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
119 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
120 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x1234
121 ; GFX9-NEXT: ;;#ASMSTART
122 ; GFX9-NEXT: ; use s0
123 ; GFX9-NEXT: ;;#ASMEND
124 ; GFX9-NEXT: s_endpgm
126 ; GFX8-LABEL: s_pack_v2f16_imm_hi:
128 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
129 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
130 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0
131 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
132 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
133 ; GFX8-NEXT: s_or_b32 s0, s0, 0x12340000
134 ; GFX8-NEXT: ;;#ASMSTART
135 ; GFX8-NEXT: ; use s0
136 ; GFX8-NEXT: ;;#ASMEND
137 ; GFX8-NEXT: s_endpgm
139 ; GFX7-LABEL: s_pack_v2f16_imm_hi:
141 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
142 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
143 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
144 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
145 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
146 ; GFX7-NEXT: s_or_b32 s0, s0, 0x12340000
147 ; GFX7-NEXT: ;;#ASMSTART
148 ; GFX7-NEXT: ; use s0
149 ; GFX7-NEXT: ;;#ASMEND
150 ; GFX7-NEXT: s_endpgm
151 %val0 = load i32, ptr addrspace(4) %in0
152 %lo.i = trunc i32 %val0 to i16
153 %lo = bitcast i16 %lo.i to half
154 %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
155 %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1
156 %vec.i32 = bitcast <2 x half> %vec.1 to i32
158 call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
; Vector pack of two loaded values into a <2 x half>.
; Each work item indexes %in0 and %in1 (addrspace(1)) by the sign-extended
; llvm.amdgcn.workitem.id.x value and does a volatile i32 load from each. The
; loaded values are truncated to i16, bitcast to half and inserted at indices 0
; and 1; the vector is bitcast to i32 and passed to inline asm through a "v"
; constraint.
; The GFX9 checks expect v_perm_b32 with selector 0x5040100, the GFX8 checks
; expect v_perm_b32 with selector 0x1000504 and the data operands in the
; opposite order, and the GFX7 checks expect v_and_b32 / v_lshlrev_b32 /
; v_or_b32.
162 define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
163 ; GFX9-LABEL: v_pack_v2f16:
165 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
166 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
167 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
168 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
169 ; GFX9-NEXT: s_waitcnt vmcnt(0)
170 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
171 ; GFX9-NEXT: s_waitcnt vmcnt(0)
172 ; GFX9-NEXT: s_mov_b32 s0, 0x5040100
173 ; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0
174 ; GFX9-NEXT: ;;#ASMSTART
175 ; GFX9-NEXT: ; use v0
176 ; GFX9-NEXT: ;;#ASMEND
177 ; GFX9-NEXT: s_endpgm
179 ; GFX8-LABEL: v_pack_v2f16:
181 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
182 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
183 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
184 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
185 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
186 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
187 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
188 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
189 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
190 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
191 ; GFX8-NEXT: s_waitcnt vmcnt(0)
192 ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
193 ; GFX8-NEXT: s_waitcnt vmcnt(0)
194 ; GFX8-NEXT: s_mov_b32 s0, 0x1000504
195 ; GFX8-NEXT: v_perm_b32 v0, v0, v1, s0
196 ; GFX8-NEXT: ;;#ASMSTART
197 ; GFX8-NEXT: ; use v0
198 ; GFX8-NEXT: ;;#ASMEND
199 ; GFX8-NEXT: s_endpgm
201 ; GFX7-LABEL: v_pack_v2f16:
203 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
204 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000
205 ; GFX7-NEXT: s_mov_b32 s6, 0
206 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
207 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
208 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
209 ; GFX7-NEXT: s_mov_b64 s[4:5], s[0:1]
210 ; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
211 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
212 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
213 ; GFX7-NEXT: s_waitcnt vmcnt(0)
214 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
215 ; GFX7-NEXT: s_waitcnt vmcnt(0)
216 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
217 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
218 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
219 ; GFX7-NEXT: ;;#ASMSTART
220 ; GFX7-NEXT: ; use v0
221 ; GFX7-NEXT: ;;#ASMEND
222 ; GFX7-NEXT: s_endpgm
223 %tid = call i32 @llvm.amdgcn.workitem.id.x()
224 %tid.ext = sext i32 %tid to i64
225 %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
226 %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
227 %val0 = load volatile i32, ptr addrspace(1) %in0.gep
228 %val1 = load volatile i32, ptr addrspace(1) %in1.gep
229 %lo.i = trunc i32 %val0 to i16
230 %hi.i = trunc i32 %val1 to i16
231 %lo = bitcast i16 %lo.i to half
232 %hi = bitcast i16 %hi.i to half
233 %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
234 %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
235 %vec.i32 = bitcast <2 x half> %vec.1 to i32
236 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Vector pack whose result feeds ordinary IR instead of inline asm.
; The <2 x half> is built the same way as in the inline-asm vector case (two
; volatile per-work-item i32 loads, trunc to i16, bitcast to half, insert at
; indices 0 and 1), but the i32 bitcast is then incremented by 9 and written
; with a volatile store to an undef addrspace(1) pointer.
; The checks expect the same packing sequences per target (v_perm_b32 on GFX9
; and GFX8, and/shift/or on GFX7) followed by an add of 9 and a
; buffer_store_dword.
240 define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
241 ; GFX9-LABEL: v_pack_v2f16_user:
243 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
244 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
245 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
246 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
247 ; GFX9-NEXT: s_waitcnt vmcnt(0)
248 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
249 ; GFX9-NEXT: s_waitcnt vmcnt(0)
250 ; GFX9-NEXT: s_mov_b32 s0, 0x5040100
251 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
252 ; GFX9-NEXT: s_mov_b32 s2, -1
253 ; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0
254 ; GFX9-NEXT: v_add_u32_e32 v0, 9, v0
255 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
256 ; GFX9-NEXT: s_waitcnt vmcnt(0)
257 ; GFX9-NEXT: s_endpgm
259 ; GFX8-LABEL: v_pack_v2f16_user:
261 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
262 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
263 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
264 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
265 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
266 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
267 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
268 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
269 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
270 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
271 ; GFX8-NEXT: s_waitcnt vmcnt(0)
272 ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
273 ; GFX8-NEXT: s_waitcnt vmcnt(0)
274 ; GFX8-NEXT: s_mov_b32 s0, 0x1000504
275 ; GFX8-NEXT: s_mov_b32 s3, 0x1100f000
276 ; GFX8-NEXT: s_mov_b32 s2, -1
277 ; GFX8-NEXT: v_perm_b32 v0, v0, v1, s0
278 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 9, v0
279 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
280 ; GFX8-NEXT: s_waitcnt vmcnt(0)
281 ; GFX8-NEXT: s_endpgm
283 ; GFX7-LABEL: v_pack_v2f16_user:
285 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
286 ; GFX7-NEXT: s_mov_b32 s6, 0
287 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000
288 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
289 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
290 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
291 ; GFX7-NEXT: s_mov_b64 s[4:5], s[0:1]
292 ; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
293 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
294 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
295 ; GFX7-NEXT: s_waitcnt vmcnt(0)
296 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
297 ; GFX7-NEXT: s_waitcnt vmcnt(0)
298 ; GFX7-NEXT: s_mov_b32 s6, -1
299 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
300 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
301 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
302 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 9, v0
303 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
304 ; GFX7-NEXT: s_waitcnt vmcnt(0)
305 ; GFX7-NEXT: s_endpgm
306 %tid = call i32 @llvm.amdgcn.workitem.id.x()
307 %tid.ext = sext i32 %tid to i64
308 %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
309 %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
310 %val0 = load volatile i32, ptr addrspace(1) %in0.gep
311 %val1 = load volatile i32, ptr addrspace(1) %in1.gep
312 %lo.i = trunc i32 %val0 to i16
313 %hi.i = trunc i32 %val1 to i16
314 %lo = bitcast i16 %lo.i to half
315 %hi = bitcast i16 %hi.i to half
316 %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
317 %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
318 %vec.i32 = bitcast <2 x half> %vec.1 to i32
319 %foo = add i32 %vec.i32, 9
320 store volatile i32 %foo, ptr addrspace(1) undef
; Vector pack with a constant low element.
; Element 0 is the literal half 0xH1234; element 1 comes from a volatile
; per-work-item i32 load from %in1, truncated to i16 and bitcast to half. The
; packed vector is bitcast to i32 and passed to inline asm through a "v"
; constraint.
; The GFX9 checks expect 0x1234 to be materialised with s_movk_i32 and combined
; by v_perm_b32; the GFX8 and GFX7 checks expect a shift left by 16 followed by
; an OR with 0x1234.
324 define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
325 ; GFX9-LABEL: v_pack_v2f16_imm_lo:
327 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
328 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
329 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100
330 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
331 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
332 ; GFX9-NEXT: s_waitcnt vmcnt(0)
333 ; GFX9-NEXT: s_movk_i32 s0, 0x1234
334 ; GFX9-NEXT: v_perm_b32 v0, v0, s0, v1
335 ; GFX9-NEXT: ;;#ASMSTART
336 ; GFX9-NEXT: ; use v0
337 ; GFX9-NEXT: ;;#ASMEND
338 ; GFX9-NEXT: s_endpgm
340 ; GFX8-LABEL: v_pack_v2f16_imm_lo:
342 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
343 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
344 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
345 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
346 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
347 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
348 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
349 ; GFX8-NEXT: s_waitcnt vmcnt(0)
350 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
351 ; GFX8-NEXT: v_or_b32_e32 v0, 0x1234, v0
352 ; GFX8-NEXT: ;;#ASMSTART
353 ; GFX8-NEXT: ; use v0
354 ; GFX8-NEXT: ;;#ASMEND
355 ; GFX8-NEXT: s_endpgm
357 ; GFX7-LABEL: v_pack_v2f16_imm_lo:
359 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
360 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000
361 ; GFX7-NEXT: s_mov_b32 s2, 0
362 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
363 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
364 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
365 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
366 ; GFX7-NEXT: s_waitcnt vmcnt(0)
367 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
368 ; GFX7-NEXT: v_or_b32_e32 v0, 0x1234, v0
369 ; GFX7-NEXT: ;;#ASMSTART
370 ; GFX7-NEXT: ; use v0
371 ; GFX7-NEXT: ;;#ASMEND
372 ; GFX7-NEXT: s_endpgm
373 %tid = call i32 @llvm.amdgcn.workitem.id.x()
374 %tid.ext = sext i32 %tid to i64
375 %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
376 %val1 = load volatile i32, ptr addrspace(1) %in1.gep
377 %hi.i = trunc i32 %val1 to i16
378 %hi = bitcast i16 %hi.i to half
379 %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
380 %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
381 %vec.i32 = bitcast <2 x half> %vec.1 to i32
382 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Vector pack whose low element is the floating-point constant half 4.0.
; Element 1 comes from a volatile per-work-item i32 load from %in1, truncated
; to i16 and bitcast to half. The packed vector is bitcast to i32 and passed to
; inline asm through a "v" constraint.
; In every check block the constant appears as the bit pattern 0x4400: the GFX9
; checks load it with s_movk_i32 ahead of v_perm_b32, and the GFX8 and GFX7
; checks OR it into the loaded value after shifting that value left by 16.
386 define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 {
387 ; GFX9-LABEL: v_pack_v2f16_inline_imm_lo:
389 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
390 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
391 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100
392 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
393 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
394 ; GFX9-NEXT: s_waitcnt vmcnt(0)
395 ; GFX9-NEXT: s_movk_i32 s0, 0x4400
396 ; GFX9-NEXT: v_perm_b32 v0, v0, s0, v1
397 ; GFX9-NEXT: ;;#ASMSTART
398 ; GFX9-NEXT: ; use v0
399 ; GFX9-NEXT: ;;#ASMEND
400 ; GFX9-NEXT: s_endpgm
402 ; GFX8-LABEL: v_pack_v2f16_inline_imm_lo:
404 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
405 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
406 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
407 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
408 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
409 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
410 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
411 ; GFX8-NEXT: s_waitcnt vmcnt(0)
412 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
413 ; GFX8-NEXT: v_or_b32_e32 v0, 0x4400, v0
414 ; GFX8-NEXT: ;;#ASMSTART
415 ; GFX8-NEXT: ; use v0
416 ; GFX8-NEXT: ;;#ASMEND
417 ; GFX8-NEXT: s_endpgm
419 ; GFX7-LABEL: v_pack_v2f16_inline_imm_lo:
421 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
422 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000
423 ; GFX7-NEXT: s_mov_b32 s2, 0
424 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
425 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
426 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
427 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
428 ; GFX7-NEXT: s_waitcnt vmcnt(0)
429 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
430 ; GFX7-NEXT: v_or_b32_e32 v0, 0x4400, v0
431 ; GFX7-NEXT: ;;#ASMSTART
432 ; GFX7-NEXT: ; use v0
433 ; GFX7-NEXT: ;;#ASMEND
434 ; GFX7-NEXT: s_endpgm
435 %tid = call i32 @llvm.amdgcn.workitem.id.x()
436 %tid.ext = sext i32 %tid to i64
437 %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
438 %val1 = load volatile i32, ptr addrspace(1) %in1.gep
439 %hi.i = trunc i32 %val1 to i16
440 %hi = bitcast i16 %hi.i to half
441 %vec.0 = insertelement <2 x half> undef, half 4.0, i32 0
442 %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
443 %vec.i32 = bitcast <2 x half> %vec.1 to i32
444 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Vector pack with a constant high element.
; Element 0 comes from a volatile per-work-item i32 load from %in0, truncated
; to i16 and bitcast to half; element 1 is the literal half 0xH1234. The packed
; vector is bitcast to i32 and passed to inline asm through a "v" constraint.
; The GFX9 checks expect s_movk_i32 0x1234 feeding v_perm_b32. The GFX8 checks
; expect 0x12340000 in a VGPR combined by v_or_b32_sdwa with src0_sel:WORD_0,
; and the GFX7 checks expect a mask with 0xffff followed by an OR with
; 0x12340000.
448 define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
449 ; GFX9-LABEL: v_pack_v2f16_imm_hi:
451 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
452 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
453 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100
454 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
455 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
456 ; GFX9-NEXT: s_waitcnt vmcnt(0)
457 ; GFX9-NEXT: s_movk_i32 s0, 0x1234
458 ; GFX9-NEXT: v_perm_b32 v0, s0, v0, v1
459 ; GFX9-NEXT: ;;#ASMSTART
460 ; GFX9-NEXT: ; use v0
461 ; GFX9-NEXT: ;;#ASMEND
462 ; GFX9-NEXT: s_endpgm
464 ; GFX8-LABEL: v_pack_v2f16_imm_hi:
466 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
467 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
468 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
469 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
470 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
471 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
472 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
473 ; GFX8-NEXT: s_waitcnt vmcnt(0)
474 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x12340000
475 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
476 ; GFX8-NEXT: ;;#ASMSTART
477 ; GFX8-NEXT: ; use v0
478 ; GFX8-NEXT: ;;#ASMEND
479 ; GFX8-NEXT: s_endpgm
481 ; GFX7-LABEL: v_pack_v2f16_imm_hi:
483 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
484 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000
485 ; GFX7-NEXT: s_mov_b32 s2, 0
486 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
487 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
488 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
489 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
490 ; GFX7-NEXT: s_waitcnt vmcnt(0)
491 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
492 ; GFX7-NEXT: v_or_b32_e32 v0, 0x12340000, v0
493 ; GFX7-NEXT: ;;#ASMSTART
494 ; GFX7-NEXT: ; use v0
495 ; GFX7-NEXT: ;;#ASMEND
496 ; GFX7-NEXT: s_endpgm
497 %tid = call i32 @llvm.amdgcn.workitem.id.x()
498 %tid.ext = sext i32 %tid to i64
499 %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
500 %val0 = load volatile i32, ptr addrspace(1) %in0.gep
501 %lo.i = trunc i32 %val0 to i16
502 %lo = bitcast i16 %lo.i to half
503 %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
504 %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1
505 %vec.i32 = bitcast <2 x half> %vec.1 to i32
506 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Vector pack whose high element is the floating-point constant half 1.0.
; Element 0 comes from a volatile per-work-item i32 load from %in0, truncated
; to i16 and bitcast to half. The packed vector is bitcast to i32 and passed to
; inline asm through a "v" constraint.
; The GFX9 checks expect the bit pattern 0x3c00 via s_movk_i32 feeding
; v_perm_b32. The GFX8 checks expect the pre-shifted constant to be produced by
; v_bfrev_b32 of 60 and merged with v_or_b32_sdwa, and the GFX7 checks expect a
; mask with 0xffff followed by an OR with 0x3c000000.
510 define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) #0 {
511 ; GFX9-LABEL: v_pack_v2f16_inline_f16imm_hi:
513 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
514 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
515 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100
516 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
517 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
518 ; GFX9-NEXT: s_waitcnt vmcnt(0)
519 ; GFX9-NEXT: s_movk_i32 s0, 0x3c00
520 ; GFX9-NEXT: v_perm_b32 v0, s0, v0, v1
521 ; GFX9-NEXT: ;;#ASMSTART
522 ; GFX9-NEXT: ; use v0
523 ; GFX9-NEXT: ;;#ASMEND
524 ; GFX9-NEXT: s_endpgm
526 ; GFX8-LABEL: v_pack_v2f16_inline_f16imm_hi:
528 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
529 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
530 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
531 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
532 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
533 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
534 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
535 ; GFX8-NEXT: s_waitcnt vmcnt(0)
536 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 60
537 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
538 ; GFX8-NEXT: ;;#ASMSTART
539 ; GFX8-NEXT: ; use v0
540 ; GFX8-NEXT: ;;#ASMEND
541 ; GFX8-NEXT: s_endpgm
543 ; GFX7-LABEL: v_pack_v2f16_inline_f16imm_hi:
545 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
546 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000
547 ; GFX7-NEXT: s_mov_b32 s2, 0
548 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
549 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
550 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
551 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
552 ; GFX7-NEXT: s_waitcnt vmcnt(0)
553 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
554 ; GFX7-NEXT: v_or_b32_e32 v0, 0x3c000000, v0
555 ; GFX7-NEXT: ;;#ASMSTART
556 ; GFX7-NEXT: ; use v0
557 ; GFX7-NEXT: ;;#ASMEND
558 ; GFX7-NEXT: s_endpgm
559 %tid = call i32 @llvm.amdgcn.workitem.id.x()
560 %tid.ext = sext i32 %tid to i64
561 %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
562 %val0 = load volatile i32, ptr addrspace(1) %in0.gep
563 %lo.i = trunc i32 %val0 to i16
564 %lo = bitcast i16 %lo.i to half
565 %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
566 %vec.1 = insertelement <2 x half> %vec.0, half 1.0, i32 1
567 %vec.i32 = bitcast <2 x half> %vec.1 to i32
568 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Vector pack whose high element is the half bit pattern 0xH0040 (integer 64).
; Element 0 comes from a volatile per-work-item i32 load from %in0, truncated
; to i16 and bitcast to half. The packed vector is bitcast to i32 and passed to
; inline asm through a "v" constraint.
; The GFX9 checks expect the constant to appear directly as the operand 64 of
; v_perm_b32, with no separate move. The GFX8 checks expect 0x400000 in a VGPR
; merged by v_or_b32_sdwa, and the GFX7 checks expect a mask with 0xffff
; followed by an OR with 0x400000.
572 define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 {
573 ; GFX9-LABEL: v_pack_v2f16_inline_imm_hi:
575 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
576 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
577 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100
578 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
579 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
580 ; GFX9-NEXT: s_waitcnt vmcnt(0)
581 ; GFX9-NEXT: v_perm_b32 v0, 64, v0, v1
582 ; GFX9-NEXT: ;;#ASMSTART
583 ; GFX9-NEXT: ; use v0
584 ; GFX9-NEXT: ;;#ASMEND
585 ; GFX9-NEXT: s_endpgm
587 ; GFX8-LABEL: v_pack_v2f16_inline_imm_hi:
589 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
590 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
591 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
592 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
593 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
594 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
595 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
596 ; GFX8-NEXT: s_waitcnt vmcnt(0)
597 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x400000
598 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
599 ; GFX8-NEXT: ;;#ASMSTART
600 ; GFX8-NEXT: ; use v0
601 ; GFX8-NEXT: ;;#ASMEND
602 ; GFX8-NEXT: s_endpgm
604 ; GFX7-LABEL: v_pack_v2f16_inline_imm_hi:
606 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
607 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000
608 ; GFX7-NEXT: s_mov_b32 s2, 0
609 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
610 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
611 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
612 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
613 ; GFX7-NEXT: s_waitcnt vmcnt(0)
614 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
615 ; GFX7-NEXT: v_or_b32_e32 v0, 0x400000, v0
616 ; GFX7-NEXT: ;;#ASMSTART
617 ; GFX7-NEXT: ; use v0
618 ; GFX7-NEXT: ;;#ASMEND
619 ; GFX7-NEXT: s_endpgm
620 %tid = call i32 @llvm.amdgcn.workitem.id.x()
621 %tid.ext = sext i32 %tid to i64
622 %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
623 %val0 = load volatile i32, ptr addrspace(1) %in0.gep
624 %lo.i = trunc i32 %val0 to i16
625 %lo = bitcast i16 %lo.i to half
626 %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
627 %vec.1 = insertelement <2 x half> %vec.0, half 0xH0040, i32 1
628 %vec.i32 = bitcast <2 x half> %vec.1 to i32
629 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
633 declare i32 @llvm.amdgcn.workitem.id.x() #1
635 attributes #0 = { nounwind }
636 attributes #1 = { nounwind readnone }