1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
6 ; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
; Converts the i64 kernel argument %in to half with sitofp and stores the
; result through %out. The assertions below are autogenerated; regenerate them
; with the update script instead of editing them by hand.
; In each checked sequence the value is shifted left as a 64-bit quantity,
; min(low dword, 1) is ORed into the high dword (presumably a sticky bit for
; rounding), and the result goes through v_cvt_f32_i32, v_ldexp_f32 and finally
; v_cvt_f16_f32 to narrow to half.
8 define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 {
9 ; GFX6-LABEL: s_sint_to_fp_i64_to_f16:
11 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
12 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
13 ; GFX6-NEXT: s_mov_b32 s6, -1
14 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX6-NEXT: s_mov_b32 s4, s0
16 ; GFX6-NEXT: s_mov_b32 s5, s1
17 ; GFX6-NEXT: s_flbit_i32 s0, s3
18 ; GFX6-NEXT: s_xor_b32 s1, s2, s3
19 ; GFX6-NEXT: s_add_i32 s0, s0, -1
20 ; GFX6-NEXT: s_ashr_i32 s1, s1, 31
21 ; GFX6-NEXT: s_add_i32 s1, s1, 32
22 ; GFX6-NEXT: s_min_u32 s8, s0, s1
23 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
24 ; GFX6-NEXT: s_min_u32 s0, s0, 1
25 ; GFX6-NEXT: s_or_b32 s0, s1, s0
26 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0
27 ; GFX6-NEXT: s_sub_i32 s0, 32, s8
28 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
29 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
30 ; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0
33 ; GFX8-LABEL: s_sint_to_fp_i64_to_f16:
35 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
36 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
37 ; GFX8-NEXT: s_xor_b32 s5, s2, s3
38 ; GFX8-NEXT: s_flbit_i32 s4, s3
39 ; GFX8-NEXT: s_ashr_i32 s5, s5, 31
40 ; GFX8-NEXT: s_add_i32 s4, s4, -1
41 ; GFX8-NEXT: s_add_i32 s5, s5, 32
42 ; GFX8-NEXT: s_min_u32 s4, s4, s5
43 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
44 ; GFX8-NEXT: s_min_u32 s2, s2, 1
45 ; GFX8-NEXT: s_or_b32 s2, s3, s2
46 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
47 ; GFX8-NEXT: s_sub_i32 s2, 32, s4
48 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
49 ; GFX8-NEXT: v_ldexp_f32 v0, v0, s2
50 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v0
51 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
52 ; GFX8-NEXT: flat_store_short v[0:1], v2
55 ; GFX11-LABEL: s_sint_to_fp_i64_to_f16:
57 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
58 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
59 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
60 ; GFX11-NEXT: s_xor_b32 s4, s2, s3
61 ; GFX11-NEXT: s_cls_i32 s5, s3
62 ; GFX11-NEXT: s_ashr_i32 s4, s4, 31
63 ; GFX11-NEXT: s_add_i32 s5, s5, -1
64 ; GFX11-NEXT: s_add_i32 s4, s4, 32
65 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
66 ; GFX11-NEXT: s_min_u32 s4, s5, s4
67 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
68 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
69 ; GFX11-NEXT: s_min_u32 s2, s2, 1
70 ; GFX11-NEXT: s_or_b32 s2, s3, s2
71 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
72 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
73 ; GFX11-NEXT: s_sub_i32 s2, 32, s4
74 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
75 ; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
76 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
77 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
78 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
80 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
81 ; GFX11-NEXT: s_endpgm
; IR under test: a single signed i64 -> half conversion followed by the store.
82 %result = sitofp i64 %in to half
83 store half %result, ptr addrspace(1) %out
; Per-work-item variant of the i64 -> half conversion: each work item loads an
; i64 from %in at the index returned by llvm.amdgcn.workitem.id.x, converts it
; with sitofp, and stores the half result to %out at the same index.
; The checked sequences use vector instructions (v_ffbh_i32 or v_cls_i32,
; 64-bit vector shift, v_min_u32, v_or_b32) ahead of v_cvt_f32_i32,
; v_ldexp_f32 and v_cvt_f16_f32. Assertions are autogenerated; do not hand-edit.
87 define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
88 ; GFX6-LABEL: v_sint_to_fp_i64_to_f16:
90 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
91 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
92 ; GFX6-NEXT: s_mov_b32 s6, 0
93 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0
94 ; GFX6-NEXT: v_mov_b32_e32 v2, 0
95 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
97 ; GFX6-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
98 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v0
99 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
100 ; GFX6-NEXT: s_waitcnt vmcnt(0)
101 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
102 ; GFX6-NEXT: v_ffbh_i32_e32 v5, v4
103 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
104 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, -1, v5
105 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
106 ; GFX6-NEXT: v_min_u32_e32 v0, v5, v0
107 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
108 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
109 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
110 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
111 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 32, v0
112 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v3, v0
113 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
114 ; GFX6-NEXT: buffer_store_short v0, v[1:2], s[0:3], 0 addr64
115 ; GFX6-NEXT: s_endpgm
117 ; GFX8-LABEL: v_sint_to_fp_i64_to_f16:
119 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
120 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
121 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
122 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
123 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
124 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
125 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
126 ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
127 ; GFX8-NEXT: s_waitcnt vmcnt(0)
128 ; GFX8-NEXT: v_xor_b32_e32 v3, v1, v2
129 ; GFX8-NEXT: v_ffbh_i32_e32 v4, v2
130 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v3
131 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
132 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 32, v3
133 ; GFX8-NEXT: v_min_u32_e32 v3, v4, v3
134 ; GFX8-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2]
135 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v3
136 ; GFX8-NEXT: v_min_u32_e32 v1, 1, v1
137 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
138 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
139 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
140 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
141 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v3
142 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v1
143 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
144 ; GFX8-NEXT: flat_store_short v[0:1], v3
145 ; GFX8-NEXT: s_endpgm
147 ; GFX11-LABEL: v_sint_to_fp_i64_to_f16:
149 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
150 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
151 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
152 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
153 ; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
154 ; GFX11-NEXT: s_waitcnt vmcnt(0)
155 ; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2
156 ; GFX11-NEXT: v_cls_i32_e32 v4, v2
157 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
158 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v3
159 ; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v4
160 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
161 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 32, v3
162 ; GFX11-NEXT: v_min_u32_e32 v3, v4, v3
163 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
164 ; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2]
165 ; GFX11-NEXT: v_min_u32_e32 v1, 1, v1
166 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
167 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
168 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3
169 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
170 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
171 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
172 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
173 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
174 ; GFX11-NEXT: s_nop 0
175 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
176 ; GFX11-NEXT: s_endpgm
; IR under test: index both pointers by the work-item id, load the i64,
; convert it to half, and store the result.
177 %tid = call i32 @llvm.amdgcn.workitem.id.x()
178 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
179 %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
180 %val = load i64, ptr addrspace(1) %in.gep
181 %result = sitofp i64 %val to half
182 store half %result, ptr addrspace(1) %out.gep
; Converts the i64 kernel argument %in to float with sitofp and stores the
; result through %out.
; The checked sequences end with v_cvt_f32_i32 followed by v_ldexp_f32, and the
; v_ldexp_f32 result is stored directly as a dword (no narrowing conversion).
; Assertions are autogenerated; regenerate them instead of hand-editing.
186 define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 {
187 ; GFX6-LABEL: s_sint_to_fp_i64_to_f32:
189 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
190 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
191 ; GFX6-NEXT: s_mov_b32 s6, -1
192 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
193 ; GFX6-NEXT: s_mov_b32 s4, s0
194 ; GFX6-NEXT: s_mov_b32 s5, s1
195 ; GFX6-NEXT: s_flbit_i32 s0, s3
196 ; GFX6-NEXT: s_xor_b32 s1, s2, s3
197 ; GFX6-NEXT: s_add_i32 s0, s0, -1
198 ; GFX6-NEXT: s_ashr_i32 s1, s1, 31
199 ; GFX6-NEXT: s_add_i32 s1, s1, 32
200 ; GFX6-NEXT: s_min_u32 s8, s0, s1
201 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
202 ; GFX6-NEXT: s_min_u32 s0, s0, 1
203 ; GFX6-NEXT: s_or_b32 s0, s1, s0
204 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0
205 ; GFX6-NEXT: s_sub_i32 s0, 32, s8
206 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
207 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
208 ; GFX6-NEXT: s_endpgm
210 ; GFX8-LABEL: s_sint_to_fp_i64_to_f32:
212 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
213 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX8-NEXT: s_xor_b32 s5, s2, s3
215 ; GFX8-NEXT: s_flbit_i32 s4, s3
216 ; GFX8-NEXT: s_ashr_i32 s5, s5, 31
217 ; GFX8-NEXT: s_add_i32 s4, s4, -1
218 ; GFX8-NEXT: s_add_i32 s5, s5, 32
219 ; GFX8-NEXT: s_min_u32 s4, s4, s5
220 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
221 ; GFX8-NEXT: s_min_u32 s2, s2, 1
222 ; GFX8-NEXT: s_or_b32 s2, s3, s2
223 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
224 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
225 ; GFX8-NEXT: s_sub_i32 s0, 32, s4
226 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
227 ; GFX8-NEXT: v_ldexp_f32 v2, v2, s0
228 ; GFX8-NEXT: flat_store_dword v[0:1], v2
229 ; GFX8-NEXT: s_endpgm
231 ; GFX11-LABEL: s_sint_to_fp_i64_to_f32:
233 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
234 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
235 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
236 ; GFX11-NEXT: s_xor_b32 s4, s2, s3
237 ; GFX11-NEXT: s_cls_i32 s5, s3
238 ; GFX11-NEXT: s_ashr_i32 s4, s4, 31
239 ; GFX11-NEXT: s_add_i32 s5, s5, -1
240 ; GFX11-NEXT: s_add_i32 s4, s4, 32
241 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
242 ; GFX11-NEXT: s_min_u32 s4, s5, s4
243 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
244 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
245 ; GFX11-NEXT: s_min_u32 s2, s2, 1
246 ; GFX11-NEXT: s_or_b32 s2, s3, s2
247 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
248 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
249 ; GFX11-NEXT: s_sub_i32 s2, 32, s4
250 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
251 ; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
252 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
253 ; GFX11-NEXT: s_nop 0
254 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
255 ; GFX11-NEXT: s_endpgm
; IR under test: a single signed i64 -> float conversion followed by the store.
256 %result = sitofp i64 %in to float
257 store float %result, ptr addrspace(1) %out
; Per-work-item variant of the i64 -> float conversion: each work item loads an
; i64 from %in at the index returned by llvm.amdgcn.workitem.id.x, converts it
; with sitofp, and stores the float result to %out at the same index.
; The checked sequences use vector instructions and end in v_cvt_f32_i32 plus
; v_ldexp_f32, whose result is stored as a dword.
; Assertions are autogenerated; regenerate them instead of hand-editing.
261 define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
262 ; GFX6-LABEL: v_sint_to_fp_i64_to_f32:
264 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
265 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
266 ; GFX6-NEXT: s_mov_b32 s6, 0
267 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0
268 ; GFX6-NEXT: v_mov_b32_e32 v2, 0
269 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
270 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
271 ; GFX6-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
272 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v0
273 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
274 ; GFX6-NEXT: s_waitcnt vmcnt(0)
275 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
276 ; GFX6-NEXT: v_ffbh_i32_e32 v5, v4
277 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
278 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, -1, v5
279 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
280 ; GFX6-NEXT: v_min_u32_e32 v0, v5, v0
281 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
282 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
283 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
284 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
285 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 32, v0
286 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v3, v0
287 ; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
288 ; GFX6-NEXT: s_endpgm
290 ; GFX8-LABEL: v_sint_to_fp_i64_to_f32:
292 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
293 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
294 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
295 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
296 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
297 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
298 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
299 ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
300 ; GFX8-NEXT: s_waitcnt vmcnt(0)
301 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v2
302 ; GFX8-NEXT: v_ffbh_i32_e32 v4, v2
303 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
304 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
305 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
306 ; GFX8-NEXT: v_min_u32_e32 v4, v4, v0
307 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[1:2]
308 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
309 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
310 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
311 ; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0
312 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
313 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
314 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
315 ; GFX8-NEXT: v_ldexp_f32 v2, v5, v2
316 ; GFX8-NEXT: flat_store_dword v[0:1], v2
317 ; GFX8-NEXT: s_endpgm
319 ; GFX11-LABEL: v_sint_to_fp_i64_to_f32:
321 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
322 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
323 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
324 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
325 ; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
326 ; GFX11-NEXT: s_waitcnt vmcnt(0)
327 ; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2
328 ; GFX11-NEXT: v_cls_i32_e32 v4, v2
329 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
330 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v3
331 ; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v4
332 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
333 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 32, v3
334 ; GFX11-NEXT: v_min_u32_e32 v3, v4, v3
335 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
336 ; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2]
337 ; GFX11-NEXT: v_min_u32_e32 v1, 1, v1
338 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
339 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
340 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3
341 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
342 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
343 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
344 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
345 ; GFX11-NEXT: s_nop 0
346 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
347 ; GFX11-NEXT: s_endpgm
; IR under test: index both pointers by the work-item id, load the i64,
; convert it to float, and store the result.
348 %tid = call i32 @llvm.amdgcn.workitem.id.x()
349 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
350 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
351 %val = load i64, ptr addrspace(1) %in.gep
352 %result = sitofp i64 %val to float
353 store float %result, ptr addrspace(1) %out.gep
; Converts the <2 x i64> kernel argument %in to <2 x float> with sitofp and
; stores the vector through %out.
; The checked sequences perform the i64 -> f32 expansion once per element
; (two v_cvt_f32_i32 / v_ldexp_f32 pairs) and write both results with a single
; 64-bit store. The file-level FIXME names this function as the one that
; crashes on r600.
; Assertions are autogenerated; regenerate them instead of hand-editing.
357 define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{
358 ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f32:
360 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
361 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
362 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
363 ; GFX6-NEXT: s_mov_b32 s2, -1
364 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
365 ; GFX6-NEXT: s_flbit_i32 s8, s7
366 ; GFX6-NEXT: s_xor_b32 s9, s6, s7
367 ; GFX6-NEXT: s_flbit_i32 s10, s5
368 ; GFX6-NEXT: s_xor_b32 s11, s4, s5
369 ; GFX6-NEXT: s_add_i32 s8, s8, -1
370 ; GFX6-NEXT: s_ashr_i32 s9, s9, 31
371 ; GFX6-NEXT: s_add_i32 s10, s10, -1
372 ; GFX6-NEXT: s_ashr_i32 s11, s11, 31
373 ; GFX6-NEXT: s_add_i32 s9, s9, 32
374 ; GFX6-NEXT: s_add_i32 s11, s11, 32
375 ; GFX6-NEXT: s_min_u32 s8, s8, s9
376 ; GFX6-NEXT: s_min_u32 s9, s10, s11
377 ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
378 ; GFX6-NEXT: s_sub_i32 s8, 32, s8
379 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
380 ; GFX6-NEXT: s_sub_i32 s9, 32, s9
381 ; GFX6-NEXT: s_min_u32 s6, s6, 1
382 ; GFX6-NEXT: s_min_u32 s4, s4, 1
383 ; GFX6-NEXT: s_or_b32 s6, s7, s6
384 ; GFX6-NEXT: s_or_b32 s4, s5, s4
385 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6
386 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
387 ; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s8
388 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s9
389 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
390 ; GFX6-NEXT: s_endpgm
392 ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32:
394 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
395 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
396 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
397 ; GFX8-NEXT: s_xor_b32 s3, s6, s7
398 ; GFX8-NEXT: s_flbit_i32 s2, s7
399 ; GFX8-NEXT: s_ashr_i32 s3, s3, 31
400 ; GFX8-NEXT: s_add_i32 s2, s2, -1
401 ; GFX8-NEXT: s_add_i32 s3, s3, 32
402 ; GFX8-NEXT: s_min_u32 s9, s2, s3
403 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
404 ; GFX8-NEXT: s_min_u32 s2, s2, 1
405 ; GFX8-NEXT: s_or_b32 s2, s3, s2
406 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
407 ; GFX8-NEXT: s_xor_b32 s2, s4, s5
408 ; GFX8-NEXT: s_flbit_i32 s8, s5
409 ; GFX8-NEXT: s_ashr_i32 s2, s2, 31
410 ; GFX8-NEXT: s_add_i32 s8, s8, -1
411 ; GFX8-NEXT: s_add_i32 s2, s2, 32
412 ; GFX8-NEXT: s_min_u32 s6, s8, s2
413 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
414 ; GFX8-NEXT: s_min_u32 s2, s2, 1
415 ; GFX8-NEXT: s_or_b32 s2, s3, s2
416 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
417 ; GFX8-NEXT: s_sub_i32 s2, 32, s9
418 ; GFX8-NEXT: v_ldexp_f32 v1, v0, s2
419 ; GFX8-NEXT: s_sub_i32 s2, 32, s6
420 ; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
421 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
422 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
423 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
424 ; GFX8-NEXT: s_endpgm
426 ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f32:
428 ; GFX11-NEXT: s_clause 0x1
429 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
430 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
431 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
432 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
433 ; GFX11-NEXT: s_xor_b32 s3, s6, s7
434 ; GFX11-NEXT: s_xor_b32 s9, s4, s5
435 ; GFX11-NEXT: s_cls_i32 s2, s7
436 ; GFX11-NEXT: s_cls_i32 s8, s5
437 ; GFX11-NEXT: s_ashr_i32 s3, s3, 31
438 ; GFX11-NEXT: s_ashr_i32 s9, s9, 31
439 ; GFX11-NEXT: s_add_i32 s2, s2, -1
440 ; GFX11-NEXT: s_add_i32 s8, s8, -1
441 ; GFX11-NEXT: s_add_i32 s3, s3, 32
442 ; GFX11-NEXT: s_add_i32 s9, s9, 32
443 ; GFX11-NEXT: s_min_u32 s10, s2, s3
444 ; GFX11-NEXT: s_min_u32 s8, s8, s9
445 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10
446 ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
447 ; GFX11-NEXT: s_min_u32 s2, s2, 1
448 ; GFX11-NEXT: s_min_u32 s4, s4, 1
449 ; GFX11-NEXT: s_or_b32 s2, s3, s2
450 ; GFX11-NEXT: s_or_b32 s3, s5, s4
451 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
452 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s3
453 ; GFX11-NEXT: s_sub_i32 s2, 32, s10
454 ; GFX11-NEXT: s_sub_i32 s3, 32, s8
455 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
456 ; GFX11-NEXT: v_ldexp_f32 v1, v0, s2
457 ; GFX11-NEXT: v_ldexp_f32 v0, v2, s3
458 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
459 ; GFX11-NEXT: s_nop 0
460 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
461 ; GFX11-NEXT: s_endpgm
; IR under test: one vector sitofp on the <2 x i64> argument, then the store.
462 %result = sitofp <2 x i64> %in to <2 x float>
463 store <2 x float> %result, ptr addrspace(1) %out
; Per-work-item <4 x i64> -> <4 x float> conversion: each work item loads a
; <4 x i64> from %in at the index returned by llvm.amdgcn.workitem.id.x,
; converts it with sitofp, and stores the <4 x float> to %out at the same index.
; The checked sequences fetch the input with two 128-bit loads, run the
; i64 -> f32 expansion four times (four v_cvt_f32_i32 / v_ldexp_f32 pairs), and
; write the results with a single 128-bit store.
; Assertions are autogenerated; regenerate them instead of hand-editing.
467 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
468 ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f32:
470 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
471 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
472 ; GFX6-NEXT: s_mov_b32 s6, 0
473 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0
474 ; GFX6-NEXT: v_mov_b32_e32 v9, 0
475 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
476 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
477 ; GFX6-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
478 ; GFX6-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
479 ; GFX6-NEXT: v_lshlrev_b32_e32 v10, 4, v0
480 ; GFX6-NEXT: v_mov_b32_e32 v11, v9
481 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
482 ; GFX6-NEXT: s_waitcnt vmcnt(1)
483 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
484 ; GFX6-NEXT: v_ffbh_i32_e32 v9, v4
485 ; GFX6-NEXT: v_xor_b32_e32 v12, v1, v2
486 ; GFX6-NEXT: v_ffbh_i32_e32 v13, v2
487 ; GFX6-NEXT: s_waitcnt vmcnt(0)
488 ; GFX6-NEXT: v_xor_b32_e32 v14, v7, v8
489 ; GFX6-NEXT: v_ffbh_i32_e32 v15, v8
490 ; GFX6-NEXT: v_xor_b32_e32 v16, v5, v6
491 ; GFX6-NEXT: v_ffbh_i32_e32 v17, v6
492 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
493 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, -1, v9
494 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v12
495 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, -1, v13
496 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v14
497 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, -1, v15
498 ; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v16
499 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, -1, v17
500 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
501 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, 32, v12
502 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, 32, v14
503 ; GFX6-NEXT: v_add_i32_e32 v16, vcc, 32, v16
504 ; GFX6-NEXT: v_min_u32_e32 v0, v9, v0
505 ; GFX6-NEXT: v_min_u32_e32 v9, v13, v12
506 ; GFX6-NEXT: v_min_u32_e32 v12, v15, v14
507 ; GFX6-NEXT: v_min_u32_e32 v13, v17, v16
508 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
509 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
510 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
511 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 32, v9
512 ; GFX6-NEXT: v_lshl_b64 v[7:8], v[7:8], v12
513 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 32, v12
514 ; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v13
515 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 32, v13
516 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
517 ; GFX6-NEXT: v_min_u32_e32 v0, 1, v0
518 ; GFX6-NEXT: v_min_u32_e32 v7, 1, v7
519 ; GFX6-NEXT: v_min_u32_e32 v5, 1, v5
520 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
521 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
522 ; GFX6-NEXT: v_or_b32_e32 v1, v8, v7
523 ; GFX6-NEXT: v_or_b32_e32 v4, v6, v5
524 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
525 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0
526 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, v1
527 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v4
528 ; GFX6-NEXT: v_ldexp_f32_e32 v3, v3, v14
529 ; GFX6-NEXT: v_ldexp_f32_e32 v2, v0, v2
530 ; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v9
531 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v4, v12
532 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[10:11], s[0:3], 0 addr64
533 ; GFX6-NEXT: s_endpgm
535 ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32:
537 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
538 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
539 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
540 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
541 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
542 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
543 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
544 ; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
545 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
546 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
547 ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
548 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0
549 ; GFX8-NEXT: v_mov_b32_e32 v10, s1
550 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
551 ; GFX8-NEXT: s_waitcnt vmcnt(1)
552 ; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
553 ; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2
554 ; GFX8-NEXT: v_ffbh_i32_e32 v11, v4
555 ; GFX8-NEXT: v_ffbh_i32_e32 v13, v2
556 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
557 ; GFX8-NEXT: s_waitcnt vmcnt(0)
558 ; GFX8-NEXT: v_xor_b32_e32 v14, v7, v8
559 ; GFX8-NEXT: v_xor_b32_e32 v16, v5, v6
560 ; GFX8-NEXT: v_ffbh_i32_e32 v15, v8
561 ; GFX8-NEXT: v_ffbh_i32_e32 v17, v6
562 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12
563 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14
564 ; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16
565 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11
566 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13
567 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15
568 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17
569 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
570 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12
571 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14
572 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16
573 ; GFX8-NEXT: v_min_u32_e32 v0, v11, v0
574 ; GFX8-NEXT: v_min_u32_e32 v11, v13, v12
575 ; GFX8-NEXT: v_min_u32_e32 v12, v15, v14
576 ; GFX8-NEXT: v_min_u32_e32 v13, v17, v16
577 ; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4]
578 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
579 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2]
580 ; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8]
581 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6]
582 ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3
583 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
584 ; GFX8-NEXT: v_min_u32_e32 v7, 1, v7
585 ; GFX8-NEXT: v_min_u32_e32 v5, 1, v5
586 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
587 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
588 ; GFX8-NEXT: v_or_b32_e32 v1, v8, v7
589 ; GFX8-NEXT: v_or_b32_e32 v4, v6, v5
590 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
591 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
592 ; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v1
593 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
594 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
595 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12
596 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13
597 ; GFX8-NEXT: v_ldexp_f32 v1, v3, v14
598 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
599 ; GFX8-NEXT: v_ldexp_f32 v3, v5, v11
600 ; GFX8-NEXT: v_ldexp_f32 v2, v4, v12
601 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
602 ; GFX8-NEXT: s_endpgm
604 ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f32:
606 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
607 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
608 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
609 ; GFX11-NEXT: s_clause 0x1
610 ; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
611 ; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
612 ; GFX11-NEXT: s_waitcnt vmcnt(1)
613 ; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4
614 ; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2
615 ; GFX11-NEXT: s_waitcnt vmcnt(0)
616 ; GFX11-NEXT: v_xor_b32_e32 v13, v7, v8
617 ; GFX11-NEXT: v_xor_b32_e32 v15, v5, v6
618 ; GFX11-NEXT: v_cls_i32_e32 v10, v4
619 ; GFX11-NEXT: v_cls_i32_e32 v12, v2
620 ; GFX11-NEXT: v_cls_i32_e32 v14, v8
621 ; GFX11-NEXT: v_cls_i32_e32 v16, v6
622 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9
623 ; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11
624 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v13
625 ; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v15
626 ; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10
627 ; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12
628 ; GFX11-NEXT: v_add_nc_u32_e32 v14, -1, v14
629 ; GFX11-NEXT: v_add_nc_u32_e32 v16, -1, v16
630 ; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9
631 ; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11
632 ; GFX11-NEXT: v_add_nc_u32_e32 v13, 32, v13
633 ; GFX11-NEXT: v_add_nc_u32_e32 v15, 32, v15
634 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
635 ; GFX11-NEXT: v_min_u32_e32 v9, v10, v9
636 ; GFX11-NEXT: v_min_u32_e32 v10, v12, v11
637 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
638 ; GFX11-NEXT: v_min_u32_e32 v11, v14, v13
639 ; GFX11-NEXT: v_min_u32_e32 v12, v16, v15
640 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
641 ; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4]
642 ; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2]
643 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
644 ; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8]
645 ; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6]
646 ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9
647 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10
648 ; GFX11-NEXT: v_min_u32_e32 v3, 1, v3
649 ; GFX11-NEXT: v_min_u32_e32 v1, 1, v1
650 ; GFX11-NEXT: v_min_u32_e32 v7, 1, v7
651 ; GFX11-NEXT: v_min_u32_e32 v5, 1, v5
652 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11
653 ; GFX11-NEXT: v_or_b32_e32 v3, v4, v3
654 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
655 ; GFX11-NEXT: v_or_b32_e32 v2, v8, v7
656 ; GFX11-NEXT: v_or_b32_e32 v4, v6, v5
657 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12
658 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
659 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
660 ; GFX11-NEXT: v_cvt_f32_i32_e32 v6, v2
661 ; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4
662 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 4, v0
663 ; GFX11-NEXT: v_ldexp_f32 v3, v3, v9
664 ; GFX11-NEXT: v_ldexp_f32 v2, v1, v10
665 ; GFX11-NEXT: v_ldexp_f32 v1, v6, v11
666 ; GFX11-NEXT: v_ldexp_f32 v0, v4, v5
667 ; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1]
668 ; GFX11-NEXT: s_nop 0
669 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
670 ; GFX11-NEXT: s_endpgm
; IR under test: index both pointers by the work-item id, load the <4 x i64>,
; convert it to <4 x float>, and store the result.
671 %tid = call i32 @llvm.amdgcn.workitem.id.x()
672 %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
673 %out.gep = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid
674 %value = load <4 x i64>, ptr addrspace(1) %in.gep
675 %result = sitofp <4 x i64> %value to <4 x float>
676 store <4 x float> %result, ptr addrspace(1) %out.gep
; Converts the <2 x i64> kernel argument %in to <2 x half> with sitofp and
; stores the vector through %out.
; The checked sequences convert each element via f32 (v_cvt_f32_i32,
; v_ldexp_f32, v_cvt_f16_f32) and then combine the two halves into one dword
; before a single dword store. The combining step differs per configuration:
; shift-by-16 plus v_or_b32 in the first block, an SDWA v_cvt_f16_f32 writing
; WORD_1 plus v_or_b32 in the second, and v_pack_b32_f16 in the third.
; Assertions are autogenerated; regenerate them instead of hand-editing.
680 define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{
681 ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f16:
683 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
684 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
685 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
686 ; GFX6-NEXT: s_mov_b32 s2, -1
687 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
688 ; GFX6-NEXT: s_flbit_i32 s8, s7
689 ; GFX6-NEXT: s_xor_b32 s9, s6, s7
690 ; GFX6-NEXT: s_flbit_i32 s10, s5
691 ; GFX6-NEXT: s_xor_b32 s11, s4, s5
692 ; GFX6-NEXT: s_add_i32 s8, s8, -1
693 ; GFX6-NEXT: s_ashr_i32 s9, s9, 31
694 ; GFX6-NEXT: s_add_i32 s10, s10, -1
695 ; GFX6-NEXT: s_ashr_i32 s11, s11, 31
696 ; GFX6-NEXT: s_add_i32 s9, s9, 32
697 ; GFX6-NEXT: s_add_i32 s11, s11, 32
698 ; GFX6-NEXT: s_min_u32 s8, s8, s9
699 ; GFX6-NEXT: s_min_u32 s9, s10, s11
700 ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
701 ; GFX6-NEXT: s_sub_i32 s8, 32, s8
702 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
703 ; GFX6-NEXT: s_sub_i32 s9, 32, s9
704 ; GFX6-NEXT: s_min_u32 s6, s6, 1
705 ; GFX6-NEXT: s_min_u32 s4, s4, 1
706 ; GFX6-NEXT: s_or_b32 s6, s7, s6
707 ; GFX6-NEXT: s_or_b32 s4, s5, s4
708 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6
709 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4
710 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s8
711 ; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s9
712 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
713 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
714 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
715 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
716 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
717 ; GFX6-NEXT: s_endpgm
719 ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16:
721 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
722 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
723 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
724 ; GFX8-NEXT: s_xor_b32 s3, s6, s7
725 ; GFX8-NEXT: s_flbit_i32 s2, s7
726 ; GFX8-NEXT: s_ashr_i32 s3, s3, 31
727 ; GFX8-NEXT: s_add_i32 s2, s2, -1
728 ; GFX8-NEXT: s_add_i32 s3, s3, 32
729 ; GFX8-NEXT: s_min_u32 s8, s2, s3
730 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
731 ; GFX8-NEXT: s_min_u32 s2, s2, 1
732 ; GFX8-NEXT: s_or_b32 s2, s3, s2
733 ; GFX8-NEXT: s_xor_b32 s3, s4, s5
734 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
735 ; GFX8-NEXT: s_flbit_i32 s2, s5
736 ; GFX8-NEXT: s_ashr_i32 s3, s3, 31
737 ; GFX8-NEXT: s_add_i32 s2, s2, -1
738 ; GFX8-NEXT: s_add_i32 s3, s3, 32
739 ; GFX8-NEXT: s_min_u32 s7, s2, s3
740 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s7
741 ; GFX8-NEXT: s_min_u32 s2, s2, 1
742 ; GFX8-NEXT: s_or_b32 s2, s3, s2
743 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s2
744 ; GFX8-NEXT: s_sub_i32 s6, 32, s8
745 ; GFX8-NEXT: s_sub_i32 s2, 32, s7
746 ; GFX8-NEXT: v_ldexp_f32 v0, v0, s6
747 ; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
748 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
749 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
750 ; GFX8-NEXT: v_or_b32_e32 v2, v1, v0
751 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
752 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
753 ; GFX8-NEXT: flat_store_dword v[0:1], v2
754 ; GFX8-NEXT: s_endpgm
756 ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f16:
758 ; GFX11-NEXT: s_clause 0x1
759 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
760 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
761 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
762 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
763 ; GFX11-NEXT: s_xor_b32 s3, s6, s7
764 ; GFX11-NEXT: s_xor_b32 s9, s4, s5
765 ; GFX11-NEXT: s_cls_i32 s2, s7
766 ; GFX11-NEXT: s_cls_i32 s8, s5
767 ; GFX11-NEXT: s_ashr_i32 s3, s3, 31
768 ; GFX11-NEXT: s_ashr_i32 s9, s9, 31
769 ; GFX11-NEXT: s_add_i32 s2, s2, -1
770 ; GFX11-NEXT: s_add_i32 s8, s8, -1
771 ; GFX11-NEXT: s_add_i32 s3, s3, 32
772 ; GFX11-NEXT: s_add_i32 s9, s9, 32
773 ; GFX11-NEXT: s_min_u32 s10, s2, s3
774 ; GFX11-NEXT: s_min_u32 s8, s8, s9
775 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10
776 ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
777 ; GFX11-NEXT: s_min_u32 s2, s2, 1
778 ; GFX11-NEXT: s_min_u32 s4, s4, 1
779 ; GFX11-NEXT: s_or_b32 s2, s3, s2
780 ; GFX11-NEXT: s_or_b32 s3, s5, s4
781 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
782 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, s3
783 ; GFX11-NEXT: s_sub_i32 s2, 32, s10
784 ; GFX11-NEXT: s_sub_i32 s3, 32, s8
785 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
786 ; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
787 ; GFX11-NEXT: v_ldexp_f32 v1, v1, s3
788 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
789 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
790 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
791 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
792 ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
793 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
794 ; GFX11-NEXT: s_nop 0
795 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
796 ; GFX11-NEXT: s_endpgm
; IR under test: one vector sitofp on the <2 x i64> argument, then the store.
797 %result = sitofp <2 x i64> %in to <2 x half>
798 store <2 x half> %result, ptr addrspace(1) %out
; Vector (per-work-item) variant of the <4 x i64> -> <4 x half> signed
; conversion test: each work item uses its workitem.id.x value to index one
; <4 x i64> element of %in, converts it with sitofp and stores the <4 x half>
; result at the same index of %out.
;
; The check lines below are autogenerated (see the note on the first line of
; the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. For every lane they expect the same sequence on all
; three targets: derive a left-shift amount from the high dword
; (v_ffbh_i32 / v_cls_i32 minus one, clamped with v_min_u32), shift the 64-bit
; value left by it, reduce the shifted low dword to 0 or 1 (v_min_u32 with 1)
; and OR that into the high dword, convert the high dword with v_cvt_f32_i32,
; rescale with v_ldexp_f32 by (32 - shift) and narrow with v_cvt_f16_f32.
; The targets differ in how memory is accessed (buffer / flat / global) and in
; how the two halves of each result dword are packed (shift + or, SDWA + or,
; v_pack_b32_f16).
802 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
803 ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f16:
805 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
806 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
807 ; GFX6-NEXT: s_mov_b32 s6, 0
808 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0
809 ; GFX6-NEXT: v_mov_b32_e32 v9, 0
810 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
811 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
812 ; GFX6-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
813 ; GFX6-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
814 ; GFX6-NEXT: v_lshlrev_b32_e32 v10, 3, v0
815 ; GFX6-NEXT: v_mov_b32_e32 v11, v9
816 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
817 ; GFX6-NEXT: s_waitcnt vmcnt(1)
818 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
819 ; GFX6-NEXT: v_ffbh_i32_e32 v9, v4
820 ; GFX6-NEXT: v_xor_b32_e32 v12, v1, v2
821 ; GFX6-NEXT: v_ffbh_i32_e32 v13, v2
822 ; GFX6-NEXT: s_waitcnt vmcnt(0)
823 ; GFX6-NEXT: v_xor_b32_e32 v14, v7, v8
824 ; GFX6-NEXT: v_ffbh_i32_e32 v15, v8
825 ; GFX6-NEXT: v_xor_b32_e32 v16, v5, v6
826 ; GFX6-NEXT: v_ffbh_i32_e32 v17, v6
827 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
828 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, -1, v9
829 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v12
830 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, -1, v13
831 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v14
832 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, -1, v15
833 ; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v16
834 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, -1, v17
835 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
836 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, 32, v12
837 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, 32, v14
838 ; GFX6-NEXT: v_add_i32_e32 v16, vcc, 32, v16
839 ; GFX6-NEXT: v_min_u32_e32 v0, v9, v0
840 ; GFX6-NEXT: v_min_u32_e32 v9, v13, v12
841 ; GFX6-NEXT: v_min_u32_e32 v12, v15, v14
842 ; GFX6-NEXT: v_min_u32_e32 v13, v17, v16
843 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
844 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
845 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
846 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 32, v9
847 ; GFX6-NEXT: v_lshl_b64 v[7:8], v[7:8], v12
848 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 32, v12
849 ; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v13
850 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 32, v13
851 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
852 ; GFX6-NEXT: v_min_u32_e32 v0, 1, v0
853 ; GFX6-NEXT: v_min_u32_e32 v7, 1, v7
854 ; GFX6-NEXT: v_min_u32_e32 v5, 1, v5
855 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
856 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
857 ; GFX6-NEXT: v_or_b32_e32 v1, v8, v7
858 ; GFX6-NEXT: v_or_b32_e32 v4, v6, v5
859 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
860 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0
861 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, v1
862 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v4
863 ; GFX6-NEXT: v_ldexp_f32_e32 v3, v3, v14
864 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2
865 ; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v9
866 ; GFX6-NEXT: v_ldexp_f32_e32 v2, v4, v12
867 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
868 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
869 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
870 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
871 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
872 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1
873 ; GFX6-NEXT: v_or_b32_e32 v1, v0, v3
874 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v4
875 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], v[10:11], s[0:3], 0 addr64
876 ; GFX6-NEXT: s_endpgm
878 ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16:
880 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
881 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
882 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0
883 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
884 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
885 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
886 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
887 ; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
888 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
889 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
890 ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
891 ; GFX8-NEXT: v_mov_b32_e32 v10, s1
892 ; GFX8-NEXT: s_waitcnt vmcnt(1)
893 ; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
894 ; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2
895 ; GFX8-NEXT: v_ffbh_i32_e32 v11, v4
896 ; GFX8-NEXT: v_ffbh_i32_e32 v13, v2
897 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
898 ; GFX8-NEXT: s_waitcnt vmcnt(0)
899 ; GFX8-NEXT: v_xor_b32_e32 v14, v7, v8
900 ; GFX8-NEXT: v_xor_b32_e32 v16, v5, v6
901 ; GFX8-NEXT: v_ffbh_i32_e32 v15, v8
902 ; GFX8-NEXT: v_ffbh_i32_e32 v17, v6
903 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12
904 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14
905 ; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16
906 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11
907 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13
908 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15
909 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17
910 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
911 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12
912 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14
913 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16
914 ; GFX8-NEXT: v_min_u32_e32 v0, v11, v0
915 ; GFX8-NEXT: v_min_u32_e32 v11, v13, v12
916 ; GFX8-NEXT: v_min_u32_e32 v12, v15, v14
917 ; GFX8-NEXT: v_min_u32_e32 v13, v17, v16
918 ; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4]
919 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
920 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2]
921 ; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8]
922 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6]
923 ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3
924 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
925 ; GFX8-NEXT: v_min_u32_e32 v7, 1, v7
926 ; GFX8-NEXT: v_min_u32_e32 v5, 1, v5
927 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
928 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
929 ; GFX8-NEXT: v_or_b32_e32 v1, v8, v7
930 ; GFX8-NEXT: v_or_b32_e32 v4, v6, v5
931 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
932 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
933 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
934 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
935 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
936 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12
937 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13
938 ; GFX8-NEXT: v_ldexp_f32 v3, v3, v14
939 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
940 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v11
941 ; GFX8-NEXT: v_ldexp_f32 v2, v4, v12
942 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
943 ; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0
944 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
945 ; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2
946 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9
947 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc
948 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v3
949 ; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
950 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
951 ; GFX8-NEXT: s_endpgm
953 ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16:
955 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
956 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
957 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
958 ; GFX11-NEXT: s_clause 0x1
959 ; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
960 ; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
961 ; GFX11-NEXT: s_waitcnt vmcnt(1)
962 ; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4
963 ; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2
964 ; GFX11-NEXT: s_waitcnt vmcnt(0)
965 ; GFX11-NEXT: v_xor_b32_e32 v13, v7, v8
966 ; GFX11-NEXT: v_xor_b32_e32 v15, v5, v6
967 ; GFX11-NEXT: v_cls_i32_e32 v10, v4
968 ; GFX11-NEXT: v_cls_i32_e32 v12, v2
969 ; GFX11-NEXT: v_cls_i32_e32 v14, v8
970 ; GFX11-NEXT: v_cls_i32_e32 v16, v6
971 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9
972 ; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11
973 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v13
974 ; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v15
975 ; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10
976 ; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12
977 ; GFX11-NEXT: v_add_nc_u32_e32 v14, -1, v14
978 ; GFX11-NEXT: v_add_nc_u32_e32 v16, -1, v16
979 ; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9
980 ; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11
981 ; GFX11-NEXT: v_add_nc_u32_e32 v13, 32, v13
982 ; GFX11-NEXT: v_add_nc_u32_e32 v15, 32, v15
983 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
984 ; GFX11-NEXT: v_min_u32_e32 v9, v10, v9
985 ; GFX11-NEXT: v_min_u32_e32 v10, v12, v11
986 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
987 ; GFX11-NEXT: v_min_u32_e32 v11, v14, v13
988 ; GFX11-NEXT: v_min_u32_e32 v12, v16, v15
989 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
990 ; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4]
991 ; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2]
992 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
993 ; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8]
994 ; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6]
995 ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9
996 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10
997 ; GFX11-NEXT: v_min_u32_e32 v3, 1, v3
998 ; GFX11-NEXT: v_min_u32_e32 v1, 1, v1
999 ; GFX11-NEXT: v_min_u32_e32 v7, 1, v7
1000 ; GFX11-NEXT: v_min_u32_e32 v5, 1, v5
1001 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11
1002 ; GFX11-NEXT: v_or_b32_e32 v3, v4, v3
1003 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
1004 ; GFX11-NEXT: v_or_b32_e32 v2, v8, v7
1005 ; GFX11-NEXT: v_or_b32_e32 v4, v6, v5
1006 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12
1007 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
1008 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
1009 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
1010 ; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4
1011 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1012 ; GFX11-NEXT: v_ldexp_f32 v3, v3, v9
1013 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v10
1014 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1015 ; GFX11-NEXT: v_ldexp_f32 v2, v2, v11
1016 ; GFX11-NEXT: v_ldexp_f32 v4, v4, v5
1017 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v0
1018 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
1019 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
1020 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
1021 ; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
1022 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1023 ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
1024 ; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2
1025 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
1026 ; GFX11-NEXT: s_nop 0
1027 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1028 ; GFX11-NEXT: s_endpgm
; Per-work-item addressing: %tid selects both the <4 x i64> input element and
; the <4 x half> output slot, so the loaded value lives in vector registers.
1029 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1030 %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
1031 %out.gep = getelementptr <4 x half>, ptr addrspace(1) %out, i32 %tid
1032 %value = load <4 x i64>, ptr addrspace(1) %in.gep
1033 %result = sitofp <4 x i64> %value to <4 x half>
1034 store <4 x half> %result, ptr addrspace(1) %out.gep
; Declaration of the work-item id intrinsic called by the per-work-item
; (v_ prefixed) test kernel above to compute its load/store index.
1038 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute groups: #0 is attached to the test kernel definitions, #1 to the
; intrinsic declaration.
1040 attributes #0 = { nounwind }
1041 attributes #1 = { nounwind readnone }