1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
6 ; FIXME: Merge this file into sint_to_fp.ll once s_sint_to_fp_v2i64 no longer crashes on r600.
; sitofp of an i64 kernel argument (%in) to half; the result is stored to %out.
; The expected code for every check prefix uses scalar instructions to
; normalize the 64-bit value (s_flbit_i32 or s_cls_i32 to pick the shift amount,
; then s_lshl_b64), folds the low word into a single bit (s_min_u32 with 1,
; s_or_b32), converts with v_cvt_f32_i32, rescales with v_ldexp_f32 and finally
; narrows the float with v_cvt_f16_f32.
8 define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 {
9 ; GFX6-LABEL: s_sint_to_fp_i64_to_f16:
11 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
12 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
13 ; GFX6-NEXT: s_mov_b32 s6, -1
14 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX6-NEXT: s_mov_b32 s4, s0
16 ; GFX6-NEXT: s_mov_b32 s5, s1
17 ; GFX6-NEXT: s_flbit_i32 s0, s3
18 ; GFX6-NEXT: s_xor_b32 s1, s2, s3
19 ; GFX6-NEXT: s_add_i32 s0, s0, -1
20 ; GFX6-NEXT: s_ashr_i32 s1, s1, 31
21 ; GFX6-NEXT: s_add_i32 s1, s1, 32
22 ; GFX6-NEXT: s_min_u32 s8, s0, s1
23 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
24 ; GFX6-NEXT: s_min_u32 s0, s0, 1
25 ; GFX6-NEXT: s_or_b32 s0, s1, s0
26 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0
27 ; GFX6-NEXT: s_sub_i32 s0, 32, s8
28 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
29 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
30 ; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0
33 ; GFX8-LABEL: s_sint_to_fp_i64_to_f16:
35 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
36 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
37 ; GFX8-NEXT: s_xor_b32 s5, s2, s3
38 ; GFX8-NEXT: s_flbit_i32 s4, s3
39 ; GFX8-NEXT: s_ashr_i32 s5, s5, 31
40 ; GFX8-NEXT: s_add_i32 s4, s4, -1
41 ; GFX8-NEXT: s_add_i32 s5, s5, 32
42 ; GFX8-NEXT: s_min_u32 s4, s4, s5
43 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
44 ; GFX8-NEXT: s_min_u32 s2, s2, 1
45 ; GFX8-NEXT: s_or_b32 s2, s3, s2
46 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
47 ; GFX8-NEXT: s_sub_i32 s2, 32, s4
48 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
49 ; GFX8-NEXT: v_ldexp_f32 v0, v0, s2
50 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v0
51 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
52 ; GFX8-NEXT: flat_store_short v[0:1], v2
55 ; GFX11-LABEL: s_sint_to_fp_i64_to_f16:
57 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
58 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
59 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
60 ; GFX11-NEXT: s_xor_b32 s4, s2, s3
61 ; GFX11-NEXT: s_cls_i32 s5, s3
62 ; GFX11-NEXT: s_ashr_i32 s4, s4, 31
63 ; GFX11-NEXT: s_add_i32 s5, s5, -1
64 ; GFX11-NEXT: s_add_i32 s4, s4, 32
65 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
66 ; GFX11-NEXT: s_min_u32 s4, s5, s4
67 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
68 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
69 ; GFX11-NEXT: s_min_u32 s2, s2, 1
70 ; GFX11-NEXT: s_or_b32 s2, s3, s2
71 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
72 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
73 ; GFX11-NEXT: s_sub_i32 s2, 32, s4
74 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
75 ; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
76 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
77 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
78 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
80 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
81 ; GFX11-NEXT: s_endpgm
82 %result = sitofp i64 %in to half
83 store half %result, ptr addrspace(1) %out
; Per-workitem variant: each workitem loads the i64 at %in[tid], converts it to
; half with sitofp and stores it to %out[tid], where tid comes from
; llvm.amdgcn.workitem.id.x. The expected code performs the normalize / convert /
; ldexp / narrow sequence with vector instructions (v_ffbh_i32 or v_cls_i32,
; a 64-bit vector shift, v_cvt_f32_i32, v_ldexp_f32, v_cvt_f16_f32).
87 define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
88 ; GFX6-LABEL: v_sint_to_fp_i64_to_f16:
90 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
91 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
92 ; GFX6-NEXT: s_mov_b32 s6, 0
93 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0
94 ; GFX6-NEXT: v_mov_b32_e32 v2, 0
95 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
97 ; GFX6-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
98 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v0
99 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
100 ; GFX6-NEXT: s_waitcnt vmcnt(0)
101 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
102 ; GFX6-NEXT: v_ffbh_i32_e32 v5, v4
103 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
104 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, -1, v5
105 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
106 ; GFX6-NEXT: v_min_u32_e32 v0, v5, v0
107 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
108 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
109 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
110 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
111 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 32, v0
112 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v3, v0
113 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
114 ; GFX6-NEXT: buffer_store_short v0, v[1:2], s[0:3], 0 addr64
115 ; GFX6-NEXT: s_endpgm
117 ; GFX8-LABEL: v_sint_to_fp_i64_to_f16:
119 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
120 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
121 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
122 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
123 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
124 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
125 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
126 ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
127 ; GFX8-NEXT: s_waitcnt vmcnt(0)
128 ; GFX8-NEXT: v_xor_b32_e32 v3, v1, v2
129 ; GFX8-NEXT: v_ffbh_i32_e32 v4, v2
130 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v3
131 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
132 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 32, v3
133 ; GFX8-NEXT: v_min_u32_e32 v3, v4, v3
134 ; GFX8-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2]
135 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v3
136 ; GFX8-NEXT: v_min_u32_e32 v1, 1, v1
137 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
138 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
139 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
140 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
141 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v3
142 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v1
143 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
144 ; GFX8-NEXT: flat_store_short v[0:1], v3
145 ; GFX8-NEXT: s_endpgm
147 ; GFX11-LABEL: v_sint_to_fp_i64_to_f16:
149 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
150 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0
151 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
152 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2
153 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
155 ; GFX11-NEXT: s_waitcnt vmcnt(0)
156 ; GFX11-NEXT: v_xor_b32_e32 v3, v0, v1
157 ; GFX11-NEXT: v_cls_i32_e32 v4, v1
158 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
159 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v3
160 ; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v4
161 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
162 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 32, v3
163 ; GFX11-NEXT: v_min_u32_e32 v3, v4, v3
164 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
165 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1]
166 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
167 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
168 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
169 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3
170 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
171 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
172 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
173 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2
174 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
175 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
176 ; GFX11-NEXT: s_nop 0
177 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
178 ; GFX11-NEXT: s_endpgm
179 %tid = call i32 @llvm.amdgcn.workitem.id.x()
180 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
181 %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
182 %val = load i64, ptr addrspace(1) %in.gep
183 %result = sitofp i64 %val to half
184 store half %result, ptr addrspace(1) %out.gep
; sitofp of an i64 kernel argument (%in) to float; the result is stored to %out.
; The expected code matches the i64-to-half kernel-argument test in this file
; up to v_ldexp_f32, but stores the 32-bit float directly (dword / b32 store)
; with no v_cvt_f16_f32 narrowing step.
188 define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 {
189 ; GFX6-LABEL: s_sint_to_fp_i64_to_f32:
191 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
192 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
193 ; GFX6-NEXT: s_mov_b32 s6, -1
194 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
195 ; GFX6-NEXT: s_mov_b32 s4, s0
196 ; GFX6-NEXT: s_mov_b32 s5, s1
197 ; GFX6-NEXT: s_flbit_i32 s0, s3
198 ; GFX6-NEXT: s_xor_b32 s1, s2, s3
199 ; GFX6-NEXT: s_add_i32 s0, s0, -1
200 ; GFX6-NEXT: s_ashr_i32 s1, s1, 31
201 ; GFX6-NEXT: s_add_i32 s1, s1, 32
202 ; GFX6-NEXT: s_min_u32 s8, s0, s1
203 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
204 ; GFX6-NEXT: s_min_u32 s0, s0, 1
205 ; GFX6-NEXT: s_or_b32 s0, s1, s0
206 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0
207 ; GFX6-NEXT: s_sub_i32 s0, 32, s8
208 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
209 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
210 ; GFX6-NEXT: s_endpgm
212 ; GFX8-LABEL: s_sint_to_fp_i64_to_f32:
214 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
215 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
216 ; GFX8-NEXT: s_xor_b32 s5, s2, s3
217 ; GFX8-NEXT: s_flbit_i32 s4, s3
218 ; GFX8-NEXT: s_ashr_i32 s5, s5, 31
219 ; GFX8-NEXT: s_add_i32 s4, s4, -1
220 ; GFX8-NEXT: s_add_i32 s5, s5, 32
221 ; GFX8-NEXT: s_min_u32 s4, s4, s5
222 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
223 ; GFX8-NEXT: s_min_u32 s2, s2, 1
224 ; GFX8-NEXT: s_or_b32 s2, s3, s2
225 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
226 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
227 ; GFX8-NEXT: s_sub_i32 s0, 32, s4
228 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
229 ; GFX8-NEXT: v_ldexp_f32 v2, v2, s0
230 ; GFX8-NEXT: flat_store_dword v[0:1], v2
231 ; GFX8-NEXT: s_endpgm
233 ; GFX11-LABEL: s_sint_to_fp_i64_to_f32:
235 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
236 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
237 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
238 ; GFX11-NEXT: s_xor_b32 s4, s2, s3
239 ; GFX11-NEXT: s_cls_i32 s5, s3
240 ; GFX11-NEXT: s_ashr_i32 s4, s4, 31
241 ; GFX11-NEXT: s_add_i32 s5, s5, -1
242 ; GFX11-NEXT: s_add_i32 s4, s4, 32
243 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
244 ; GFX11-NEXT: s_min_u32 s4, s5, s4
245 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
246 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
247 ; GFX11-NEXT: s_min_u32 s2, s2, 1
248 ; GFX11-NEXT: s_or_b32 s2, s3, s2
249 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
250 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
251 ; GFX11-NEXT: s_sub_i32 s2, 32, s4
252 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
253 ; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
254 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
255 ; GFX11-NEXT: s_nop 0
256 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
257 ; GFX11-NEXT: s_endpgm
258 %result = sitofp i64 %in to float
259 store float %result, ptr addrspace(1) %out
; Per-workitem variant: each workitem loads the i64 at %in[tid], converts it to
; float with sitofp and stores it to %out[tid], where tid comes from
; llvm.amdgcn.workitem.id.x. The expected code uses vector instructions for the
; normalize / v_cvt_f32_i32 / v_ldexp_f32 sequence and stores a 32-bit result.
263 define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
264 ; GFX6-LABEL: v_sint_to_fp_i64_to_f32:
266 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
267 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
268 ; GFX6-NEXT: s_mov_b32 s6, 0
269 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0
270 ; GFX6-NEXT: v_mov_b32_e32 v2, 0
271 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
272 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
273 ; GFX6-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
274 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v0
275 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
276 ; GFX6-NEXT: s_waitcnt vmcnt(0)
277 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
278 ; GFX6-NEXT: v_ffbh_i32_e32 v5, v4
279 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
280 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, -1, v5
281 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
282 ; GFX6-NEXT: v_min_u32_e32 v0, v5, v0
283 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
284 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
285 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
286 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
287 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 32, v0
288 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v3, v0
289 ; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
290 ; GFX6-NEXT: s_endpgm
292 ; GFX8-LABEL: v_sint_to_fp_i64_to_f32:
294 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
295 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
296 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
297 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
298 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
299 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
300 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
301 ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
302 ; GFX8-NEXT: s_waitcnt vmcnt(0)
303 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v2
304 ; GFX8-NEXT: v_ffbh_i32_e32 v4, v2
305 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
306 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
307 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
308 ; GFX8-NEXT: v_min_u32_e32 v4, v4, v0
309 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[1:2]
310 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
311 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
312 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
313 ; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0
314 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
315 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
316 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
317 ; GFX8-NEXT: v_ldexp_f32 v2, v5, v2
318 ; GFX8-NEXT: flat_store_dword v[0:1], v2
319 ; GFX8-NEXT: s_endpgm
321 ; GFX11-LABEL: v_sint_to_fp_i64_to_f32:
323 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
324 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0
325 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
326 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2
327 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2
328 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
329 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
330 ; GFX11-NEXT: s_waitcnt vmcnt(0)
331 ; GFX11-NEXT: v_xor_b32_e32 v3, v0, v1
332 ; GFX11-NEXT: v_cls_i32_e32 v4, v1
333 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
334 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v3
335 ; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v4
336 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
337 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 32, v3
338 ; GFX11-NEXT: v_min_u32_e32 v3, v4, v3
339 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
340 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1]
341 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
342 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
343 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
344 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3
345 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
346 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
347 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
348 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
349 ; GFX11-NEXT: s_nop 0
350 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
351 ; GFX11-NEXT: s_endpgm
352 %tid = call i32 @llvm.amdgcn.workitem.id.x()
353 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
354 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
355 %val = load i64, ptr addrspace(1) %in.gep
356 %result = sitofp i64 %val to float
357 store float %result, ptr addrspace(1) %out.gep
; sitofp of a <2 x i64> kernel argument (%in) to <2 x float>, stored to %out.
; The expected code runs the scalar normalize / v_cvt_f32_i32 / v_ldexp_f32
; sequence once per element and writes both floats with one 64-bit store
; (dwordx2 / b64).
361 define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{
362 ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f32:
364 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
365 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
366 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
367 ; GFX6-NEXT: s_mov_b32 s2, -1
368 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
369 ; GFX6-NEXT: s_flbit_i32 s8, s7
370 ; GFX6-NEXT: s_xor_b32 s9, s6, s7
371 ; GFX6-NEXT: s_flbit_i32 s10, s5
372 ; GFX6-NEXT: s_xor_b32 s11, s4, s5
373 ; GFX6-NEXT: s_add_i32 s8, s8, -1
374 ; GFX6-NEXT: s_ashr_i32 s9, s9, 31
375 ; GFX6-NEXT: s_add_i32 s10, s10, -1
376 ; GFX6-NEXT: s_ashr_i32 s11, s11, 31
377 ; GFX6-NEXT: s_add_i32 s9, s9, 32
378 ; GFX6-NEXT: s_add_i32 s11, s11, 32
379 ; GFX6-NEXT: s_min_u32 s8, s8, s9
380 ; GFX6-NEXT: s_min_u32 s9, s10, s11
381 ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
382 ; GFX6-NEXT: s_sub_i32 s8, 32, s8
383 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
384 ; GFX6-NEXT: s_sub_i32 s9, 32, s9
385 ; GFX6-NEXT: s_min_u32 s6, s6, 1
386 ; GFX6-NEXT: s_min_u32 s4, s4, 1
387 ; GFX6-NEXT: s_or_b32 s6, s7, s6
388 ; GFX6-NEXT: s_or_b32 s4, s5, s4
389 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6
390 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
391 ; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s8
392 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s9
393 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
394 ; GFX6-NEXT: s_endpgm
396 ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32:
398 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
399 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
400 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
401 ; GFX8-NEXT: s_xor_b32 s3, s6, s7
402 ; GFX8-NEXT: s_flbit_i32 s2, s7
403 ; GFX8-NEXT: s_ashr_i32 s3, s3, 31
404 ; GFX8-NEXT: s_add_i32 s2, s2, -1
405 ; GFX8-NEXT: s_add_i32 s3, s3, 32
406 ; GFX8-NEXT: s_min_u32 s9, s2, s3
407 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
408 ; GFX8-NEXT: s_min_u32 s2, s2, 1
409 ; GFX8-NEXT: s_or_b32 s2, s3, s2
410 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
411 ; GFX8-NEXT: s_xor_b32 s2, s4, s5
412 ; GFX8-NEXT: s_flbit_i32 s8, s5
413 ; GFX8-NEXT: s_ashr_i32 s2, s2, 31
414 ; GFX8-NEXT: s_add_i32 s8, s8, -1
415 ; GFX8-NEXT: s_add_i32 s2, s2, 32
416 ; GFX8-NEXT: s_min_u32 s6, s8, s2
417 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
418 ; GFX8-NEXT: s_min_u32 s2, s2, 1
419 ; GFX8-NEXT: s_or_b32 s2, s3, s2
420 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
421 ; GFX8-NEXT: s_sub_i32 s2, 32, s9
422 ; GFX8-NEXT: v_ldexp_f32 v1, v0, s2
423 ; GFX8-NEXT: s_sub_i32 s2, 32, s6
424 ; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
425 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
426 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
427 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
428 ; GFX8-NEXT: s_endpgm
430 ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f32:
432 ; GFX11-NEXT: s_clause 0x1
433 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
434 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
435 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
436 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
437 ; GFX11-NEXT: s_xor_b32 s3, s6, s7
438 ; GFX11-NEXT: s_xor_b32 s9, s4, s5
439 ; GFX11-NEXT: s_cls_i32 s2, s7
440 ; GFX11-NEXT: s_cls_i32 s8, s5
441 ; GFX11-NEXT: s_ashr_i32 s3, s3, 31
442 ; GFX11-NEXT: s_ashr_i32 s9, s9, 31
443 ; GFX11-NEXT: s_add_i32 s2, s2, -1
444 ; GFX11-NEXT: s_add_i32 s8, s8, -1
445 ; GFX11-NEXT: s_add_i32 s3, s3, 32
446 ; GFX11-NEXT: s_add_i32 s9, s9, 32
447 ; GFX11-NEXT: s_min_u32 s10, s2, s3
448 ; GFX11-NEXT: s_min_u32 s8, s8, s9
449 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10
450 ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
451 ; GFX11-NEXT: s_min_u32 s2, s2, 1
452 ; GFX11-NEXT: s_min_u32 s4, s4, 1
453 ; GFX11-NEXT: s_or_b32 s2, s3, s2
454 ; GFX11-NEXT: s_or_b32 s3, s5, s4
455 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
456 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s3
457 ; GFX11-NEXT: s_sub_i32 s2, 32, s10
458 ; GFX11-NEXT: s_sub_i32 s3, 32, s8
459 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
460 ; GFX11-NEXT: v_ldexp_f32 v1, v0, s2
461 ; GFX11-NEXT: v_ldexp_f32 v0, v2, s3
462 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
463 ; GFX11-NEXT: s_nop 0
464 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
465 ; GFX11-NEXT: s_endpgm
466 %result = sitofp <2 x i64> %in to <2 x float>
467 store <2 x float> %result, ptr addrspace(1) %out
; Per-workitem variant: each workitem loads the <4 x i64> at %in[tid], converts
; it to <4 x float> with sitofp and stores it to %out[tid], where tid comes from
; llvm.amdgcn.workitem.id.x. The expected code fetches the input with two
; 128-bit loads, runs the vector normalize / v_cvt_f32_i32 / v_ldexp_f32
; sequence for each of the four elements, and writes the result with a single
; 128-bit store.
471 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
472 ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f32:
474 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
475 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
476 ; GFX6-NEXT: s_mov_b32 s6, 0
477 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0
478 ; GFX6-NEXT: v_mov_b32_e32 v9, 0
479 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
480 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
481 ; GFX6-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
482 ; GFX6-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
483 ; GFX6-NEXT: v_lshlrev_b32_e32 v10, 4, v0
484 ; GFX6-NEXT: v_mov_b32_e32 v11, v9
485 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
486 ; GFX6-NEXT: s_waitcnt vmcnt(1)
487 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
488 ; GFX6-NEXT: v_ffbh_i32_e32 v9, v4
489 ; GFX6-NEXT: v_xor_b32_e32 v12, v1, v2
490 ; GFX6-NEXT: v_ffbh_i32_e32 v13, v2
491 ; GFX6-NEXT: s_waitcnt vmcnt(0)
492 ; GFX6-NEXT: v_xor_b32_e32 v14, v7, v8
493 ; GFX6-NEXT: v_ffbh_i32_e32 v15, v8
494 ; GFX6-NEXT: v_xor_b32_e32 v16, v5, v6
495 ; GFX6-NEXT: v_ffbh_i32_e32 v17, v6
496 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
497 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, -1, v9
498 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v12
499 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, -1, v13
500 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v14
501 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, -1, v15
502 ; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v16
503 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, -1, v17
504 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
505 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, 32, v12
506 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, 32, v14
507 ; GFX6-NEXT: v_add_i32_e32 v16, vcc, 32, v16
508 ; GFX6-NEXT: v_min_u32_e32 v0, v9, v0
509 ; GFX6-NEXT: v_min_u32_e32 v9, v13, v12
510 ; GFX6-NEXT: v_min_u32_e32 v12, v15, v14
511 ; GFX6-NEXT: v_min_u32_e32 v13, v17, v16
512 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
513 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
514 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
515 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 32, v9
516 ; GFX6-NEXT: v_lshl_b64 v[7:8], v[7:8], v12
517 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 32, v12
518 ; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v13
519 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 32, v13
520 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
521 ; GFX6-NEXT: v_min_u32_e32 v0, 1, v0
522 ; GFX6-NEXT: v_min_u32_e32 v7, 1, v7
523 ; GFX6-NEXT: v_min_u32_e32 v5, 1, v5
524 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
525 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
526 ; GFX6-NEXT: v_or_b32_e32 v1, v8, v7
527 ; GFX6-NEXT: v_or_b32_e32 v4, v6, v5
528 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
529 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0
530 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, v1
531 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v4
532 ; GFX6-NEXT: v_ldexp_f32_e32 v3, v3, v14
533 ; GFX6-NEXT: v_ldexp_f32_e32 v2, v0, v2
534 ; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v9
535 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v4, v12
536 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[10:11], s[0:3], 0 addr64
537 ; GFX6-NEXT: s_endpgm
539 ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32:
541 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
542 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
543 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
544 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
545 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
546 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
547 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
548 ; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
549 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
550 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
551 ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
552 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0
553 ; GFX8-NEXT: v_mov_b32_e32 v10, s1
554 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
555 ; GFX8-NEXT: s_waitcnt vmcnt(1)
556 ; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
557 ; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2
558 ; GFX8-NEXT: v_ffbh_i32_e32 v11, v4
559 ; GFX8-NEXT: v_ffbh_i32_e32 v13, v2
560 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
561 ; GFX8-NEXT: s_waitcnt vmcnt(0)
562 ; GFX8-NEXT: v_xor_b32_e32 v14, v7, v8
563 ; GFX8-NEXT: v_xor_b32_e32 v16, v5, v6
564 ; GFX8-NEXT: v_ffbh_i32_e32 v15, v8
565 ; GFX8-NEXT: v_ffbh_i32_e32 v17, v6
566 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12
567 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14
568 ; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16
569 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11
570 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13
571 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15
572 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17
573 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
574 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12
575 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14
576 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16
577 ; GFX8-NEXT: v_min_u32_e32 v0, v11, v0
578 ; GFX8-NEXT: v_min_u32_e32 v11, v13, v12
579 ; GFX8-NEXT: v_min_u32_e32 v12, v15, v14
580 ; GFX8-NEXT: v_min_u32_e32 v13, v17, v16
581 ; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4]
582 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
583 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2]
584 ; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8]
585 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6]
586 ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3
587 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
588 ; GFX8-NEXT: v_min_u32_e32 v7, 1, v7
589 ; GFX8-NEXT: v_min_u32_e32 v5, 1, v5
590 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
591 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
592 ; GFX8-NEXT: v_or_b32_e32 v1, v8, v7
593 ; GFX8-NEXT: v_or_b32_e32 v4, v6, v5
594 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
595 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
596 ; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v1
597 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
598 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
599 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12
600 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13
601 ; GFX8-NEXT: v_ldexp_f32 v1, v3, v14
602 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
603 ; GFX8-NEXT: v_ldexp_f32 v3, v5, v11
604 ; GFX8-NEXT: v_ldexp_f32 v2, v4, v12
605 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
606 ; GFX8-NEXT: s_endpgm
608 ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f32:
610 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
611 ; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0
612 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
613 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8
614 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
615 ; GFX11-NEXT: s_clause 0x1
616 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16
617 ; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3]
618 ; GFX11-NEXT: s_waitcnt vmcnt(1)
619 ; GFX11-NEXT: v_xor_b32_e32 v9, v2, v3
620 ; GFX11-NEXT: v_xor_b32_e32 v11, v0, v1
621 ; GFX11-NEXT: s_waitcnt vmcnt(0)
622 ; GFX11-NEXT: v_xor_b32_e32 v13, v6, v7
623 ; GFX11-NEXT: v_xor_b32_e32 v15, v4, v5
624 ; GFX11-NEXT: v_cls_i32_e32 v10, v3
625 ; GFX11-NEXT: v_cls_i32_e32 v12, v1
626 ; GFX11-NEXT: v_cls_i32_e32 v14, v7
627 ; GFX11-NEXT: v_cls_i32_e32 v16, v5
628 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9
629 ; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11
630 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v13
631 ; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v15
632 ; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10
633 ; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12
634 ; GFX11-NEXT: v_add_nc_u32_e32 v14, -1, v14
635 ; GFX11-NEXT: v_add_nc_u32_e32 v16, -1, v16
636 ; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9
637 ; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11
638 ; GFX11-NEXT: v_add_nc_u32_e32 v13, 32, v13
639 ; GFX11-NEXT: v_add_nc_u32_e32 v15, 32, v15
640 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
641 ; GFX11-NEXT: v_min_u32_e32 v9, v10, v9
642 ; GFX11-NEXT: v_min_u32_e32 v10, v12, v11
643 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
644 ; GFX11-NEXT: v_min_u32_e32 v11, v14, v13
645 ; GFX11-NEXT: v_min_u32_e32 v12, v16, v15
646 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
647 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3]
648 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1]
649 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
650 ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7]
651 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5]
652 ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9
653 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10
654 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
655 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
656 ; GFX11-NEXT: v_min_u32_e32 v6, 1, v6
657 ; GFX11-NEXT: v_min_u32_e32 v4, 1, v4
658 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11
659 ; GFX11-NEXT: v_or_b32_e32 v2, v3, v2
660 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
661 ; GFX11-NEXT: v_or_b32_e32 v1, v7, v6
662 ; GFX11-NEXT: v_or_b32_e32 v3, v5, v4
663 ; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12
664 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
665 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
666 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
667 ; GFX11-NEXT: v_cvt_f32_i32_e32 v5, v3
668 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v8
669 ; GFX11-NEXT: v_ldexp_f32 v3, v2, v9
670 ; GFX11-NEXT: v_ldexp_f32 v2, v0, v10
671 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v11
672 ; GFX11-NEXT: v_ldexp_f32 v0, v5, v4
673 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
674 ; GFX11-NEXT: s_nop 0
675 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
676 ; GFX11-NEXT: s_endpgm
677 %tid = call i32 @llvm.amdgcn.workitem.id.x()
678 %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
679 %out.gep = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid
680 %value = load <4 x i64>, ptr addrspace(1) %in.gep
681 %result = sitofp <4 x i64> %value to <4 x float>
682 store <4 x float> %result, ptr addrspace(1) %out.gep
; sitofp of a <2 x i64> kernel argument (%in) to <2 x half>, stored to %out.
; Each element goes through the scalar normalize / v_cvt_f32_i32 / v_ldexp_f32
; sequence and is then narrowed to f16. The expected packing of the two halves
; into one dword differs per target: the default amdgcn run uses
; v_lshlrev_b32 by 16 plus v_or_b32, tonga uses an SDWA v_cvt_f16_f32 writing
; WORD_1 plus v_or_b32, and gfx1100 uses v_pack_b32_f16.
686 define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{
687 ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f16:
689 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
690 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
691 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
692 ; GFX6-NEXT: s_mov_b32 s2, -1
693 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
694 ; GFX6-NEXT: s_flbit_i32 s8, s7
695 ; GFX6-NEXT: s_xor_b32 s9, s6, s7
696 ; GFX6-NEXT: s_flbit_i32 s10, s5
697 ; GFX6-NEXT: s_xor_b32 s11, s4, s5
698 ; GFX6-NEXT: s_add_i32 s8, s8, -1
699 ; GFX6-NEXT: s_ashr_i32 s9, s9, 31
700 ; GFX6-NEXT: s_add_i32 s10, s10, -1
701 ; GFX6-NEXT: s_ashr_i32 s11, s11, 31
702 ; GFX6-NEXT: s_add_i32 s9, s9, 32
703 ; GFX6-NEXT: s_add_i32 s11, s11, 32
704 ; GFX6-NEXT: s_min_u32 s8, s8, s9
705 ; GFX6-NEXT: s_min_u32 s9, s10, s11
706 ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
707 ; GFX6-NEXT: s_sub_i32 s8, 32, s8
708 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
709 ; GFX6-NEXT: s_sub_i32 s9, 32, s9
710 ; GFX6-NEXT: s_min_u32 s6, s6, 1
711 ; GFX6-NEXT: s_min_u32 s4, s4, 1
712 ; GFX6-NEXT: s_or_b32 s6, s7, s6
713 ; GFX6-NEXT: s_or_b32 s4, s5, s4
714 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6
715 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4
716 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s8
717 ; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s9
718 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
719 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
720 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
721 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
722 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
723 ; GFX6-NEXT: s_endpgm
725 ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16:
727 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
728 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
729 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
730 ; GFX8-NEXT: s_xor_b32 s3, s6, s7
731 ; GFX8-NEXT: s_flbit_i32 s2, s7
732 ; GFX8-NEXT: s_ashr_i32 s3, s3, 31
733 ; GFX8-NEXT: s_add_i32 s2, s2, -1
734 ; GFX8-NEXT: s_add_i32 s3, s3, 32
735 ; GFX8-NEXT: s_min_u32 s8, s2, s3
736 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
737 ; GFX8-NEXT: s_min_u32 s2, s2, 1
738 ; GFX8-NEXT: s_or_b32 s2, s3, s2
739 ; GFX8-NEXT: s_xor_b32 s3, s4, s5
740 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
741 ; GFX8-NEXT: s_flbit_i32 s2, s5
742 ; GFX8-NEXT: s_ashr_i32 s3, s3, 31
743 ; GFX8-NEXT: s_add_i32 s2, s2, -1
744 ; GFX8-NEXT: s_add_i32 s3, s3, 32
745 ; GFX8-NEXT: s_min_u32 s7, s2, s3
746 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s7
747 ; GFX8-NEXT: s_min_u32 s2, s2, 1
748 ; GFX8-NEXT: s_or_b32 s2, s3, s2
749 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s2
750 ; GFX8-NEXT: s_sub_i32 s6, 32, s8
751 ; GFX8-NEXT: s_sub_i32 s2, 32, s7
752 ; GFX8-NEXT: v_ldexp_f32 v0, v0, s6
753 ; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
754 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
755 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
756 ; GFX8-NEXT: v_or_b32_e32 v2, v1, v0
757 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
758 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
759 ; GFX8-NEXT: flat_store_dword v[0:1], v2
760 ; GFX8-NEXT: s_endpgm
762 ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f16:
764 ; GFX11-NEXT: s_clause 0x1
765 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
766 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
767 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
768 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
769 ; GFX11-NEXT: s_xor_b32 s3, s6, s7
770 ; GFX11-NEXT: s_xor_b32 s9, s4, s5
771 ; GFX11-NEXT: s_cls_i32 s2, s7
772 ; GFX11-NEXT: s_cls_i32 s8, s5
773 ; GFX11-NEXT: s_ashr_i32 s3, s3, 31
774 ; GFX11-NEXT: s_ashr_i32 s9, s9, 31
775 ; GFX11-NEXT: s_add_i32 s2, s2, -1
776 ; GFX11-NEXT: s_add_i32 s8, s8, -1
777 ; GFX11-NEXT: s_add_i32 s3, s3, 32
778 ; GFX11-NEXT: s_add_i32 s9, s9, 32
779 ; GFX11-NEXT: s_min_u32 s10, s2, s3
780 ; GFX11-NEXT: s_min_u32 s8, s8, s9
781 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10
782 ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
783 ; GFX11-NEXT: s_min_u32 s2, s2, 1
784 ; GFX11-NEXT: s_min_u32 s4, s4, 1
785 ; GFX11-NEXT: s_or_b32 s2, s3, s2
786 ; GFX11-NEXT: s_or_b32 s3, s5, s4
787 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
788 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, s3
789 ; GFX11-NEXT: s_sub_i32 s2, 32, s10
790 ; GFX11-NEXT: s_sub_i32 s3, 32, s8
791 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
792 ; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
793 ; GFX11-NEXT: v_ldexp_f32 v1, v1, s3
794 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
795 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
796 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
797 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
798 ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
799 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
800 ; GFX11-NEXT: s_nop 0
801 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
802 ; GFX11-NEXT: s_endpgm
803 %result = sitofp <2 x i64> %in to <2 x half>
804 store <2 x half> %result, ptr addrspace(1) %out
; Per-workitem vector case: each workitem loads a <4 x i64> from %in at index
; workitem.id.x, converts it to <4 x half> with sitofp, and stores the result
; to %out at the same index.
;
; The assertion lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; For every 64-bit lane the expected code shifts the value left by a computed
; amount, collapses the low dword of the shifted value to 0/1 (v_min_u32 with 1)
; and ORs it into the high dword, converts that dword with v_cvt_f32_i32,
; rescales with v_ldexp_f32 by (32 - shift amount), and narrows with
; v_cvt_f16_f32 before the four halves are packed into two dwords and stored.
808 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
809 ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f16:
811 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
812 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
813 ; GFX6-NEXT: s_mov_b32 s6, 0
814 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0
815 ; GFX6-NEXT: v_mov_b32_e32 v9, 0
816 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
817 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
818 ; GFX6-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
819 ; GFX6-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
820 ; GFX6-NEXT: v_lshlrev_b32_e32 v10, 3, v0
821 ; GFX6-NEXT: v_mov_b32_e32 v11, v9
822 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
823 ; GFX6-NEXT: s_waitcnt vmcnt(1)
824 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
825 ; GFX6-NEXT: v_ffbh_i32_e32 v9, v4
826 ; GFX6-NEXT: v_xor_b32_e32 v12, v1, v2
827 ; GFX6-NEXT: v_ffbh_i32_e32 v13, v2
828 ; GFX6-NEXT: s_waitcnt vmcnt(0)
829 ; GFX6-NEXT: v_xor_b32_e32 v14, v7, v8
830 ; GFX6-NEXT: v_ffbh_i32_e32 v15, v8
831 ; GFX6-NEXT: v_xor_b32_e32 v16, v5, v6
832 ; GFX6-NEXT: v_ffbh_i32_e32 v17, v6
833 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
834 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, -1, v9
835 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v12
836 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, -1, v13
837 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v14
838 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, -1, v15
839 ; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v16
840 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, -1, v17
841 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
842 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, 32, v12
843 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, 32, v14
844 ; GFX6-NEXT: v_add_i32_e32 v16, vcc, 32, v16
845 ; GFX6-NEXT: v_min_u32_e32 v0, v9, v0
846 ; GFX6-NEXT: v_min_u32_e32 v9, v13, v12
847 ; GFX6-NEXT: v_min_u32_e32 v12, v15, v14
848 ; GFX6-NEXT: v_min_u32_e32 v13, v17, v16
849 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
850 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
851 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
852 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 32, v9
853 ; GFX6-NEXT: v_lshl_b64 v[7:8], v[7:8], v12
854 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 32, v12
855 ; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v13
856 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 32, v13
857 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
858 ; GFX6-NEXT: v_min_u32_e32 v0, 1, v0
859 ; GFX6-NEXT: v_min_u32_e32 v7, 1, v7
860 ; GFX6-NEXT: v_min_u32_e32 v5, 1, v5
861 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
862 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
863 ; GFX6-NEXT: v_or_b32_e32 v1, v8, v7
864 ; GFX6-NEXT: v_or_b32_e32 v4, v6, v5
865 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
866 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0
867 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, v1
868 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v4
869 ; GFX6-NEXT: v_ldexp_f32_e32 v3, v3, v14
870 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2
871 ; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v9
872 ; GFX6-NEXT: v_ldexp_f32_e32 v2, v4, v12
873 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
874 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
875 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
876 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
877 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
878 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1
879 ; GFX6-NEXT: v_or_b32_e32 v1, v0, v3
880 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v4
881 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], v[10:11], s[0:3], 0 addr64
882 ; GFX6-NEXT: s_endpgm
884 ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16:
886 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
887 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
888 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0
889 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
890 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
891 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
892 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
893 ; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
894 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
895 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
896 ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
897 ; GFX8-NEXT: v_mov_b32_e32 v10, s1
898 ; GFX8-NEXT: s_waitcnt vmcnt(1)
899 ; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
900 ; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2
901 ; GFX8-NEXT: v_ffbh_i32_e32 v11, v4
902 ; GFX8-NEXT: v_ffbh_i32_e32 v13, v2
903 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
904 ; GFX8-NEXT: s_waitcnt vmcnt(0)
905 ; GFX8-NEXT: v_xor_b32_e32 v14, v7, v8
906 ; GFX8-NEXT: v_xor_b32_e32 v16, v5, v6
907 ; GFX8-NEXT: v_ffbh_i32_e32 v15, v8
908 ; GFX8-NEXT: v_ffbh_i32_e32 v17, v6
909 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12
910 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14
911 ; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16
912 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11
913 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13
914 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15
915 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17
916 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
917 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12
918 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14
919 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16
920 ; GFX8-NEXT: v_min_u32_e32 v0, v11, v0
921 ; GFX8-NEXT: v_min_u32_e32 v11, v13, v12
922 ; GFX8-NEXT: v_min_u32_e32 v12, v15, v14
923 ; GFX8-NEXT: v_min_u32_e32 v13, v17, v16
924 ; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4]
925 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
926 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2]
927 ; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8]
928 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6]
929 ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3
930 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
931 ; GFX8-NEXT: v_min_u32_e32 v7, 1, v7
932 ; GFX8-NEXT: v_min_u32_e32 v5, 1, v5
933 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
934 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
935 ; GFX8-NEXT: v_or_b32_e32 v1, v8, v7
936 ; GFX8-NEXT: v_or_b32_e32 v4, v6, v5
937 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
938 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
939 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
940 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
941 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
942 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12
943 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13
944 ; GFX8-NEXT: v_ldexp_f32 v3, v3, v14
945 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
946 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v11
947 ; GFX8-NEXT: v_ldexp_f32 v2, v4, v12
948 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
949 ; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0
950 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
951 ; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2
952 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9
953 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc
954 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v3
955 ; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
956 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
957 ; GFX8-NEXT: s_endpgm
959 ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16:
961 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
962 ; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0
963 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
964 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8
965 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
966 ; GFX11-NEXT: s_clause 0x1
967 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16
968 ; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3]
969 ; GFX11-NEXT: s_waitcnt vmcnt(1)
970 ; GFX11-NEXT: v_xor_b32_e32 v9, v2, v3
971 ; GFX11-NEXT: v_xor_b32_e32 v11, v0, v1
972 ; GFX11-NEXT: s_waitcnt vmcnt(0)
973 ; GFX11-NEXT: v_xor_b32_e32 v13, v6, v7
974 ; GFX11-NEXT: v_xor_b32_e32 v15, v4, v5
975 ; GFX11-NEXT: v_cls_i32_e32 v10, v3
976 ; GFX11-NEXT: v_cls_i32_e32 v12, v1
977 ; GFX11-NEXT: v_cls_i32_e32 v14, v7
978 ; GFX11-NEXT: v_cls_i32_e32 v16, v5
979 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9
980 ; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11
981 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v13
982 ; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v15
983 ; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v10
984 ; GFX11-NEXT: v_add_nc_u32_e32 v12, -1, v12
985 ; GFX11-NEXT: v_add_nc_u32_e32 v14, -1, v14
986 ; GFX11-NEXT: v_add_nc_u32_e32 v16, -1, v16
987 ; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9
988 ; GFX11-NEXT: v_add_nc_u32_e32 v11, 32, v11
989 ; GFX11-NEXT: v_add_nc_u32_e32 v13, 32, v13
990 ; GFX11-NEXT: v_add_nc_u32_e32 v15, 32, v15
991 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
992 ; GFX11-NEXT: v_min_u32_e32 v9, v10, v9
993 ; GFX11-NEXT: v_min_u32_e32 v10, v12, v11
994 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
995 ; GFX11-NEXT: v_min_u32_e32 v11, v14, v13
996 ; GFX11-NEXT: v_min_u32_e32 v12, v16, v15
997 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
998 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3]
999 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1]
1000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1001 ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7]
1002 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5]
1003 ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9
1004 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10
1005 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
1006 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
1007 ; GFX11-NEXT: v_min_u32_e32 v6, 1, v6
1008 ; GFX11-NEXT: v_min_u32_e32 v4, 1, v4
1009 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11
1010 ; GFX11-NEXT: v_or_b32_e32 v2, v3, v2
1011 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
1012 ; GFX11-NEXT: v_or_b32_e32 v1, v7, v6
1013 ; GFX11-NEXT: v_or_b32_e32 v3, v5, v4
1014 ; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12
1015 ; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
1016 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
1017 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
1018 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
1019 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v8
1020 ; GFX11-NEXT: v_ldexp_f32 v2, v2, v9
1021 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v10
1022 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v11
1023 ; GFX11-NEXT: v_ldexp_f32 v3, v3, v4
1024 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1025 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
1026 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
1027 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1028 ; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v1
1029 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
1030 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1031 ; GFX11-NEXT: v_pack_b32_f16 v1, v0, v2
1032 ; GFX11-NEXT: v_pack_b32_f16 v0, v3, v4
1033 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
1034 ; GFX11-NEXT: s_nop 0
1035 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1036 ; GFX11-NEXT: s_endpgm
; IR under test: index %in and %out by the workitem id, then load, convert
; with sitofp, and store.
1037 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1038 %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
1039 %out.gep = getelementptr <4 x half>, ptr addrspace(1) %out, i32 %tid
1040 %value = load <4 x i64>, ptr addrspace(1) %in.gep
1041 %result = sitofp <4 x i64> %value to <4 x half>
1042 store <4 x half> %result, ptr addrspace(1) %out.gep
; Intrinsic called by v_sint_to_fp_v4i64_to_v4f16 to obtain the index used for
; its per-workitem load and store addresses.
1046 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is attached to the kernel definitions in this file;
; group #1 is attached to the intrinsic declaration above.
1048 attributes #0 = { nounwind }
1049 attributes #1 = { nounwind readnone }