1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
5 ; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
7 define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
8 ; GFX6-LABEL: s_sint_to_fp_i64_to_f16:
9 ; GFX6: ; %bb.0:
10 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
11 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
12 ; GFX6-NEXT: s_mov_b32 s6, -1
13 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX6-NEXT: s_mov_b32 s4, s0
15 ; GFX6-NEXT: s_mov_b32 s5, s1
16 ; GFX6-NEXT: s_flbit_i32 s0, s3
17 ; GFX6-NEXT: s_xor_b32 s1, s2, s3
18 ; GFX6-NEXT: s_add_i32 s0, s0, -1
19 ; GFX6-NEXT: s_ashr_i32 s1, s1, 31
20 ; GFX6-NEXT: s_add_i32 s1, s1, 32
21 ; GFX6-NEXT: s_min_u32 s8, s0, s1
22 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
23 ; GFX6-NEXT: s_min_u32 s0, s0, 1
24 ; GFX6-NEXT: s_or_b32 s0, s1, s0
25 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0
26 ; GFX6-NEXT: s_sub_i32 s0, 32, s8
27 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
28 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
29 ; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0
30 ; GFX6-NEXT: s_endpgm
31 ;
32 ; GFX8-LABEL: s_sint_to_fp_i64_to_f16:
33 ; GFX8: ; %bb.0:
34 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
35 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
36 ; GFX8-NEXT: s_xor_b32 s5, s2, s3
37 ; GFX8-NEXT: s_flbit_i32 s4, s3
38 ; GFX8-NEXT: s_ashr_i32 s5, s5, 31
39 ; GFX8-NEXT: s_add_i32 s4, s4, -1
40 ; GFX8-NEXT: s_add_i32 s5, s5, 32
41 ; GFX8-NEXT: s_min_u32 s4, s4, s5
42 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
43 ; GFX8-NEXT: s_min_u32 s2, s2, 1
44 ; GFX8-NEXT: s_or_b32 s2, s3, s2
45 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
46 ; GFX8-NEXT: s_sub_i32 s2, 32, s4
47 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
48 ; GFX8-NEXT: v_ldexp_f32 v0, v0, s2
49 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v0
50 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
51 ; GFX8-NEXT: flat_store_short v[0:1], v2
52 ; GFX8-NEXT: s_endpgm
53 %result = sitofp i64 %in to half
54 store half %result, half addrspace(1)* %out
55 ret void
56 }
58 define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
59 ; GFX6-LABEL: v_sint_to_fp_i64_to_f16:
60 ; GFX6: ; %bb.0:
61 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
62 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
63 ; GFX6-NEXT: s_mov_b32 s6, 0
64 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0
65 ; GFX6-NEXT: v_mov_b32_e32 v2, 0
66 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
67 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
68 ; GFX6-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
69 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v0
70 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
71 ; GFX6-NEXT: s_waitcnt vmcnt(0)
72 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
73 ; GFX6-NEXT: v_ffbh_i32_e32 v5, v4
74 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
75 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, -1, v5
76 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
77 ; GFX6-NEXT: v_min_u32_e32 v0, v5, v0
78 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
79 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
80 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
81 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
82 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 32, v0
83 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v3, v0
84 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
85 ; GFX6-NEXT: buffer_store_short v0, v[1:2], s[0:3], 0 addr64
86 ; GFX6-NEXT: s_endpgm
87 ;
88 ; GFX8-LABEL: v_sint_to_fp_i64_to_f16:
89 ; GFX8: ; %bb.0:
90 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
91 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
92 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
93 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
94 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
95 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
96 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
97 ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
98 ; GFX8-NEXT: s_waitcnt vmcnt(0)
99 ; GFX8-NEXT: v_xor_b32_e32 v3, v1, v2
100 ; GFX8-NEXT: v_ffbh_i32_e32 v4, v2
101 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v3
102 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
103 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 32, v3
104 ; GFX8-NEXT: v_min_u32_e32 v3, v4, v3
105 ; GFX8-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2]
106 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v3
107 ; GFX8-NEXT: v_min_u32_e32 v1, 1, v1
108 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
109 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
110 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
111 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
112 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v3
113 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v1
114 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
115 ; GFX8-NEXT: flat_store_short v[0:1], v3
116 ; GFX8-NEXT: s_endpgm
117 %tid = call i32 @llvm.amdgcn.workitem.id.x()
118 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
119 %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
120 %val = load i64, i64 addrspace(1)* %in.gep
121 %result = sitofp i64 %val to half
122 store half %result, half addrspace(1)* %out.gep
123 ret void
124 }
126 define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
127 ; GFX6-LABEL: s_sint_to_fp_i64_to_f32:
128 ; GFX6: ; %bb.0:
129 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
130 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
131 ; GFX6-NEXT: s_mov_b32 s6, -1
132 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
133 ; GFX6-NEXT: s_mov_b32 s4, s0
134 ; GFX6-NEXT: s_mov_b32 s5, s1
135 ; GFX6-NEXT: s_flbit_i32 s0, s3
136 ; GFX6-NEXT: s_xor_b32 s1, s2, s3
137 ; GFX6-NEXT: s_add_i32 s0, s0, -1
138 ; GFX6-NEXT: s_ashr_i32 s1, s1, 31
139 ; GFX6-NEXT: s_add_i32 s1, s1, 32
140 ; GFX6-NEXT: s_min_u32 s8, s0, s1
141 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
142 ; GFX6-NEXT: s_min_u32 s0, s0, 1
143 ; GFX6-NEXT: s_or_b32 s0, s1, s0
144 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0
145 ; GFX6-NEXT: s_sub_i32 s0, 32, s8
146 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
147 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
148 ; GFX6-NEXT: s_endpgm
149 ;
150 ; GFX8-LABEL: s_sint_to_fp_i64_to_f32:
151 ; GFX8: ; %bb.0:
152 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
153 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX8-NEXT: s_xor_b32 s5, s2, s3
155 ; GFX8-NEXT: s_flbit_i32 s4, s3
156 ; GFX8-NEXT: s_ashr_i32 s5, s5, 31
157 ; GFX8-NEXT: s_add_i32 s4, s4, -1
158 ; GFX8-NEXT: s_add_i32 s5, s5, 32
159 ; GFX8-NEXT: s_min_u32 s4, s4, s5
160 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
161 ; GFX8-NEXT: s_min_u32 s2, s2, 1
162 ; GFX8-NEXT: s_or_b32 s2, s3, s2
163 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
164 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
165 ; GFX8-NEXT: s_sub_i32 s0, 32, s4
166 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
167 ; GFX8-NEXT: v_ldexp_f32 v2, v2, s0
168 ; GFX8-NEXT: flat_store_dword v[0:1], v2
169 ; GFX8-NEXT: s_endpgm
170 %result = sitofp i64 %in to float
171 store float %result, float addrspace(1)* %out
172 ret void
173 }
175 define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
176 ; GFX6-LABEL: v_sint_to_fp_i64_to_f32:
177 ; GFX6: ; %bb.0:
178 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
179 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
180 ; GFX6-NEXT: s_mov_b32 s6, 0
181 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0
182 ; GFX6-NEXT: v_mov_b32_e32 v2, 0
183 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
184 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
185 ; GFX6-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
186 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v0
187 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
188 ; GFX6-NEXT: s_waitcnt vmcnt(0)
189 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
190 ; GFX6-NEXT: v_ffbh_i32_e32 v5, v4
191 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
192 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, -1, v5
193 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
194 ; GFX6-NEXT: v_min_u32_e32 v0, v5, v0
195 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
196 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
197 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
198 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
199 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 32, v0
200 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v3, v0
201 ; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
202 ; GFX6-NEXT: s_endpgm
203 ;
204 ; GFX8-LABEL: v_sint_to_fp_i64_to_f32:
205 ; GFX8: ; %bb.0:
206 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
207 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
208 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
209 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
210 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
211 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
212 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
213 ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
214 ; GFX8-NEXT: s_waitcnt vmcnt(0)
215 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v2
216 ; GFX8-NEXT: v_ffbh_i32_e32 v4, v2
217 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
218 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
219 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
220 ; GFX8-NEXT: v_min_u32_e32 v4, v4, v0
221 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[1:2]
222 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
223 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
224 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
225 ; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0
226 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
227 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
228 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
229 ; GFX8-NEXT: v_ldexp_f32 v2, v5, v2
230 ; GFX8-NEXT: flat_store_dword v[0:1], v2
231 ; GFX8-NEXT: s_endpgm
232 %tid = call i32 @llvm.amdgcn.workitem.id.x()
233 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
234 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
235 %val = load i64, i64 addrspace(1)* %in.gep
236 %result = sitofp i64 %val to float
237 store float %result, float addrspace(1)* %out.gep
238 ret void
239 }
241 define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
242 ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f32:
243 ; GFX6: ; %bb.0:
244 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
245 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
246 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
247 ; GFX6-NEXT: s_mov_b32 s2, -1
248 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
249 ; GFX6-NEXT: s_flbit_i32 s8, s7
250 ; GFX6-NEXT: s_xor_b32 s9, s6, s7
251 ; GFX6-NEXT: s_flbit_i32 s10, s5
252 ; GFX6-NEXT: s_xor_b32 s11, s4, s5
253 ; GFX6-NEXT: s_add_i32 s8, s8, -1
254 ; GFX6-NEXT: s_ashr_i32 s9, s9, 31
255 ; GFX6-NEXT: s_add_i32 s10, s10, -1
256 ; GFX6-NEXT: s_ashr_i32 s11, s11, 31
257 ; GFX6-NEXT: s_add_i32 s9, s9, 32
258 ; GFX6-NEXT: s_add_i32 s11, s11, 32
259 ; GFX6-NEXT: s_min_u32 s8, s8, s9
260 ; GFX6-NEXT: s_min_u32 s9, s10, s11
261 ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
262 ; GFX6-NEXT: s_sub_i32 s8, 32, s8
263 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
264 ; GFX6-NEXT: s_sub_i32 s9, 32, s9
265 ; GFX6-NEXT: s_min_u32 s6, s6, 1
266 ; GFX6-NEXT: s_min_u32 s4, s4, 1
267 ; GFX6-NEXT: s_or_b32 s6, s7, s6
268 ; GFX6-NEXT: s_or_b32 s4, s5, s4
269 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6
270 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
271 ; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s8
272 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s9
273 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
274 ; GFX6-NEXT: s_endpgm
275 ;
276 ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32:
277 ; GFX8: ; %bb.0:
278 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
279 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
280 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
281 ; GFX8-NEXT: s_xor_b32 s3, s6, s7
282 ; GFX8-NEXT: s_flbit_i32 s2, s7
283 ; GFX8-NEXT: s_ashr_i32 s3, s3, 31
284 ; GFX8-NEXT: s_add_i32 s2, s2, -1
285 ; GFX8-NEXT: s_add_i32 s3, s3, 32
286 ; GFX8-NEXT: s_min_u32 s9, s2, s3
287 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
288 ; GFX8-NEXT: s_min_u32 s2, s2, 1
289 ; GFX8-NEXT: s_or_b32 s2, s3, s2
290 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
291 ; GFX8-NEXT: s_xor_b32 s2, s4, s5
292 ; GFX8-NEXT: s_flbit_i32 s8, s5
293 ; GFX8-NEXT: s_ashr_i32 s2, s2, 31
294 ; GFX8-NEXT: s_add_i32 s8, s8, -1
295 ; GFX8-NEXT: s_add_i32 s2, s2, 32
296 ; GFX8-NEXT: s_min_u32 s6, s8, s2
297 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
298 ; GFX8-NEXT: s_min_u32 s2, s2, 1
299 ; GFX8-NEXT: s_or_b32 s2, s3, s2
300 ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
301 ; GFX8-NEXT: s_sub_i32 s2, 32, s9
302 ; GFX8-NEXT: v_ldexp_f32 v1, v0, s2
303 ; GFX8-NEXT: s_sub_i32 s2, 32, s6
304 ; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
305 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
306 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
307 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
308 ; GFX8-NEXT: s_endpgm
309 %result = sitofp <2 x i64> %in to <2 x float>
310 store <2 x float> %result, <2 x float> addrspace(1)* %out
311 ret void
312 }
314 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
315 ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f32:
316 ; GFX6: ; %bb.0:
317 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
318 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
319 ; GFX6-NEXT: s_mov_b32 s6, 0
320 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0
321 ; GFX6-NEXT: v_mov_b32_e32 v9, 0
322 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
323 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
324 ; GFX6-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
325 ; GFX6-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
326 ; GFX6-NEXT: v_lshlrev_b32_e32 v10, 4, v0
327 ; GFX6-NEXT: v_mov_b32_e32 v11, v9
328 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
329 ; GFX6-NEXT: s_waitcnt vmcnt(1)
330 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
331 ; GFX6-NEXT: v_ffbh_i32_e32 v9, v4
332 ; GFX6-NEXT: v_xor_b32_e32 v12, v1, v2
333 ; GFX6-NEXT: v_ffbh_i32_e32 v13, v2
334 ; GFX6-NEXT: s_waitcnt vmcnt(0)
335 ; GFX6-NEXT: v_xor_b32_e32 v14, v7, v8
336 ; GFX6-NEXT: v_ffbh_i32_e32 v15, v8
337 ; GFX6-NEXT: v_xor_b32_e32 v16, v5, v6
338 ; GFX6-NEXT: v_ffbh_i32_e32 v17, v6
339 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
340 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, -1, v9
341 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v12
342 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, -1, v13
343 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v14
344 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, -1, v15
345 ; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v16
346 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, -1, v17
347 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
348 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, 32, v12
349 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, 32, v14
350 ; GFX6-NEXT: v_add_i32_e32 v16, vcc, 32, v16
351 ; GFX6-NEXT: v_min_u32_e32 v0, v9, v0
352 ; GFX6-NEXT: v_min_u32_e32 v9, v13, v12
353 ; GFX6-NEXT: v_min_u32_e32 v12, v15, v14
354 ; GFX6-NEXT: v_min_u32_e32 v13, v17, v16
355 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
356 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
357 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
358 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 32, v9
359 ; GFX6-NEXT: v_lshl_b64 v[7:8], v[7:8], v12
360 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 32, v12
361 ; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v13
362 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 32, v13
363 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
364 ; GFX6-NEXT: v_min_u32_e32 v0, 1, v0
365 ; GFX6-NEXT: v_min_u32_e32 v7, 1, v7
366 ; GFX6-NEXT: v_min_u32_e32 v5, 1, v5
367 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
368 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
369 ; GFX6-NEXT: v_or_b32_e32 v1, v8, v7
370 ; GFX6-NEXT: v_or_b32_e32 v4, v6, v5
371 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
372 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0
373 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, v1
374 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v4
375 ; GFX6-NEXT: v_ldexp_f32_e32 v3, v3, v14
376 ; GFX6-NEXT: v_ldexp_f32_e32 v2, v0, v2
377 ; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v9
378 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v4, v12
379 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[10:11], s[0:3], 0 addr64
380 ; GFX6-NEXT: s_endpgm
381 ;
382 ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32:
383 ; GFX8: ; %bb.0:
384 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
385 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
386 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
387 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
388 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
389 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
390 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
391 ; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
392 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
393 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
394 ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
395 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0
396 ; GFX8-NEXT: v_mov_b32_e32 v10, s1
397 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
398 ; GFX8-NEXT: s_waitcnt vmcnt(1)
399 ; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
400 ; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2
401 ; GFX8-NEXT: v_ffbh_i32_e32 v11, v4
402 ; GFX8-NEXT: v_ffbh_i32_e32 v13, v2
403 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
404 ; GFX8-NEXT: s_waitcnt vmcnt(0)
405 ; GFX8-NEXT: v_xor_b32_e32 v14, v7, v8
406 ; GFX8-NEXT: v_xor_b32_e32 v16, v5, v6
407 ; GFX8-NEXT: v_ffbh_i32_e32 v15, v8
408 ; GFX8-NEXT: v_ffbh_i32_e32 v17, v6
409 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12
410 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14
411 ; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16
412 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11
413 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13
414 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15
415 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17
416 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
417 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12
418 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14
419 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16
420 ; GFX8-NEXT: v_min_u32_e32 v0, v11, v0
421 ; GFX8-NEXT: v_min_u32_e32 v11, v13, v12
422 ; GFX8-NEXT: v_min_u32_e32 v12, v15, v14
423 ; GFX8-NEXT: v_min_u32_e32 v13, v17, v16
424 ; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4]
425 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
426 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2]
427 ; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8]
428 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6]
429 ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3
430 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
431 ; GFX8-NEXT: v_min_u32_e32 v7, 1, v7
432 ; GFX8-NEXT: v_min_u32_e32 v5, 1, v5
433 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
434 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
435 ; GFX8-NEXT: v_or_b32_e32 v1, v8, v7
436 ; GFX8-NEXT: v_or_b32_e32 v4, v6, v5
437 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
438 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
439 ; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v1
440 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
441 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
442 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12
443 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13
444 ; GFX8-NEXT: v_ldexp_f32 v1, v3, v14
445 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
446 ; GFX8-NEXT: v_ldexp_f32 v3, v5, v11
447 ; GFX8-NEXT: v_ldexp_f32 v2, v4, v12
448 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
449 ; GFX8-NEXT: s_endpgm
450 %tid = call i32 @llvm.amdgcn.workitem.id.x()
451 %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
452 %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
453 %value = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep
454 %result = sitofp <4 x i64> %value to <4 x float>
455 store <4 x float> %result, <4 x float> addrspace(1)* %out.gep
456 ret void
457 }
459 define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
460 ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f16:
461 ; GFX6: ; %bb.0:
462 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
463 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
464 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
465 ; GFX6-NEXT: s_mov_b32 s2, -1
466 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
467 ; GFX6-NEXT: s_flbit_i32 s8, s7
468 ; GFX6-NEXT: s_xor_b32 s9, s6, s7
469 ; GFX6-NEXT: s_flbit_i32 s10, s5
470 ; GFX6-NEXT: s_xor_b32 s11, s4, s5
471 ; GFX6-NEXT: s_add_i32 s8, s8, -1
472 ; GFX6-NEXT: s_ashr_i32 s9, s9, 31
473 ; GFX6-NEXT: s_add_i32 s10, s10, -1
474 ; GFX6-NEXT: s_ashr_i32 s11, s11, 31
475 ; GFX6-NEXT: s_add_i32 s9, s9, 32
476 ; GFX6-NEXT: s_add_i32 s11, s11, 32
477 ; GFX6-NEXT: s_min_u32 s8, s8, s9
478 ; GFX6-NEXT: s_min_u32 s9, s10, s11
479 ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
480 ; GFX6-NEXT: s_sub_i32 s8, 32, s8
481 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
482 ; GFX6-NEXT: s_sub_i32 s9, 32, s9
483 ; GFX6-NEXT: s_min_u32 s6, s6, 1
484 ; GFX6-NEXT: s_min_u32 s4, s4, 1
485 ; GFX6-NEXT: s_or_b32 s6, s7, s6
486 ; GFX6-NEXT: s_or_b32 s4, s5, s4
487 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6
488 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4
489 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s8
490 ; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s9
491 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
492 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
493 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
494 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
495 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
496 ; GFX6-NEXT: s_endpgm
497 ;
498 ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16:
499 ; GFX8: ; %bb.0:
500 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
501 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
502 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
503 ; GFX8-NEXT: s_xor_b32 s3, s6, s7
504 ; GFX8-NEXT: s_flbit_i32 s2, s7
505 ; GFX8-NEXT: s_ashr_i32 s3, s3, 31
506 ; GFX8-NEXT: s_add_i32 s2, s2, -1
507 ; GFX8-NEXT: s_add_i32 s3, s3, 32
508 ; GFX8-NEXT: s_min_u32 s8, s2, s3
509 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
510 ; GFX8-NEXT: s_min_u32 s2, s2, 1
511 ; GFX8-NEXT: s_or_b32 s2, s3, s2
512 ; GFX8-NEXT: s_xor_b32 s3, s4, s5
513 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
514 ; GFX8-NEXT: s_flbit_i32 s2, s5
515 ; GFX8-NEXT: s_ashr_i32 s3, s3, 31
516 ; GFX8-NEXT: s_add_i32 s2, s2, -1
517 ; GFX8-NEXT: s_add_i32 s3, s3, 32
518 ; GFX8-NEXT: s_min_u32 s7, s2, s3
519 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s7
520 ; GFX8-NEXT: s_min_u32 s2, s2, 1
521 ; GFX8-NEXT: s_or_b32 s2, s3, s2
522 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s2
523 ; GFX8-NEXT: s_sub_i32 s6, 32, s8
524 ; GFX8-NEXT: s_sub_i32 s2, 32, s7
525 ; GFX8-NEXT: v_ldexp_f32 v0, v0, s6
526 ; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
527 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
528 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
529 ; GFX8-NEXT: v_or_b32_e32 v2, v1, v0
530 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
531 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
532 ; GFX8-NEXT: flat_store_dword v[0:1], v2
533 ; GFX8-NEXT: s_endpgm
534 %result = sitofp <2 x i64> %in to <2 x half>
535 store <2 x half> %result, <2 x half> addrspace(1)* %out
536 ret void
537 }
539 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
540 ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f16:
541 ; GFX6: ; %bb.0:
542 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
543 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
544 ; GFX6-NEXT: s_mov_b32 s6, 0
545 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0
546 ; GFX6-NEXT: v_mov_b32_e32 v9, 0
547 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
548 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
549 ; GFX6-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
550 ; GFX6-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
551 ; GFX6-NEXT: v_lshlrev_b32_e32 v10, 3, v0
552 ; GFX6-NEXT: v_mov_b32_e32 v11, v9
553 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
554 ; GFX6-NEXT: s_waitcnt vmcnt(1)
555 ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4
556 ; GFX6-NEXT: v_ffbh_i32_e32 v9, v4
557 ; GFX6-NEXT: v_xor_b32_e32 v12, v1, v2
558 ; GFX6-NEXT: v_ffbh_i32_e32 v13, v2
559 ; GFX6-NEXT: s_waitcnt vmcnt(0)
560 ; GFX6-NEXT: v_xor_b32_e32 v14, v7, v8
561 ; GFX6-NEXT: v_ffbh_i32_e32 v15, v8
562 ; GFX6-NEXT: v_xor_b32_e32 v16, v5, v6
563 ; GFX6-NEXT: v_ffbh_i32_e32 v17, v6
564 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0
565 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, -1, v9
566 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v12
567 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, -1, v13
568 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v14
569 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, -1, v15
570 ; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v16
571 ; GFX6-NEXT: v_add_i32_e32 v17, vcc, -1, v17
572 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0
573 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, 32, v12
574 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, 32, v14
575 ; GFX6-NEXT: v_add_i32_e32 v16, vcc, 32, v16
576 ; GFX6-NEXT: v_min_u32_e32 v0, v9, v0
577 ; GFX6-NEXT: v_min_u32_e32 v9, v13, v12
578 ; GFX6-NEXT: v_min_u32_e32 v12, v15, v14
579 ; GFX6-NEXT: v_min_u32_e32 v13, v17, v16
580 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
581 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
582 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
583 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 32, v9
584 ; GFX6-NEXT: v_lshl_b64 v[7:8], v[7:8], v12
585 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 32, v12
586 ; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v13
587 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 32, v13
588 ; GFX6-NEXT: v_min_u32_e32 v3, 1, v3
589 ; GFX6-NEXT: v_min_u32_e32 v0, 1, v0
590 ; GFX6-NEXT: v_min_u32_e32 v7, 1, v7
591 ; GFX6-NEXT: v_min_u32_e32 v5, 1, v5
592 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3
593 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
594 ; GFX6-NEXT: v_or_b32_e32 v1, v8, v7
595 ; GFX6-NEXT: v_or_b32_e32 v4, v6, v5
596 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3
597 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0
598 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, v1
599 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v4
600 ; GFX6-NEXT: v_ldexp_f32_e32 v3, v3, v14
601 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2
602 ; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v9
603 ; GFX6-NEXT: v_ldexp_f32_e32 v2, v4, v12
604 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
605 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
606 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
607 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
608 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
609 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1
610 ; GFX6-NEXT: v_or_b32_e32 v1, v0, v3
611 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v4
612 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], v[10:11], s[0:3], 0 addr64
613 ; GFX6-NEXT: s_endpgm
614 ;
615 ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16:
616 ; GFX8: ; %bb.0:
617 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
618 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
619 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0
620 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
621 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
622 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
623 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
624 ; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
625 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
626 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
627 ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
628 ; GFX8-NEXT: v_mov_b32_e32 v10, s1
629 ; GFX8-NEXT: s_waitcnt vmcnt(1)
630 ; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
631 ; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2
632 ; GFX8-NEXT: v_ffbh_i32_e32 v11, v4
633 ; GFX8-NEXT: v_ffbh_i32_e32 v13, v2
634 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0
635 ; GFX8-NEXT: s_waitcnt vmcnt(0)
636 ; GFX8-NEXT: v_xor_b32_e32 v14, v7, v8
637 ; GFX8-NEXT: v_xor_b32_e32 v16, v5, v6
638 ; GFX8-NEXT: v_ffbh_i32_e32 v15, v8
639 ; GFX8-NEXT: v_ffbh_i32_e32 v17, v6
640 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12
641 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14
642 ; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16
643 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11
644 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13
645 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15
646 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17
647 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
648 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12
649 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14
650 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16
651 ; GFX8-NEXT: v_min_u32_e32 v0, v11, v0
652 ; GFX8-NEXT: v_min_u32_e32 v11, v13, v12
653 ; GFX8-NEXT: v_min_u32_e32 v12, v15, v14
654 ; GFX8-NEXT: v_min_u32_e32 v13, v17, v16
655 ; GFX8-NEXT: v_lshlrev_b64 v[3:4], v0, v[3:4]
656 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
657 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v11, v[1:2]
658 ; GFX8-NEXT: v_lshlrev_b64 v[7:8], v12, v[7:8]
659 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v13, v[5:6]
660 ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3
661 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
662 ; GFX8-NEXT: v_min_u32_e32 v7, 1, v7
663 ; GFX8-NEXT: v_min_u32_e32 v5, 1, v5
664 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
665 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
666 ; GFX8-NEXT: v_or_b32_e32 v1, v8, v7
667 ; GFX8-NEXT: v_or_b32_e32 v4, v6, v5
668 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3
669 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
670 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
671 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4
672 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v11
673 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v12
674 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v13
675 ; GFX8-NEXT: v_ldexp_f32 v3, v3, v14
676 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
677 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v11
678 ; GFX8-NEXT: v_ldexp_f32 v2, v4, v12
679 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
680 ; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0
681 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
682 ; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2
683 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9
684 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc
685 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v3
686 ; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
687 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
688 ; GFX8-NEXT: s_endpgm
689 %tid = call i32 @llvm.amdgcn.workitem.id.x()
690 %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
691 %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid
692 %value = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep
693 %result = sitofp <4 x i64> %value to <4 x half>
694 store <4 x half> %result, <4 x half> addrspace(1)* %out.gep
695 ret void
696 }
698 declare i32 @llvm.amdgcn.workitem.id.x() #1
699 ; Attribute groups: #0 is applied to the kernels above, #1 to the intrinsic.
700 attributes #0 = { nounwind }
701 attributes #1 = { nounwind readnone }