1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
; Scalar i16 case: loads an i16 from %a, converts it to half with sitofp, and
; stores the result through %r.
; Per the autogenerated checks below, the tahiti run sign-extends on load
; (buffer_load_sshort) and converts through f32 (v_cvt_f32_i32 followed by
; v_cvt_f16_f32), while the fiji and gfx1100 runs convert directly with
; v_cvt_f16_i16. The two gfx1100 runs differ only in operand form: the
; +real-true16 run uses 16-bit register halves (v0.l), the -real-true16 run
; uses full 32-bit VGPR names.
7 define amdgpu_kernel void @sitofp_i16_to_f16(
8 ; SI-LABEL: sitofp_i16_to_f16:
9 ; SI: ; %bb.0: ; %entry
10 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
11 ; SI-NEXT: s_mov_b32 s7, 0xf000
12 ; SI-NEXT: s_mov_b32 s6, -1
13 ; SI-NEXT: s_mov_b32 s10, s6
14 ; SI-NEXT: s_mov_b32 s11, s7
15 ; SI-NEXT: s_waitcnt lgkmcnt(0)
16 ; SI-NEXT: s_mov_b32 s8, s2
17 ; SI-NEXT: s_mov_b32 s9, s3
18 ; SI-NEXT: buffer_load_sshort v0, off, s[8:11], 0
19 ; SI-NEXT: s_mov_b32 s4, s0
20 ; SI-NEXT: s_mov_b32 s5, s1
21 ; SI-NEXT: s_waitcnt vmcnt(0)
22 ; SI-NEXT: v_cvt_f32_i32_e32 v0, v0
23 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
24 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
27 ; VI-LABEL: sitofp_i16_to_f16:
28 ; VI: ; %bb.0: ; %entry
29 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
30 ; VI-NEXT: s_mov_b32 s7, 0xf000
31 ; VI-NEXT: s_mov_b32 s6, -1
32 ; VI-NEXT: s_mov_b32 s10, s6
33 ; VI-NEXT: s_mov_b32 s11, s7
34 ; VI-NEXT: s_waitcnt lgkmcnt(0)
35 ; VI-NEXT: s_mov_b32 s8, s2
36 ; VI-NEXT: s_mov_b32 s9, s3
37 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
38 ; VI-NEXT: s_mov_b32 s4, s0
39 ; VI-NEXT: s_mov_b32 s5, s1
40 ; VI-NEXT: s_waitcnt vmcnt(0)
41 ; VI-NEXT: v_cvt_f16_i16_e32 v0, v0
42 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
45 ; GFX11-TRUE16-LABEL: sitofp_i16_to_f16:
46 ; GFX11-TRUE16: ; %bb.0: ; %entry
47 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
48 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
49 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
50 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
51 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
52 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
53 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
54 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
55 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
56 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
57 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
58 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
59 ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l
60 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
61 ; GFX11-TRUE16-NEXT: s_endpgm
63 ; GFX11-FAKE16-LABEL: sitofp_i16_to_f16:
64 ; GFX11-FAKE16: ; %bb.0: ; %entry
65 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
66 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
67 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
68 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
69 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
70 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
71 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
72 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
73 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
74 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
75 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
76 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
77 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0
78 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
79 ; GFX11-FAKE16-NEXT: s_endpgm
81 ptr addrspace(1) %a) {
83 %a.val = load i16, ptr addrspace(1) %a
84 %r.val = sitofp i16 %a.val to half
85 store half %r.val, ptr addrspace(1) %r
; Scalar i32 case: loads an i32 from %a, converts it to half with sitofp, and
; stores the result through %r.
; Per the autogenerated checks below, every run configuration lowers this as a
; two-step conversion through f32: v_cvt_f32_i32 followed by v_cvt_f16_f32.
; The gfx1100 runs additionally expect an s_delay_alu between the two
; conversions.
89 define amdgpu_kernel void @sitofp_i32_to_f16(
90 ; SI-LABEL: sitofp_i32_to_f16:
91 ; SI: ; %bb.0: ; %entry
92 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
93 ; SI-NEXT: s_mov_b32 s7, 0xf000
94 ; SI-NEXT: s_mov_b32 s6, -1
95 ; SI-NEXT: s_mov_b32 s10, s6
96 ; SI-NEXT: s_mov_b32 s11, s7
97 ; SI-NEXT: s_waitcnt lgkmcnt(0)
98 ; SI-NEXT: s_mov_b32 s8, s2
99 ; SI-NEXT: s_mov_b32 s9, s3
100 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
101 ; SI-NEXT: s_mov_b32 s4, s0
102 ; SI-NEXT: s_mov_b32 s5, s1
103 ; SI-NEXT: s_waitcnt vmcnt(0)
104 ; SI-NEXT: v_cvt_f32_i32_e32 v0, v0
105 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
106 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
109 ; VI-LABEL: sitofp_i32_to_f16:
110 ; VI: ; %bb.0: ; %entry
111 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
112 ; VI-NEXT: s_mov_b32 s7, 0xf000
113 ; VI-NEXT: s_mov_b32 s6, -1
114 ; VI-NEXT: s_mov_b32 s10, s6
115 ; VI-NEXT: s_mov_b32 s11, s7
116 ; VI-NEXT: s_waitcnt lgkmcnt(0)
117 ; VI-NEXT: s_mov_b32 s8, s2
118 ; VI-NEXT: s_mov_b32 s9, s3
119 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
120 ; VI-NEXT: s_mov_b32 s4, s0
121 ; VI-NEXT: s_mov_b32 s5, s1
122 ; VI-NEXT: s_waitcnt vmcnt(0)
123 ; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
124 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
125 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
128 ; GFX11-TRUE16-LABEL: sitofp_i32_to_f16:
129 ; GFX11-TRUE16: ; %bb.0: ; %entry
130 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
131 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
132 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
133 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
134 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
135 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
136 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
137 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
138 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
139 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
140 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
141 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
142 ; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
143 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
144 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
145 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
146 ; GFX11-TRUE16-NEXT: s_endpgm
148 ; GFX11-FAKE16-LABEL: sitofp_i32_to_f16:
149 ; GFX11-FAKE16: ; %bb.0: ; %entry
150 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
151 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
152 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
153 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
154 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
155 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
156 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
157 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
158 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
159 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
160 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
161 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
162 ; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
163 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
164 ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
165 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
166 ; GFX11-FAKE16-NEXT: s_endpgm
168 ptr addrspace(1) %a) {
170 %a.val = load i32, ptr addrspace(1) %a
171 %r.val = sitofp i32 %a.val to half
172 store half %r.val, ptr addrspace(1) %r
176 ; f16 = sitofp i64 is in sint_to_fp.i64.ll
; Vector <2 x i16> case: loads both elements from %a as one dword, converts
; them to <2 x half> with sitofp, and stores the packed result through %r.
; Per the autogenerated checks below:
;  - tahiti sign-extends each element (v_bfe_i32 for the low half,
;    v_ashrrev_i32 for the high half), converts each through f32, then packs
;    with a 16-bit left shift and v_or_b32;
;  - fiji converts the high word with an SDWA v_cvt_f16_i16 that writes
;    WORD_1, the low word with a plain v_cvt_f16_i16, and combines them with
;    v_or_b32;
;  - gfx1100 converts each element with v_cvt_f16_i16 and packs with
;    v_pack_b32_f16; the +real-true16 run also expects v_mov_b16 copies
;    between register halves before the pack.
178 define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
179 ; SI-LABEL: sitofp_v2i16_to_v2f16:
180 ; SI: ; %bb.0: ; %entry
181 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
182 ; SI-NEXT: s_mov_b32 s7, 0xf000
183 ; SI-NEXT: s_mov_b32 s6, -1
184 ; SI-NEXT: s_mov_b32 s10, s6
185 ; SI-NEXT: s_mov_b32 s11, s7
186 ; SI-NEXT: s_waitcnt lgkmcnt(0)
187 ; SI-NEXT: s_mov_b32 s8, s2
188 ; SI-NEXT: s_mov_b32 s9, s3
189 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
190 ; SI-NEXT: s_mov_b32 s4, s0
191 ; SI-NEXT: s_mov_b32 s5, s1
192 ; SI-NEXT: s_waitcnt vmcnt(0)
193 ; SI-NEXT: v_bfe_i32 v1, v0, 0, 16
194 ; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0
195 ; SI-NEXT: v_cvt_f32_i32_e32 v0, v0
196 ; SI-NEXT: v_cvt_f32_i32_e32 v1, v1
197 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
198 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
199 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
200 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
201 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
204 ; VI-LABEL: sitofp_v2i16_to_v2f16:
205 ; VI: ; %bb.0: ; %entry
206 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
207 ; VI-NEXT: s_mov_b32 s7, 0xf000
208 ; VI-NEXT: s_mov_b32 s6, -1
209 ; VI-NEXT: s_mov_b32 s10, s6
210 ; VI-NEXT: s_mov_b32 s11, s7
211 ; VI-NEXT: s_waitcnt lgkmcnt(0)
212 ; VI-NEXT: s_mov_b32 s8, s2
213 ; VI-NEXT: s_mov_b32 s9, s3
214 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
215 ; VI-NEXT: s_mov_b32 s4, s0
216 ; VI-NEXT: s_mov_b32 s5, s1
217 ; VI-NEXT: s_waitcnt vmcnt(0)
218 ; VI-NEXT: v_cvt_f16_i16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
219 ; VI-NEXT: v_cvt_f16_i16_e32 v0, v0
220 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
221 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
224 ; GFX11-TRUE16-LABEL: sitofp_v2i16_to_v2f16:
225 ; GFX11-TRUE16: ; %bb.0: ; %entry
226 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
227 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
228 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
229 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
230 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
231 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
232 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
233 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
234 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
235 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
236 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
237 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
238 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
239 ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l
240 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
241 ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l
242 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
243 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
244 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
245 ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
246 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
247 ; GFX11-TRUE16-NEXT: s_endpgm
249 ; GFX11-FAKE16-LABEL: sitofp_v2i16_to_v2f16:
250 ; GFX11-FAKE16: ; %bb.0: ; %entry
251 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
252 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
253 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
254 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
255 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
256 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
257 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
258 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
259 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
260 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
261 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
262 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
263 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
264 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0
265 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
266 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v1, v1
267 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
268 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
269 ; GFX11-FAKE16-NEXT: s_endpgm
271 ptr addrspace(1) %a) {
273 %a.val = load <2 x i16>, ptr addrspace(1) %a
274 %r.val = sitofp <2 x i16> %a.val to <2 x half>
275 store <2 x half> %r.val, ptr addrspace(1) %r
; Vector <2 x i32> case: loads both elements from %a with a single 64-bit
; load, converts them to <2 x half> with sitofp, and stores the packed result
; through %r.
; Per the autogenerated checks below, every run converts each element through
; f32 (v_cvt_f32_i32 then v_cvt_f16_f32); only the packing differs:
;  - tahiti shifts the second element left by 16 and combines with v_or_b32;
;  - fiji has the second element's v_cvt_f16_f32 write WORD_1 via SDWA, then
;    combines with v_or_b32;
;  - gfx1100 packs with v_pack_b32_f16; the +real-true16 run also expects
;    v_mov_b16 copies between register halves before the pack.
279 define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
280 ; SI-LABEL: sitofp_v2i32_to_v2f16:
281 ; SI: ; %bb.0: ; %entry
282 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
283 ; SI-NEXT: s_mov_b32 s7, 0xf000
284 ; SI-NEXT: s_mov_b32 s6, -1
285 ; SI-NEXT: s_mov_b32 s10, s6
286 ; SI-NEXT: s_mov_b32 s11, s7
287 ; SI-NEXT: s_waitcnt lgkmcnt(0)
288 ; SI-NEXT: s_mov_b32 s8, s2
289 ; SI-NEXT: s_mov_b32 s9, s3
290 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
291 ; SI-NEXT: s_mov_b32 s4, s0
292 ; SI-NEXT: s_mov_b32 s5, s1
293 ; SI-NEXT: s_waitcnt vmcnt(0)
294 ; SI-NEXT: v_cvt_f32_i32_e32 v1, v1
295 ; SI-NEXT: v_cvt_f32_i32_e32 v0, v0
296 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
297 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
298 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
299 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
300 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
303 ; VI-LABEL: sitofp_v2i32_to_v2f16:
304 ; VI: ; %bb.0: ; %entry
305 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
306 ; VI-NEXT: s_mov_b32 s7, 0xf000
307 ; VI-NEXT: s_mov_b32 s6, -1
308 ; VI-NEXT: s_mov_b32 s10, s6
309 ; VI-NEXT: s_mov_b32 s11, s7
310 ; VI-NEXT: s_waitcnt lgkmcnt(0)
311 ; VI-NEXT: s_mov_b32 s8, s2
312 ; VI-NEXT: s_mov_b32 s9, s3
313 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
314 ; VI-NEXT: s_mov_b32 s4, s0
315 ; VI-NEXT: s_mov_b32 s5, s1
316 ; VI-NEXT: s_waitcnt vmcnt(0)
317 ; VI-NEXT: v_cvt_f32_i32_e32 v1, v1
318 ; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
319 ; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
320 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
321 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
322 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
325 ; GFX11-TRUE16-LABEL: sitofp_v2i32_to_v2f16:
326 ; GFX11-TRUE16: ; %bb.0: ; %entry
327 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
328 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
329 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
330 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
331 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
332 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
333 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
334 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
335 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
336 ; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
337 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
338 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
339 ; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
340 ; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
341 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
342 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
343 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
344 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
345 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
346 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
347 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
348 ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
349 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
350 ; GFX11-TRUE16-NEXT: s_endpgm
352 ; GFX11-FAKE16-LABEL: sitofp_v2i32_to_v2f16:
353 ; GFX11-FAKE16: ; %bb.0: ; %entry
354 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
355 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
356 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
357 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
358 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
359 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
360 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
361 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
362 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
363 ; GFX11-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
364 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
365 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
366 ; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
367 ; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
368 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
369 ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
370 ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
371 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
372 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
373 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
374 ; GFX11-FAKE16-NEXT: s_endpgm
376 ptr addrspace(1) %a) {
378 %a.val = load <2 x i32>, ptr addrspace(1) %a
379 %r.val = sitofp <2 x i32> %a.val to <2 x half>
380 store <2 x half> %r.val, ptr addrspace(1) %r
; i1 case: loads one float from %in0 and one from %in1, compares them with
; fcmp oge against 0.0 and 1.0 respectively, xors the two i1 results, converts
; that i1 to half with sitofp, and stores it to %out.
; Per the autogenerated checks below, no integer-to-float conversion
; instruction is expected: every run xors the two compare masks, selects
; between 0 and -1.0 with v_cndmask_b32, and narrows the selected f32 with
; v_cvt_f16_f32. The tahiti and fiji runs use 64-bit masks (vcc, s[0:1],
; s_xor_b64); the gfx1100 runs use 32-bit masks (vcc_lo, s0, s_xor_b32).
384 define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
385 ; SI-LABEL: s_sint_to_fp_i1_to_f16:
387 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
388 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
389 ; SI-NEXT: s_mov_b32 s3, 0xf000
390 ; SI-NEXT: s_mov_b32 s2, -1
391 ; SI-NEXT: s_mov_b32 s6, s2
392 ; SI-NEXT: s_mov_b32 s7, s3
393 ; SI-NEXT: s_waitcnt lgkmcnt(0)
394 ; SI-NEXT: s_mov_b32 s12, s10
395 ; SI-NEXT: s_mov_b32 s13, s11
396 ; SI-NEXT: s_mov_b32 s14, s2
397 ; SI-NEXT: s_mov_b32 s15, s3
398 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
399 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
400 ; SI-NEXT: s_waitcnt vmcnt(1)
401 ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0
402 ; SI-NEXT: s_waitcnt vmcnt(0)
403 ; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
404 ; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
405 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
406 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
407 ; SI-NEXT: s_mov_b32 s0, s8
408 ; SI-NEXT: s_mov_b32 s1, s9
409 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
412 ; VI-LABEL: s_sint_to_fp_i1_to_f16:
414 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
415 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
416 ; VI-NEXT: s_mov_b32 s3, 0xf000
417 ; VI-NEXT: s_mov_b32 s2, -1
418 ; VI-NEXT: s_mov_b32 s6, s2
419 ; VI-NEXT: s_mov_b32 s7, s3
420 ; VI-NEXT: s_waitcnt lgkmcnt(0)
421 ; VI-NEXT: s_mov_b32 s12, s10
422 ; VI-NEXT: s_mov_b32 s13, s11
423 ; VI-NEXT: s_mov_b32 s14, s2
424 ; VI-NEXT: s_mov_b32 s15, s3
425 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
426 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
427 ; VI-NEXT: s_waitcnt vmcnt(1)
428 ; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0
429 ; VI-NEXT: s_waitcnt vmcnt(0)
430 ; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
431 ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
432 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
433 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
434 ; VI-NEXT: s_mov_b32 s0, s8
435 ; VI-NEXT: s_mov_b32 s1, s9
436 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
439 ; GFX11-TRUE16-LABEL: s_sint_to_fp_i1_to_f16:
440 ; GFX11-TRUE16: ; %bb.0:
441 ; GFX11-TRUE16-NEXT: s_clause 0x1
442 ; GFX11-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
443 ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
444 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
445 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
446 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
447 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
448 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s6
449 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s7
450 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
451 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10
452 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11
453 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0
454 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
455 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8
456 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9
457 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
458 ; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
459 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
460 ; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
461 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
462 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo
463 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
464 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
465 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
466 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
467 ; GFX11-TRUE16-NEXT: s_endpgm
469 ; GFX11-FAKE16-LABEL: s_sint_to_fp_i1_to_f16:
470 ; GFX11-FAKE16: ; %bb.0:
471 ; GFX11-FAKE16-NEXT: s_clause 0x1
472 ; GFX11-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
473 ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
474 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
475 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
476 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
477 ; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
478 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s6
479 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s7
480 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
481 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s10
482 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s11
483 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0
484 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
485 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8
486 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9
487 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
488 ; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
489 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
490 ; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
491 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
492 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo
493 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
494 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
495 ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
496 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
497 ; GFX11-FAKE16-NEXT: s_endpgm
498 %a = load float, ptr addrspace(1) %in0
499 %b = load float, ptr addrspace(1) %in1
500 %acmp = fcmp oge float %a, 0.000000e+00
501 %bcmp = fcmp oge float %b, 1.000000e+00
502 %result = xor i1 %acmp, %bcmp
503 %fp = sitofp i1 %result to half
504 store half %fp, ptr addrspace(1) %out
508 ; v2f16 = sitofp v2i64 is in sint_to_fp.i64.ll