1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s
; Kernel under test: loads a half from %a, converts it to i16 with fptosi, and
; stores the i16 result through %r.
; The SI checks expect the conversion to go through f32 (v_cvt_f32_f16 followed
; by v_cvt_i32_f32) before a 16-bit store; the VI and GFX11 checks expect a
; single v_cvt_i16_f16.
6 define amdgpu_kernel void @fptosi_f16_to_i16(
7 ; SI-LABEL: fptosi_f16_to_i16:
8 ; SI: ; %bb.0: ; %entry
9 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
10 ; SI-NEXT: s_mov_b32 s7, 0xf000
11 ; SI-NEXT: s_mov_b32 s6, -1
12 ; SI-NEXT: s_mov_b32 s10, s6
13 ; SI-NEXT: s_mov_b32 s11, s7
14 ; SI-NEXT: s_waitcnt lgkmcnt(0)
15 ; SI-NEXT: s_mov_b32 s8, s2
16 ; SI-NEXT: s_mov_b32 s9, s3
17 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
18 ; SI-NEXT: s_mov_b32 s4, s0
19 ; SI-NEXT: s_mov_b32 s5, s1
20 ; SI-NEXT: s_waitcnt vmcnt(0)
21 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
22 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
23 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
26 ; VI-LABEL: fptosi_f16_to_i16:
27 ; VI: ; %bb.0: ; %entry
28 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
29 ; VI-NEXT: s_mov_b32 s7, 0xf000
30 ; VI-NEXT: s_mov_b32 s6, -1
31 ; VI-NEXT: s_mov_b32 s10, s6
32 ; VI-NEXT: s_mov_b32 s11, s7
33 ; VI-NEXT: s_waitcnt lgkmcnt(0)
34 ; VI-NEXT: s_mov_b32 s8, s2
35 ; VI-NEXT: s_mov_b32 s9, s3
36 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
37 ; VI-NEXT: s_mov_b32 s4, s0
38 ; VI-NEXT: s_mov_b32 s5, s1
39 ; VI-NEXT: s_waitcnt vmcnt(0)
40 ; VI-NEXT: v_cvt_i16_f16_e32 v0, v0
41 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
44 ; GFX11-LABEL: fptosi_f16_to_i16:
45 ; GFX11: ; %bb.0: ; %entry
46 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
47 ; GFX11-NEXT: s_mov_b32 s6, -1
48 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
49 ; GFX11-NEXT: s_mov_b32 s10, s6
50 ; GFX11-NEXT: s_mov_b32 s11, s7
51 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
52 ; GFX11-NEXT: s_mov_b32 s8, s2
53 ; GFX11-NEXT: s_mov_b32 s9, s3
54 ; GFX11-NEXT: s_mov_b32 s4, s0
55 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
56 ; GFX11-NEXT: s_mov_b32 s5, s1
57 ; GFX11-NEXT: s_waitcnt vmcnt(0)
58 ; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0
59 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
61 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
62 ; GFX11-NEXT: s_endpgm
64 ptr addrspace(1) %a) {
66 %a.val = load half, ptr addrspace(1) %a
67 %r.val = fptosi half %a.val to i16
68 store i16 %r.val, ptr addrspace(1) %r
; Kernel under test: loads a half from %a, converts it to i32 with fptosi, and
; stores the i32 result through %r.
; All three check prefixes expect the same two-step conversion: v_cvt_f32_f16
; to widen the input to f32, then v_cvt_i32_f32, followed by a 32-bit store.
72 define amdgpu_kernel void @fptosi_f16_to_i32(
73 ; SI-LABEL: fptosi_f16_to_i32:
74 ; SI: ; %bb.0: ; %entry
75 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
76 ; SI-NEXT: s_mov_b32 s7, 0xf000
77 ; SI-NEXT: s_mov_b32 s6, -1
78 ; SI-NEXT: s_mov_b32 s10, s6
79 ; SI-NEXT: s_mov_b32 s11, s7
80 ; SI-NEXT: s_waitcnt lgkmcnt(0)
81 ; SI-NEXT: s_mov_b32 s8, s2
82 ; SI-NEXT: s_mov_b32 s9, s3
83 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
84 ; SI-NEXT: s_mov_b32 s4, s0
85 ; SI-NEXT: s_mov_b32 s5, s1
86 ; SI-NEXT: s_waitcnt vmcnt(0)
87 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
88 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
89 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
92 ; VI-LABEL: fptosi_f16_to_i32:
93 ; VI: ; %bb.0: ; %entry
94 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
95 ; VI-NEXT: s_mov_b32 s7, 0xf000
96 ; VI-NEXT: s_mov_b32 s6, -1
97 ; VI-NEXT: s_mov_b32 s10, s6
98 ; VI-NEXT: s_mov_b32 s11, s7
99 ; VI-NEXT: s_waitcnt lgkmcnt(0)
100 ; VI-NEXT: s_mov_b32 s8, s2
101 ; VI-NEXT: s_mov_b32 s9, s3
102 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
103 ; VI-NEXT: s_mov_b32 s4, s0
104 ; VI-NEXT: s_mov_b32 s5, s1
105 ; VI-NEXT: s_waitcnt vmcnt(0)
106 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
107 ; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
108 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
111 ; GFX11-LABEL: fptosi_f16_to_i32:
112 ; GFX11: ; %bb.0: ; %entry
113 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
114 ; GFX11-NEXT: s_mov_b32 s6, -1
115 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
116 ; GFX11-NEXT: s_mov_b32 s10, s6
117 ; GFX11-NEXT: s_mov_b32 s11, s7
118 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
119 ; GFX11-NEXT: s_mov_b32 s8, s2
120 ; GFX11-NEXT: s_mov_b32 s9, s3
121 ; GFX11-NEXT: s_mov_b32 s4, s0
122 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
123 ; GFX11-NEXT: s_mov_b32 s5, s1
124 ; GFX11-NEXT: s_waitcnt vmcnt(0)
125 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
126 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
127 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
128 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
129 ; GFX11-NEXT: s_nop 0
130 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
131 ; GFX11-NEXT: s_endpgm
133 ptr addrspace(1) %a) {
135 %a.val = load half, ptr addrspace(1) %a
136 %r.val = fptosi half %a.val to i32
137 store i32 %r.val, ptr addrspace(1) %r
141 ; Need to make sure we promote f16 to f32 when converting f16 to i64. The
142 ; existing checks cover the code generated for 'i64 = fp_to_sint f32'.
; Kernel under test: loads a half from %a, converts it to i64 with fptosi, and
; stores the i64 result through %r.
; All three check prefixes expect v_cvt_f32_f16 and v_cvt_i32_f32 to produce the
; low dword, then v_ashrrev_i32 by 31 to fill the high dword with the sign, and
; a single 64-bit store of the register pair v[0:1].
144 define amdgpu_kernel void @fptosi_f16_to_i64(
145 ; SI-LABEL: fptosi_f16_to_i64:
146 ; SI: ; %bb.0: ; %entry
147 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
148 ; SI-NEXT: s_mov_b32 s7, 0xf000
149 ; SI-NEXT: s_mov_b32 s6, -1
150 ; SI-NEXT: s_mov_b32 s10, s6
151 ; SI-NEXT: s_mov_b32 s11, s7
152 ; SI-NEXT: s_waitcnt lgkmcnt(0)
153 ; SI-NEXT: s_mov_b32 s8, s2
154 ; SI-NEXT: s_mov_b32 s9, s3
155 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
156 ; SI-NEXT: s_mov_b32 s4, s0
157 ; SI-NEXT: s_mov_b32 s5, s1
158 ; SI-NEXT: s_waitcnt vmcnt(0)
159 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
160 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
161 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
162 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
165 ; VI-LABEL: fptosi_f16_to_i64:
166 ; VI: ; %bb.0: ; %entry
167 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
168 ; VI-NEXT: s_mov_b32 s7, 0xf000
169 ; VI-NEXT: s_mov_b32 s6, -1
170 ; VI-NEXT: s_mov_b32 s10, s6
171 ; VI-NEXT: s_mov_b32 s11, s7
172 ; VI-NEXT: s_waitcnt lgkmcnt(0)
173 ; VI-NEXT: s_mov_b32 s8, s2
174 ; VI-NEXT: s_mov_b32 s9, s3
175 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
176 ; VI-NEXT: s_mov_b32 s4, s0
177 ; VI-NEXT: s_mov_b32 s5, s1
178 ; VI-NEXT: s_waitcnt vmcnt(0)
179 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
180 ; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
181 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
182 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
185 ; GFX11-LABEL: fptosi_f16_to_i64:
186 ; GFX11: ; %bb.0: ; %entry
187 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
188 ; GFX11-NEXT: s_mov_b32 s6, -1
189 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
190 ; GFX11-NEXT: s_mov_b32 s10, s6
191 ; GFX11-NEXT: s_mov_b32 s11, s7
192 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
193 ; GFX11-NEXT: s_mov_b32 s8, s2
194 ; GFX11-NEXT: s_mov_b32 s9, s3
195 ; GFX11-NEXT: s_mov_b32 s4, s0
196 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
197 ; GFX11-NEXT: s_mov_b32 s5, s1
198 ; GFX11-NEXT: s_waitcnt vmcnt(0)
199 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
200 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
201 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
202 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
203 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
204 ; GFX11-NEXT: s_nop 0
205 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
206 ; GFX11-NEXT: s_endpgm
208 ptr addrspace(1) %a) {
210 %a.val = load half, ptr addrspace(1) %a
211 %r.val = fptosi half %a.val to i64
212 store i64 %r.val, ptr addrspace(1) %r
; Kernel under test: loads a <2 x half> from %a, converts it to <2 x i16> with
; fptosi, and stores the packed result through %r.
; Expected lowering per check prefix:
;  - SI checks: shift out the high half, widen both halves to f32, convert each
;    to i32, then repack with and/lshl/or into one dword.
;  - VI checks: v_cvt_i16_f16 for the low half and an SDWA form for the high
;    half (WORD_1 in, WORD_1 out), merged with v_or_b32_sdwa.
;  - GFX11 checks: shift out the high half, two v_cvt_i16_f16, then mask the low
;    result and merge with v_lshl_or_b32.
216 define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
217 ; SI-LABEL: fptosi_v2f16_to_v2i16:
218 ; SI: ; %bb.0: ; %entry
219 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
220 ; SI-NEXT: s_mov_b32 s7, 0xf000
221 ; SI-NEXT: s_mov_b32 s6, -1
222 ; SI-NEXT: s_mov_b32 s10, s6
223 ; SI-NEXT: s_mov_b32 s11, s7
224 ; SI-NEXT: s_waitcnt lgkmcnt(0)
225 ; SI-NEXT: s_mov_b32 s8, s2
226 ; SI-NEXT: s_mov_b32 s9, s3
227 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
228 ; SI-NEXT: s_mov_b32 s4, s0
229 ; SI-NEXT: s_mov_b32 s5, s1
230 ; SI-NEXT: s_waitcnt vmcnt(0)
231 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
232 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
233 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
234 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
235 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
236 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1
237 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
238 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
239 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
242 ; VI-LABEL: fptosi_v2f16_to_v2i16:
243 ; VI: ; %bb.0: ; %entry
244 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
245 ; VI-NEXT: s_mov_b32 s7, 0xf000
246 ; VI-NEXT: s_mov_b32 s6, -1
247 ; VI-NEXT: s_mov_b32 s10, s6
248 ; VI-NEXT: s_mov_b32 s11, s7
249 ; VI-NEXT: s_waitcnt lgkmcnt(0)
250 ; VI-NEXT: s_mov_b32 s8, s2
251 ; VI-NEXT: s_mov_b32 s9, s3
252 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
253 ; VI-NEXT: s_mov_b32 s4, s0
254 ; VI-NEXT: s_mov_b32 s5, s1
255 ; VI-NEXT: s_waitcnt vmcnt(0)
256 ; VI-NEXT: v_cvt_i16_f16_e32 v1, v0
257 ; VI-NEXT: v_cvt_i16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
258 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
259 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
262 ; GFX11-LABEL: fptosi_v2f16_to_v2i16:
263 ; GFX11: ; %bb.0: ; %entry
264 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
265 ; GFX11-NEXT: s_mov_b32 s6, -1
266 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
267 ; GFX11-NEXT: s_mov_b32 s10, s6
268 ; GFX11-NEXT: s_mov_b32 s11, s7
269 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
270 ; GFX11-NEXT: s_mov_b32 s8, s2
271 ; GFX11-NEXT: s_mov_b32 s9, s3
272 ; GFX11-NEXT: s_mov_b32 s4, s0
273 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
274 ; GFX11-NEXT: s_mov_b32 s5, s1
275 ; GFX11-NEXT: s_waitcnt vmcnt(0)
276 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
277 ; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0
278 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
279 ; GFX11-NEXT: v_cvt_i16_f16_e32 v1, v1
280 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
281 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
282 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
283 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
284 ; GFX11-NEXT: s_nop 0
285 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
286 ; GFX11-NEXT: s_endpgm
288 ptr addrspace(1) %a) {
290 %a.val = load <2 x half>, ptr addrspace(1) %a
291 %r.val = fptosi <2 x half> %a.val to <2 x i16>
292 store <2 x i16> %r.val, ptr addrspace(1) %r
; Kernel under test: loads a <2 x half> from %a, converts it to <2 x i32> with
; fptosi, and stores the result through %r.
; Every check prefix expects each element to be widened with v_cvt_f32_f16 and
; then converted with v_cvt_i32_f32. The SI and GFX11 checks extract the high
; half with a 16-bit right shift first; the VI checks read it directly via an
; SDWA source select (src0_sel:WORD_1). The result is stored as one 64-bit
; store of v[0:1].
296 define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
297 ; SI-LABEL: fptosi_v2f16_to_v2i32:
298 ; SI: ; %bb.0: ; %entry
299 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
300 ; SI-NEXT: s_mov_b32 s7, 0xf000
301 ; SI-NEXT: s_mov_b32 s6, -1
302 ; SI-NEXT: s_mov_b32 s10, s6
303 ; SI-NEXT: s_mov_b32 s11, s7
304 ; SI-NEXT: s_waitcnt lgkmcnt(0)
305 ; SI-NEXT: s_mov_b32 s8, s2
306 ; SI-NEXT: s_mov_b32 s9, s3
307 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
308 ; SI-NEXT: s_mov_b32 s4, s0
309 ; SI-NEXT: s_mov_b32 s5, s1
310 ; SI-NEXT: s_waitcnt vmcnt(0)
311 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
312 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
313 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
314 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
315 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1
316 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
319 ; VI-LABEL: fptosi_v2f16_to_v2i32:
320 ; VI: ; %bb.0: ; %entry
321 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
322 ; VI-NEXT: s_mov_b32 s7, 0xf000
323 ; VI-NEXT: s_mov_b32 s6, -1
324 ; VI-NEXT: s_mov_b32 s10, s6
325 ; VI-NEXT: s_mov_b32 s11, s7
326 ; VI-NEXT: s_waitcnt lgkmcnt(0)
327 ; VI-NEXT: s_mov_b32 s8, s2
328 ; VI-NEXT: s_mov_b32 s9, s3
329 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
330 ; VI-NEXT: s_mov_b32 s4, s0
331 ; VI-NEXT: s_mov_b32 s5, s1
332 ; VI-NEXT: s_waitcnt vmcnt(0)
333 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
334 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
335 ; VI-NEXT: v_cvt_i32_f32_e32 v0, v1
336 ; VI-NEXT: v_cvt_i32_f32_e32 v1, v2
337 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
340 ; GFX11-LABEL: fptosi_v2f16_to_v2i32:
341 ; GFX11: ; %bb.0: ; %entry
342 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
343 ; GFX11-NEXT: s_mov_b32 s6, -1
344 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
345 ; GFX11-NEXT: s_mov_b32 s10, s6
346 ; GFX11-NEXT: s_mov_b32 s11, s7
347 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
348 ; GFX11-NEXT: s_mov_b32 s8, s2
349 ; GFX11-NEXT: s_mov_b32 s9, s3
350 ; GFX11-NEXT: s_mov_b32 s4, s0
351 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
352 ; GFX11-NEXT: s_mov_b32 s5, s1
353 ; GFX11-NEXT: s_waitcnt vmcnt(0)
354 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
355 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
356 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
357 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
358 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
359 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
360 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
361 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
362 ; GFX11-NEXT: s_nop 0
363 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
364 ; GFX11-NEXT: s_endpgm
366 ptr addrspace(1) %a) {
368 %a.val = load <2 x half>, ptr addrspace(1) %a
369 %r.val = fptosi <2 x half> %a.val to <2 x i32>
370 store <2 x i32> %r.val, ptr addrspace(1) %r
374 ; Need to make sure we promote each f16 element to f32 when converting v2f16
375 ; to v2i64. The existing checks cover the code generated for
; 'i64 = fp_to_sint f32' on each element.
; Kernel under test: loads a <2 x half> from %a, converts it to <2 x i64> with
; fptosi, and stores the result through %r.
; Every check prefix expects each element to be widened with v_cvt_f32_f16,
; converted with v_cvt_i32_f32 into the low dword (v0 and v2), and sign-filled
; into the high dword (v1 and v3) with v_ashrrev_i32 by 31. The four dwords
; v[0:3] are then written with a single 128-bit store.
377 define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
378 ; SI-LABEL: fptosi_v2f16_to_v2i64:
379 ; SI: ; %bb.0: ; %entry
380 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
381 ; SI-NEXT: s_mov_b32 s7, 0xf000
382 ; SI-NEXT: s_mov_b32 s6, -1
383 ; SI-NEXT: s_mov_b32 s10, s6
384 ; SI-NEXT: s_mov_b32 s11, s7
385 ; SI-NEXT: s_waitcnt lgkmcnt(0)
386 ; SI-NEXT: s_mov_b32 s8, s2
387 ; SI-NEXT: s_mov_b32 s9, s3
388 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
389 ; SI-NEXT: s_mov_b32 s4, s0
390 ; SI-NEXT: s_mov_b32 s5, s1
391 ; SI-NEXT: s_waitcnt vmcnt(0)
392 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
393 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
394 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
395 ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
396 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
397 ; SI-NEXT: v_cvt_i32_f32_e32 v2, v2
398 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
399 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
402 ; VI-LABEL: fptosi_v2f16_to_v2i64:
403 ; VI: ; %bb.0: ; %entry
404 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
405 ; VI-NEXT: s_mov_b32 s7, 0xf000
406 ; VI-NEXT: s_mov_b32 s6, -1
407 ; VI-NEXT: s_mov_b32 s10, s6
408 ; VI-NEXT: s_mov_b32 s11, s7
409 ; VI-NEXT: s_waitcnt lgkmcnt(0)
410 ; VI-NEXT: s_mov_b32 s8, s2
411 ; VI-NEXT: s_mov_b32 s9, s3
412 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
413 ; VI-NEXT: s_mov_b32 s4, s0
414 ; VI-NEXT: s_mov_b32 s5, s1
415 ; VI-NEXT: s_waitcnt vmcnt(0)
416 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
417 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
418 ; VI-NEXT: v_cvt_i32_f32_e32 v0, v1
419 ; VI-NEXT: v_cvt_i32_f32_e32 v2, v2
420 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
421 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
422 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
425 ; GFX11-LABEL: fptosi_v2f16_to_v2i64:
426 ; GFX11: ; %bb.0: ; %entry
427 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
428 ; GFX11-NEXT: s_mov_b32 s6, -1
429 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
430 ; GFX11-NEXT: s_mov_b32 s10, s6
431 ; GFX11-NEXT: s_mov_b32 s11, s7
432 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
433 ; GFX11-NEXT: s_mov_b32 s8, s2
434 ; GFX11-NEXT: s_mov_b32 s9, s3
435 ; GFX11-NEXT: s_mov_b32 s4, s0
436 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
437 ; GFX11-NEXT: s_mov_b32 s5, s1
438 ; GFX11-NEXT: s_waitcnt vmcnt(0)
439 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
440 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
441 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
442 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
443 ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
444 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
445 ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v1
446 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
447 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
448 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
449 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
450 ; GFX11-NEXT: s_nop 0
451 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
452 ; GFX11-NEXT: s_endpgm
454 ptr addrspace(1) %a) {
456 %a.val = load <2 x half>, ptr addrspace(1) %a
457 %r.val = fptosi <2 x half> %a.val to <2 x i64>
458 store <2 x i64> %r.val, ptr addrspace(1) %r
; Kernel under test: converts the half argument %in to i1 with fptosi and
; stores the i1 result through %out. Unlike the kernels above, the input is a
; kernel argument rather than a value loaded from memory.
; Every check prefix expects the conversion to become an equality compare
; followed by a 0/1 select (v_cndmask_b32) and a byte store:
;  - SI checks: widen to f32 with v_cvt_f32_f16, then compare against -1.0.
;  - VI and GFX11 checks: compare the f16 directly against 0xbc00 (presumably
;    the f16 bit pattern of -1.0, matching the SI compare - confirm).
462 define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
463 ; SI-LABEL: fptosi_f16_to_i1:
464 ; SI: ; %bb.0: ; %entry
465 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
466 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
467 ; SI-NEXT: s_mov_b32 s3, 0xf000
468 ; SI-NEXT: s_mov_b32 s2, -1
469 ; SI-NEXT: s_waitcnt lgkmcnt(0)
470 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
471 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, -1.0, v0
472 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
473 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
476 ; VI-LABEL: fptosi_f16_to_i1:
477 ; VI: ; %bb.0: ; %entry
478 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
479 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
480 ; VI-NEXT: s_mov_b32 s3, 0xf000
481 ; VI-NEXT: s_mov_b32 s2, -1
482 ; VI-NEXT: s_waitcnt lgkmcnt(0)
483 ; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], 0xbc00, s4
484 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
485 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
488 ; GFX11-LABEL: fptosi_f16_to_i1:
489 ; GFX11: ; %bb.0: ; %entry
490 ; GFX11-NEXT: s_clause 0x1
491 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
492 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
493 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
494 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
495 ; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0xbc00, s2
496 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
497 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
498 ; GFX11-NEXT: s_mov_b32 s2, -1
499 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
500 ; GFX11-NEXT: s_nop 0
501 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
502 ; GFX11-NEXT: s_endpgm
504 %conv = fptosi half %in to i1
505 store i1 %conv, ptr addrspace(1) %out