1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s
; Scalar case: load a half from %a, fptoui it to i16, and store the result to %r.
; Per the checks below, the SI run is expected to convert through f32
; (v_cvt_f32_f16 followed by v_cvt_u32_f32), while the VI and GFX11 runs are
; expected to use a single v_cvt_u16_f16. Every run ends in a 16-bit store.
6 define amdgpu_kernel void @fptoui_f16_to_i16(
7 ; SI-LABEL: fptoui_f16_to_i16:
8 ; SI: ; %bb.0: ; %entry
9 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
10 ; SI-NEXT: s_mov_b32 s7, 0xf000
11 ; SI-NEXT: s_mov_b32 s6, -1
12 ; SI-NEXT: s_mov_b32 s10, s6
13 ; SI-NEXT: s_mov_b32 s11, s7
14 ; SI-NEXT: s_waitcnt lgkmcnt(0)
15 ; SI-NEXT: s_mov_b32 s8, s2
16 ; SI-NEXT: s_mov_b32 s9, s3
17 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
18 ; SI-NEXT: s_mov_b32 s4, s0
19 ; SI-NEXT: s_mov_b32 s5, s1
20 ; SI-NEXT: s_waitcnt vmcnt(0)
21 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
22 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
23 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
26 ; VI-LABEL: fptoui_f16_to_i16:
27 ; VI: ; %bb.0: ; %entry
28 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
29 ; VI-NEXT: s_mov_b32 s7, 0xf000
30 ; VI-NEXT: s_mov_b32 s6, -1
31 ; VI-NEXT: s_mov_b32 s10, s6
32 ; VI-NEXT: s_mov_b32 s11, s7
33 ; VI-NEXT: s_waitcnt lgkmcnt(0)
34 ; VI-NEXT: s_mov_b32 s8, s2
35 ; VI-NEXT: s_mov_b32 s9, s3
36 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
37 ; VI-NEXT: s_mov_b32 s4, s0
38 ; VI-NEXT: s_mov_b32 s5, s1
39 ; VI-NEXT: s_waitcnt vmcnt(0)
40 ; VI-NEXT: v_cvt_u16_f16_e32 v0, v0
41 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
44 ; GFX11-LABEL: fptoui_f16_to_i16:
45 ; GFX11: ; %bb.0: ; %entry
46 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
47 ; GFX11-NEXT: s_mov_b32 s6, -1
48 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
49 ; GFX11-NEXT: s_mov_b32 s10, s6
50 ; GFX11-NEXT: s_mov_b32 s11, s7
51 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
52 ; GFX11-NEXT: s_mov_b32 s8, s2
53 ; GFX11-NEXT: s_mov_b32 s9, s3
54 ; GFX11-NEXT: s_mov_b32 s4, s0
55 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
56 ; GFX11-NEXT: s_mov_b32 s5, s1
57 ; GFX11-NEXT: s_waitcnt vmcnt(0)
58 ; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0
59 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
61 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
62 ; GFX11-NEXT: s_endpgm
64 ptr addrspace(1) %a) {
66 %a.val = load half, ptr addrspace(1) %a
67 %r.val = fptoui half %a.val to i16
68 store i16 %r.val, ptr addrspace(1) %r
; Scalar case: load a half from %a, fptoui it to i32, and store the result to %r.
; All three check prefixes below expect the value to be converted through f32
; (v_cvt_f32_f16 followed by v_cvt_u32_f32) and then written with a 32-bit store.
72 define amdgpu_kernel void @fptoui_f16_to_i32(
73 ; SI-LABEL: fptoui_f16_to_i32:
74 ; SI: ; %bb.0: ; %entry
75 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
76 ; SI-NEXT: s_mov_b32 s7, 0xf000
77 ; SI-NEXT: s_mov_b32 s6, -1
78 ; SI-NEXT: s_mov_b32 s10, s6
79 ; SI-NEXT: s_mov_b32 s11, s7
80 ; SI-NEXT: s_waitcnt lgkmcnt(0)
81 ; SI-NEXT: s_mov_b32 s8, s2
82 ; SI-NEXT: s_mov_b32 s9, s3
83 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
84 ; SI-NEXT: s_mov_b32 s4, s0
85 ; SI-NEXT: s_mov_b32 s5, s1
86 ; SI-NEXT: s_waitcnt vmcnt(0)
87 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
88 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
89 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
92 ; VI-LABEL: fptoui_f16_to_i32:
93 ; VI: ; %bb.0: ; %entry
94 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
95 ; VI-NEXT: s_mov_b32 s7, 0xf000
96 ; VI-NEXT: s_mov_b32 s6, -1
97 ; VI-NEXT: s_mov_b32 s10, s6
98 ; VI-NEXT: s_mov_b32 s11, s7
99 ; VI-NEXT: s_waitcnt lgkmcnt(0)
100 ; VI-NEXT: s_mov_b32 s8, s2
101 ; VI-NEXT: s_mov_b32 s9, s3
102 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
103 ; VI-NEXT: s_mov_b32 s4, s0
104 ; VI-NEXT: s_mov_b32 s5, s1
105 ; VI-NEXT: s_waitcnt vmcnt(0)
106 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
107 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
108 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
111 ; GFX11-LABEL: fptoui_f16_to_i32:
112 ; GFX11: ; %bb.0: ; %entry
113 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
114 ; GFX11-NEXT: s_mov_b32 s6, -1
115 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
116 ; GFX11-NEXT: s_mov_b32 s10, s6
117 ; GFX11-NEXT: s_mov_b32 s11, s7
118 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
119 ; GFX11-NEXT: s_mov_b32 s8, s2
120 ; GFX11-NEXT: s_mov_b32 s9, s3
121 ; GFX11-NEXT: s_mov_b32 s4, s0
122 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
123 ; GFX11-NEXT: s_mov_b32 s5, s1
124 ; GFX11-NEXT: s_waitcnt vmcnt(0)
125 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
126 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
127 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
128 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
129 ; GFX11-NEXT: s_nop 0
130 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
131 ; GFX11-NEXT: s_endpgm
133 ptr addrspace(1) %a) {
135 %a.val = load half, ptr addrspace(1) %a
136 %r.val = fptoui half %a.val to i32
137 store i32 %r.val, ptr addrspace(1) %r
141 ; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
142 ; test checks code generated for 'i64 = fp_to_uint f32'.
; Scalar case: load a half from %a, fptoui it to i64, and store the result to %r.
; In the checks below the low dword (v0) is produced by v_cvt_f32_f16 followed
; by v_cvt_u32_f32, the high dword (v1) is set to 0 with v_mov_b32, and the
; v[0:1] pair is written with a single 64-bit store.
144 define amdgpu_kernel void @fptoui_f16_to_i64(
145 ; SI-LABEL: fptoui_f16_to_i64:
146 ; SI: ; %bb.0: ; %entry
147 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
148 ; SI-NEXT: s_mov_b32 s7, 0xf000
149 ; SI-NEXT: s_mov_b32 s6, -1
150 ; SI-NEXT: s_mov_b32 s10, s6
151 ; SI-NEXT: s_mov_b32 s11, s7
152 ; SI-NEXT: s_waitcnt lgkmcnt(0)
153 ; SI-NEXT: s_mov_b32 s8, s2
154 ; SI-NEXT: s_mov_b32 s9, s3
155 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
156 ; SI-NEXT: s_mov_b32 s4, s0
157 ; SI-NEXT: s_mov_b32 s5, s1
158 ; SI-NEXT: v_mov_b32_e32 v1, 0
159 ; SI-NEXT: s_waitcnt vmcnt(0)
160 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
161 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
162 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
165 ; VI-LABEL: fptoui_f16_to_i64:
166 ; VI: ; %bb.0: ; %entry
167 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
168 ; VI-NEXT: s_mov_b32 s7, 0xf000
169 ; VI-NEXT: s_mov_b32 s6, -1
170 ; VI-NEXT: s_mov_b32 s10, s6
171 ; VI-NEXT: s_mov_b32 s11, s7
172 ; VI-NEXT: s_waitcnt lgkmcnt(0)
173 ; VI-NEXT: s_mov_b32 s8, s2
174 ; VI-NEXT: s_mov_b32 s9, s3
175 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
176 ; VI-NEXT: s_mov_b32 s4, s0
177 ; VI-NEXT: s_mov_b32 s5, s1
178 ; VI-NEXT: v_mov_b32_e32 v1, 0
179 ; VI-NEXT: s_waitcnt vmcnt(0)
180 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
181 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
182 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
185 ; GFX11-LABEL: fptoui_f16_to_i64:
186 ; GFX11: ; %bb.0: ; %entry
187 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
188 ; GFX11-NEXT: s_mov_b32 s6, -1
189 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
190 ; GFX11-NEXT: s_mov_b32 s10, s6
191 ; GFX11-NEXT: s_mov_b32 s11, s7
192 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
193 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
194 ; GFX11-NEXT: s_mov_b32 s8, s2
195 ; GFX11-NEXT: s_mov_b32 s9, s3
196 ; GFX11-NEXT: s_mov_b32 s4, s0
197 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
198 ; GFX11-NEXT: s_mov_b32 s5, s1
199 ; GFX11-NEXT: s_waitcnt vmcnt(0)
200 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
201 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
202 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
203 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
204 ; GFX11-NEXT: s_nop 0
205 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
206 ; GFX11-NEXT: s_endpgm
208 ptr addrspace(1) %a) {
210 %a.val = load half, ptr addrspace(1) %a
211 %r.val = fptoui half %a.val to i64
212 store i64 %r.val, ptr addrspace(1) %r
; Vector case: load <2 x half> from %a as one dword, fptoui it to <2 x i16>,
; and store the packed result to %r as one dword.
; Per the checks below:
;  - the SI run shifts out the high half, converts both halves through f32
;    (v_cvt_f32_f16 + v_cvt_u32_f32), then repacks with a shift-left and an OR;
;  - the VI run uses v_cvt_u16_f16 for the low half and an SDWA form for the
;    high half (src0_sel WORD_1 -> dst_sel WORD_1), combined with v_or_b32_sdwa;
;  - the GFX11 run shifts out the high half, uses v_cvt_u16_f16 on each half,
;    masks the low result to 16 bits, and packs with v_lshl_or_b32.
216 define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
217 ; SI-LABEL: fptoui_v2f16_to_v2i16:
218 ; SI: ; %bb.0: ; %entry
219 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
220 ; SI-NEXT: s_mov_b32 s7, 0xf000
221 ; SI-NEXT: s_mov_b32 s6, -1
222 ; SI-NEXT: s_mov_b32 s10, s6
223 ; SI-NEXT: s_mov_b32 s11, s7
224 ; SI-NEXT: s_waitcnt lgkmcnt(0)
225 ; SI-NEXT: s_mov_b32 s8, s2
226 ; SI-NEXT: s_mov_b32 s9, s3
227 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
228 ; SI-NEXT: s_mov_b32 s4, s0
229 ; SI-NEXT: s_mov_b32 s5, s1
230 ; SI-NEXT: s_waitcnt vmcnt(0)
231 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
232 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
233 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
234 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v1
235 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
236 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
237 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
238 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
241 ; VI-LABEL: fptoui_v2f16_to_v2i16:
242 ; VI: ; %bb.0: ; %entry
243 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
244 ; VI-NEXT: s_mov_b32 s7, 0xf000
245 ; VI-NEXT: s_mov_b32 s6, -1
246 ; VI-NEXT: s_mov_b32 s10, s6
247 ; VI-NEXT: s_mov_b32 s11, s7
248 ; VI-NEXT: s_waitcnt lgkmcnt(0)
249 ; VI-NEXT: s_mov_b32 s8, s2
250 ; VI-NEXT: s_mov_b32 s9, s3
251 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
252 ; VI-NEXT: s_mov_b32 s4, s0
253 ; VI-NEXT: s_mov_b32 s5, s1
254 ; VI-NEXT: s_waitcnt vmcnt(0)
255 ; VI-NEXT: v_cvt_u16_f16_e32 v1, v0
256 ; VI-NEXT: v_cvt_u16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
257 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
258 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
261 ; GFX11-LABEL: fptoui_v2f16_to_v2i16:
262 ; GFX11: ; %bb.0: ; %entry
263 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
264 ; GFX11-NEXT: s_mov_b32 s6, -1
265 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
266 ; GFX11-NEXT: s_mov_b32 s10, s6
267 ; GFX11-NEXT: s_mov_b32 s11, s7
268 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
269 ; GFX11-NEXT: s_mov_b32 s8, s2
270 ; GFX11-NEXT: s_mov_b32 s9, s3
271 ; GFX11-NEXT: s_mov_b32 s4, s0
272 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
273 ; GFX11-NEXT: s_mov_b32 s5, s1
274 ; GFX11-NEXT: s_waitcnt vmcnt(0)
275 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
276 ; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0
277 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
278 ; GFX11-NEXT: v_cvt_u16_f16_e32 v1, v1
279 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
280 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
281 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
282 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
283 ; GFX11-NEXT: s_nop 0
284 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
285 ; GFX11-NEXT: s_endpgm
287 ptr addrspace(1) %a) {
289 %a.val = load <2 x half>, ptr addrspace(1) %a
290 %r.val = fptoui <2 x half> %a.val to <2 x i32>
291 store <2 x i16> %r.val, ptr addrspace(1) %r
; Vector case: load <2 x half> from %a as one dword, fptoui it to <2 x i32>,
; and store the two results to %r with a single 64-bit store of v[0:1].
; Every check prefix below expects each element to be converted through f32
; (v_cvt_f32_f16 followed by v_cvt_u32_f32). The SI and GFX11 runs extract the
; high half with a 16-bit right shift; the VI run reads it via SDWA
; src0_sel WORD_1 instead.
295 define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
296 ; SI-LABEL: fptoui_v2f16_to_v2i32:
297 ; SI: ; %bb.0: ; %entry
298 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
299 ; SI-NEXT: s_mov_b32 s7, 0xf000
300 ; SI-NEXT: s_mov_b32 s6, -1
301 ; SI-NEXT: s_mov_b32 s10, s6
302 ; SI-NEXT: s_mov_b32 s11, s7
303 ; SI-NEXT: s_waitcnt lgkmcnt(0)
304 ; SI-NEXT: s_mov_b32 s8, s2
305 ; SI-NEXT: s_mov_b32 s9, s3
306 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
307 ; SI-NEXT: s_mov_b32 s4, s0
308 ; SI-NEXT: s_mov_b32 s5, s1
309 ; SI-NEXT: s_waitcnt vmcnt(0)
310 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
311 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
312 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
313 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
314 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v1
315 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
318 ; VI-LABEL: fptoui_v2f16_to_v2i32:
319 ; VI: ; %bb.0: ; %entry
320 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
321 ; VI-NEXT: s_mov_b32 s7, 0xf000
322 ; VI-NEXT: s_mov_b32 s6, -1
323 ; VI-NEXT: s_mov_b32 s10, s6
324 ; VI-NEXT: s_mov_b32 s11, s7
325 ; VI-NEXT: s_waitcnt lgkmcnt(0)
326 ; VI-NEXT: s_mov_b32 s8, s2
327 ; VI-NEXT: s_mov_b32 s9, s3
328 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
329 ; VI-NEXT: s_mov_b32 s4, s0
330 ; VI-NEXT: s_mov_b32 s5, s1
331 ; VI-NEXT: s_waitcnt vmcnt(0)
332 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
333 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
334 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v1
335 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v2
336 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
339 ; GFX11-LABEL: fptoui_v2f16_to_v2i32:
340 ; GFX11: ; %bb.0: ; %entry
341 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
342 ; GFX11-NEXT: s_mov_b32 s6, -1
343 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
344 ; GFX11-NEXT: s_mov_b32 s10, s6
345 ; GFX11-NEXT: s_mov_b32 s11, s7
346 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX11-NEXT: s_mov_b32 s8, s2
348 ; GFX11-NEXT: s_mov_b32 s9, s3
349 ; GFX11-NEXT: s_mov_b32 s4, s0
350 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
351 ; GFX11-NEXT: s_mov_b32 s5, s1
352 ; GFX11-NEXT: s_waitcnt vmcnt(0)
353 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
354 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
355 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
356 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
357 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
358 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
359 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
360 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
361 ; GFX11-NEXT: s_nop 0
362 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
363 ; GFX11-NEXT: s_endpgm
365 ptr addrspace(1) %a) {
367 %a.val = load <2 x half>, ptr addrspace(1) %a
368 %r.val = fptoui <2 x half> %a.val to <2 x i32>
369 store <2 x i32> %r.val, ptr addrspace(1) %r
373 ; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
374 ; test checks code generated for 'i64 = fp_to_uint f32'.
; Vector case: load <2 x half> from %a as one dword, fptoui it to <2 x i64>,
; and store the result to %r with a single 128-bit store of v[0:3].
; In the checks below each element is converted through f32 (v_cvt_f32_f16
; followed by v_cvt_u32_f32) into the low dwords v0 and v2, while the high
; dwords v1 and v3 are set to 0 with v_mov_b32.
376 define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
377 ; SI-LABEL: fptoui_v2f16_to_v2i64:
378 ; SI: ; %bb.0: ; %entry
379 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
380 ; SI-NEXT: s_mov_b32 s7, 0xf000
381 ; SI-NEXT: s_mov_b32 s6, -1
382 ; SI-NEXT: s_mov_b32 s10, s6
383 ; SI-NEXT: s_mov_b32 s11, s7
384 ; SI-NEXT: s_waitcnt lgkmcnt(0)
385 ; SI-NEXT: s_mov_b32 s8, s2
386 ; SI-NEXT: s_mov_b32 s9, s3
387 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
388 ; SI-NEXT: s_mov_b32 s4, s0
389 ; SI-NEXT: s_mov_b32 s5, s1
390 ; SI-NEXT: v_mov_b32_e32 v3, 0
391 ; SI-NEXT: s_waitcnt vmcnt(0)
392 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
393 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
394 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
395 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
396 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v1
397 ; SI-NEXT: v_mov_b32_e32 v1, 0
398 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
401 ; VI-LABEL: fptoui_v2f16_to_v2i64:
402 ; VI: ; %bb.0: ; %entry
403 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
404 ; VI-NEXT: s_mov_b32 s7, 0xf000
405 ; VI-NEXT: s_mov_b32 s6, -1
406 ; VI-NEXT: s_mov_b32 s10, s6
407 ; VI-NEXT: s_mov_b32 s11, s7
408 ; VI-NEXT: s_waitcnt lgkmcnt(0)
409 ; VI-NEXT: s_mov_b32 s8, s2
410 ; VI-NEXT: s_mov_b32 s9, s3
411 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
412 ; VI-NEXT: s_mov_b32 s4, s0
413 ; VI-NEXT: s_mov_b32 s5, s1
414 ; VI-NEXT: v_mov_b32_e32 v3, 0
415 ; VI-NEXT: s_waitcnt vmcnt(0)
416 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
417 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
418 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v1
419 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
420 ; VI-NEXT: v_mov_b32_e32 v1, 0
421 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
424 ; GFX11-LABEL: fptoui_v2f16_to_v2i64:
425 ; GFX11: ; %bb.0: ; %entry
426 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
427 ; GFX11-NEXT: s_mov_b32 s6, -1
428 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
429 ; GFX11-NEXT: s_mov_b32 s10, s6
430 ; GFX11-NEXT: s_mov_b32 s11, s7
431 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
432 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
433 ; GFX11-NEXT: s_mov_b32 s8, s2
434 ; GFX11-NEXT: s_mov_b32 s9, s3
435 ; GFX11-NEXT: s_mov_b32 s4, s0
436 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
437 ; GFX11-NEXT: s_mov_b32 s5, s1
438 ; GFX11-NEXT: s_waitcnt vmcnt(0)
439 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
440 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
441 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
442 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
443 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
444 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
445 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
446 ; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
447 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
448 ; GFX11-NEXT: s_nop 0
449 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
450 ; GFX11-NEXT: s_endpgm
452 ptr addrspace(1) %a) {
454 %a.val = load <2 x half>, ptr addrspace(1) %a
455 %r.val = fptoui <2 x half> %a.val to <2 x i64>
456 store <2 x i64> %r.val, ptr addrspace(1) %r
; Scalar case with a half kernel argument: fptoui %in to i1 and store the
; result to %out with a byte store.
; The checks below expect no conversion instruction to an integer; instead the
; input is compared for equality with 1.0 and v_cndmask selects 0 or 1.
; The SI run first widens the value with v_cvt_f32_f16 and compares with
; v_cmp_eq_f32, while the VI and GFX11 runs compare directly with v_cmp_eq_f16.
460 define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
461 ; SI-LABEL: fptoui_f16_to_i1:
462 ; SI: ; %bb.0: ; %entry
463 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
464 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
465 ; SI-NEXT: s_mov_b32 s3, 0xf000
466 ; SI-NEXT: s_waitcnt lgkmcnt(0)
467 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
468 ; SI-NEXT: s_mov_b32 s2, -1
469 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 1.0, v0
470 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
471 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
474 ; VI-LABEL: fptoui_f16_to_i1:
475 ; VI: ; %bb.0: ; %entry
476 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
477 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
478 ; VI-NEXT: s_mov_b32 s3, 0xf000
479 ; VI-NEXT: s_mov_b32 s2, -1
480 ; VI-NEXT: s_waitcnt lgkmcnt(0)
481 ; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], 1.0, s4
482 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
483 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
486 ; GFX11-LABEL: fptoui_f16_to_i1:
487 ; GFX11: ; %bb.0: ; %entry
488 ; GFX11-NEXT: s_clause 0x1
489 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
490 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
491 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
492 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
493 ; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s2
494 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
495 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
496 ; GFX11-NEXT: s_mov_b32 s2, -1
497 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
498 ; GFX11-NEXT: s_nop 0
499 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
500 ; GFX11-NEXT: s_endpgm
502 %conv = fptoui half %in to i1
503 store i1 %conv, ptr addrspace(1) %out