1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11 %s
; fcmp_f16_lt: ordered less-than (fcmp olt) on two volatile-loaded half values;
; the i1 result is sign-extended to i32 and stored through %r.
; Expected codegen per the checks below: SI extends both operands with
; v_cvt_f32_f16 and compares with v_cmp_lt_f32; VI and GFX11 compare directly
; with v_cmp_lt_f16. The sext is checked as v_cndmask_b32 selecting 0 / -1.
; The GFX11 epilogue expects s_nop 0 ahead of the MSG_DEALLOC_VGPRS s_sendmsg,
; the same sequence every other kernel in this file checks for.
; NOTE(review): only %b is declared in the visible signature although %r and %a
; are used, and no ret / closing brace is visible here -- confirm against the
; complete file.
6 define amdgpu_kernel void @fcmp_f16_lt(
7 ; SI-LABEL: fcmp_f16_lt:
8 ; SI: ; %bb.0: ; %entry
9 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
10 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
11 ; SI-NEXT: s_mov_b32 s11, 0xf000
12 ; SI-NEXT: s_mov_b32 s10, -1
13 ; SI-NEXT: s_mov_b32 s14, s10
14 ; SI-NEXT: s_mov_b32 s15, s11
15 ; SI-NEXT: s_mov_b32 s2, s10
16 ; SI-NEXT: s_mov_b32 s3, s11
17 ; SI-NEXT: s_waitcnt lgkmcnt(0)
18 ; SI-NEXT: s_mov_b32 s12, s6
19 ; SI-NEXT: s_mov_b32 s13, s7
20 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
21 ; SI-NEXT: s_waitcnt vmcnt(0)
22 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
23 ; SI-NEXT: s_waitcnt vmcnt(0)
24 ; SI-NEXT: s_mov_b32 s8, s4
25 ; SI-NEXT: s_mov_b32 s9, s5
26 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
27 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
28 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
29 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
30 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
33 ; VI-LABEL: fcmp_f16_lt:
34 ; VI: ; %bb.0: ; %entry
35 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
36 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
37 ; VI-NEXT: s_mov_b32 s3, 0xf000
38 ; VI-NEXT: s_mov_b32 s2, -1
39 ; VI-NEXT: s_mov_b32 s14, s2
40 ; VI-NEXT: s_waitcnt lgkmcnt(0)
41 ; VI-NEXT: s_mov_b32 s12, s6
42 ; VI-NEXT: s_mov_b32 s13, s7
43 ; VI-NEXT: s_mov_b32 s15, s3
44 ; VI-NEXT: s_mov_b32 s10, s2
45 ; VI-NEXT: s_mov_b32 s11, s3
46 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
47 ; VI-NEXT: s_waitcnt vmcnt(0)
48 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
49 ; VI-NEXT: s_waitcnt vmcnt(0)
50 ; VI-NEXT: s_mov_b32 s0, s4
51 ; VI-NEXT: s_mov_b32 s1, s5
52 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
53 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
54 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
57 ; GFX11-LABEL: fcmp_f16_lt:
58 ; GFX11: ; %bb.0: ; %entry
59 ; GFX11-NEXT: s_clause 0x1
60 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
61 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
62 ; GFX11-NEXT: s_mov_b32 s10, -1
63 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
64 ; GFX11-NEXT: s_mov_b32 s14, s10
65 ; GFX11-NEXT: s_mov_b32 s15, s11
66 ; GFX11-NEXT: s_mov_b32 s2, s10
67 ; GFX11-NEXT: s_mov_b32 s3, s11
68 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX11-NEXT: s_mov_b32 s12, s6
70 ; GFX11-NEXT: s_mov_b32 s13, s7
71 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
72 ; GFX11-NEXT: s_waitcnt vmcnt(0)
73 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
74 ; GFX11-NEXT: s_waitcnt vmcnt(0)
75 ; GFX11-NEXT: s_mov_b32 s8, s4
76 ; GFX11-NEXT: s_mov_b32 s9, s5
77 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
78 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
79 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
80 ; GFX11-NEXT: s_nop 0
81 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
82 ; GFX11-NEXT: s_endpgm
85 ptr addrspace(1) %b) {
87 %a.val = load volatile half, ptr addrspace(1) %a
88 %b.val = load volatile half, ptr addrspace(1) %b
89 %r.val = fcmp olt half %a.val, %b.val
90 %r.val.sext = sext i1 %r.val to i32
91 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_lt_abs: fcmp olt on the llvm.fabs.f16 of two volatile-loaded half
; values; the i1 result is sign-extended to i32 and stored through %r.
; The checks expect both fabs calls to appear as |v0| / |v1| source modifiers:
; on the v_cvt_f32_f16_e64 conversions for SI (followed by v_cmp_lt_f32), and on
; v_cmp_lt_f16_e64 for VI and GFX11. GFX11 also expects an s_delay_alu between
; the compare and the v_cndmask_b32 that selects 0 / -1.
; NOTE(review): only %b is declared in the visible signature although %r and %a
; are used, and no ret / closing brace is visible here -- confirm against the
; complete file.
95 define amdgpu_kernel void @fcmp_f16_lt_abs(
96 ; SI-LABEL: fcmp_f16_lt_abs:
97 ; SI: ; %bb.0: ; %entry
98 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
99 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
100 ; SI-NEXT: s_mov_b32 s11, 0xf000
101 ; SI-NEXT: s_mov_b32 s10, -1
102 ; SI-NEXT: s_mov_b32 s14, s10
103 ; SI-NEXT: s_mov_b32 s15, s11
104 ; SI-NEXT: s_mov_b32 s2, s10
105 ; SI-NEXT: s_mov_b32 s3, s11
106 ; SI-NEXT: s_waitcnt lgkmcnt(0)
107 ; SI-NEXT: s_mov_b32 s12, s6
108 ; SI-NEXT: s_mov_b32 s13, s7
109 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
110 ; SI-NEXT: s_waitcnt vmcnt(0)
111 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
112 ; SI-NEXT: s_waitcnt vmcnt(0)
113 ; SI-NEXT: s_mov_b32 s8, s4
114 ; SI-NEXT: s_mov_b32 s9, s5
115 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
116 ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
117 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
118 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
119 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
122 ; VI-LABEL: fcmp_f16_lt_abs:
123 ; VI: ; %bb.0: ; %entry
124 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
125 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
126 ; VI-NEXT: s_mov_b32 s3, 0xf000
127 ; VI-NEXT: s_mov_b32 s2, -1
128 ; VI-NEXT: s_mov_b32 s14, s2
129 ; VI-NEXT: s_waitcnt lgkmcnt(0)
130 ; VI-NEXT: s_mov_b32 s12, s6
131 ; VI-NEXT: s_mov_b32 s13, s7
132 ; VI-NEXT: s_mov_b32 s15, s3
133 ; VI-NEXT: s_mov_b32 s10, s2
134 ; VI-NEXT: s_mov_b32 s11, s3
135 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
136 ; VI-NEXT: s_waitcnt vmcnt(0)
137 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
138 ; VI-NEXT: s_waitcnt vmcnt(0)
139 ; VI-NEXT: s_mov_b32 s0, s4
140 ; VI-NEXT: s_mov_b32 s1, s5
141 ; VI-NEXT: v_cmp_lt_f16_e64 s[4:5], |v0|, |v1|
142 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
143 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
146 ; GFX11-LABEL: fcmp_f16_lt_abs:
147 ; GFX11: ; %bb.0: ; %entry
148 ; GFX11-NEXT: s_clause 0x1
149 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
150 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
151 ; GFX11-NEXT: s_mov_b32 s10, -1
152 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
153 ; GFX11-NEXT: s_mov_b32 s14, s10
154 ; GFX11-NEXT: s_mov_b32 s15, s11
155 ; GFX11-NEXT: s_mov_b32 s2, s10
156 ; GFX11-NEXT: s_mov_b32 s3, s11
157 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
158 ; GFX11-NEXT: s_mov_b32 s12, s6
159 ; GFX11-NEXT: s_mov_b32 s13, s7
160 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
161 ; GFX11-NEXT: s_waitcnt vmcnt(0)
162 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
163 ; GFX11-NEXT: s_waitcnt vmcnt(0)
164 ; GFX11-NEXT: s_mov_b32 s8, s4
165 ; GFX11-NEXT: s_mov_b32 s9, s5
166 ; GFX11-NEXT: v_cmp_lt_f16_e64 s0, |v0|, |v1|
167 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
168 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s0
169 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
170 ; GFX11-NEXT: s_nop 0
171 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
172 ; GFX11-NEXT: s_endpgm
175 ptr addrspace(1) %b) {
177 %a.val = load volatile half, ptr addrspace(1) %a
178 %b.val = load volatile half, ptr addrspace(1) %b
179 %a.abs = call half @llvm.fabs.f16(half %a.val)
180 %b.abs = call half @llvm.fabs.f16(half %b.val)
181 %r.val = fcmp olt half %a.abs, %b.abs
182 %r.val.sext = sext i1 %r.val to i32
183 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_eq: ordered equality (fcmp oeq) on two volatile-loaded half values;
; the i1 result is sign-extended to i32 and stored through %r.
; Expected codegen per the checks below: SI extends both operands with
; v_cvt_f32_f16 and compares with v_cmp_eq_f32; VI and GFX11 compare directly
; with v_cmp_eq_f16. The sext is checked as v_cndmask_b32 selecting 0 / -1.
; NOTE(review): only %b is declared in the visible signature although %r and %a
; are used, and no ret / closing brace is visible here -- confirm against the
; complete file.
187 define amdgpu_kernel void @fcmp_f16_eq(
188 ; SI-LABEL: fcmp_f16_eq:
189 ; SI: ; %bb.0: ; %entry
190 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
191 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
192 ; SI-NEXT: s_mov_b32 s11, 0xf000
193 ; SI-NEXT: s_mov_b32 s10, -1
194 ; SI-NEXT: s_mov_b32 s14, s10
195 ; SI-NEXT: s_mov_b32 s15, s11
196 ; SI-NEXT: s_mov_b32 s2, s10
197 ; SI-NEXT: s_mov_b32 s3, s11
198 ; SI-NEXT: s_waitcnt lgkmcnt(0)
199 ; SI-NEXT: s_mov_b32 s12, s6
200 ; SI-NEXT: s_mov_b32 s13, s7
201 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
202 ; SI-NEXT: s_waitcnt vmcnt(0)
203 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
204 ; SI-NEXT: s_waitcnt vmcnt(0)
205 ; SI-NEXT: s_mov_b32 s8, s4
206 ; SI-NEXT: s_mov_b32 s9, s5
207 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
208 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
209 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
210 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
211 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
214 ; VI-LABEL: fcmp_f16_eq:
215 ; VI: ; %bb.0: ; %entry
216 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
217 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
218 ; VI-NEXT: s_mov_b32 s3, 0xf000
219 ; VI-NEXT: s_mov_b32 s2, -1
220 ; VI-NEXT: s_mov_b32 s14, s2
221 ; VI-NEXT: s_waitcnt lgkmcnt(0)
222 ; VI-NEXT: s_mov_b32 s12, s6
223 ; VI-NEXT: s_mov_b32 s13, s7
224 ; VI-NEXT: s_mov_b32 s15, s3
225 ; VI-NEXT: s_mov_b32 s10, s2
226 ; VI-NEXT: s_mov_b32 s11, s3
227 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
228 ; VI-NEXT: s_waitcnt vmcnt(0)
229 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
230 ; VI-NEXT: s_waitcnt vmcnt(0)
231 ; VI-NEXT: s_mov_b32 s0, s4
232 ; VI-NEXT: s_mov_b32 s1, s5
233 ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1
234 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
235 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
238 ; GFX11-LABEL: fcmp_f16_eq:
239 ; GFX11: ; %bb.0: ; %entry
240 ; GFX11-NEXT: s_clause 0x1
241 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
242 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
243 ; GFX11-NEXT: s_mov_b32 s10, -1
244 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
245 ; GFX11-NEXT: s_mov_b32 s14, s10
246 ; GFX11-NEXT: s_mov_b32 s15, s11
247 ; GFX11-NEXT: s_mov_b32 s2, s10
248 ; GFX11-NEXT: s_mov_b32 s3, s11
249 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
250 ; GFX11-NEXT: s_mov_b32 s12, s6
251 ; GFX11-NEXT: s_mov_b32 s13, s7
252 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
253 ; GFX11-NEXT: s_waitcnt vmcnt(0)
254 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
255 ; GFX11-NEXT: s_waitcnt vmcnt(0)
256 ; GFX11-NEXT: s_mov_b32 s8, s4
257 ; GFX11-NEXT: s_mov_b32 s9, s5
258 ; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
259 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
260 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
261 ; GFX11-NEXT: s_nop 0
262 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
263 ; GFX11-NEXT: s_endpgm
266 ptr addrspace(1) %b) {
268 %a.val = load volatile half, ptr addrspace(1) %a
269 %b.val = load volatile half, ptr addrspace(1) %b
270 %r.val = fcmp oeq half %a.val, %b.val
271 %r.val.sext = sext i1 %r.val to i32
272 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_le: ordered less-or-equal (fcmp ole) on two volatile-loaded half
; values; the i1 result is sign-extended to i32 and stored through %r.
; Expected codegen per the checks below: SI extends both operands with
; v_cvt_f32_f16 and compares with v_cmp_le_f32; VI and GFX11 compare directly
; with v_cmp_le_f16. The sext is checked as v_cndmask_b32 selecting 0 / -1.
; NOTE(review): only %b is declared in the visible signature although %r and %a
; are used, and no ret / closing brace is visible here -- confirm against the
; complete file.
276 define amdgpu_kernel void @fcmp_f16_le(
277 ; SI-LABEL: fcmp_f16_le:
278 ; SI: ; %bb.0: ; %entry
279 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
280 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
281 ; SI-NEXT: s_mov_b32 s11, 0xf000
282 ; SI-NEXT: s_mov_b32 s10, -1
283 ; SI-NEXT: s_mov_b32 s14, s10
284 ; SI-NEXT: s_mov_b32 s15, s11
285 ; SI-NEXT: s_mov_b32 s2, s10
286 ; SI-NEXT: s_mov_b32 s3, s11
287 ; SI-NEXT: s_waitcnt lgkmcnt(0)
288 ; SI-NEXT: s_mov_b32 s12, s6
289 ; SI-NEXT: s_mov_b32 s13, s7
290 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
291 ; SI-NEXT: s_waitcnt vmcnt(0)
292 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
293 ; SI-NEXT: s_waitcnt vmcnt(0)
294 ; SI-NEXT: s_mov_b32 s8, s4
295 ; SI-NEXT: s_mov_b32 s9, s5
296 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
297 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
298 ; SI-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
299 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
300 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
303 ; VI-LABEL: fcmp_f16_le:
304 ; VI: ; %bb.0: ; %entry
305 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
306 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
307 ; VI-NEXT: s_mov_b32 s3, 0xf000
308 ; VI-NEXT: s_mov_b32 s2, -1
309 ; VI-NEXT: s_mov_b32 s14, s2
310 ; VI-NEXT: s_waitcnt lgkmcnt(0)
311 ; VI-NEXT: s_mov_b32 s12, s6
312 ; VI-NEXT: s_mov_b32 s13, s7
313 ; VI-NEXT: s_mov_b32 s15, s3
314 ; VI-NEXT: s_mov_b32 s10, s2
315 ; VI-NEXT: s_mov_b32 s11, s3
316 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
317 ; VI-NEXT: s_waitcnt vmcnt(0)
318 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
319 ; VI-NEXT: s_waitcnt vmcnt(0)
320 ; VI-NEXT: s_mov_b32 s0, s4
321 ; VI-NEXT: s_mov_b32 s1, s5
322 ; VI-NEXT: v_cmp_le_f16_e32 vcc, v0, v1
323 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
324 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
327 ; GFX11-LABEL: fcmp_f16_le:
328 ; GFX11: ; %bb.0: ; %entry
329 ; GFX11-NEXT: s_clause 0x1
330 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
331 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
332 ; GFX11-NEXT: s_mov_b32 s10, -1
333 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
334 ; GFX11-NEXT: s_mov_b32 s14, s10
335 ; GFX11-NEXT: s_mov_b32 s15, s11
336 ; GFX11-NEXT: s_mov_b32 s2, s10
337 ; GFX11-NEXT: s_mov_b32 s3, s11
338 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
339 ; GFX11-NEXT: s_mov_b32 s12, s6
340 ; GFX11-NEXT: s_mov_b32 s13, s7
341 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
342 ; GFX11-NEXT: s_waitcnt vmcnt(0)
343 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
344 ; GFX11-NEXT: s_waitcnt vmcnt(0)
345 ; GFX11-NEXT: s_mov_b32 s8, s4
346 ; GFX11-NEXT: s_mov_b32 s9, s5
347 ; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
348 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
349 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
350 ; GFX11-NEXT: s_nop 0
351 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
352 ; GFX11-NEXT: s_endpgm
355 ptr addrspace(1) %b) {
357 %a.val = load volatile half, ptr addrspace(1) %a
358 %b.val = load volatile half, ptr addrspace(1) %b
359 %r.val = fcmp ole half %a.val, %b.val
360 %r.val.sext = sext i1 %r.val to i32
361 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_gt: ordered greater-than (fcmp ogt) on two volatile-loaded half
; values; the i1 result is sign-extended to i32 and stored through %r.
; Expected codegen per the checks below: SI extends both operands with
; v_cvt_f32_f16 and compares with v_cmp_gt_f32; VI and GFX11 compare directly
; with v_cmp_gt_f16. The sext is checked as v_cndmask_b32 selecting 0 / -1.
; NOTE(review): only %b is declared in the visible signature although %r and %a
; are used, and no ret / closing brace is visible here -- confirm against the
; complete file.
365 define amdgpu_kernel void @fcmp_f16_gt(
366 ; SI-LABEL: fcmp_f16_gt:
367 ; SI: ; %bb.0: ; %entry
368 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
369 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
370 ; SI-NEXT: s_mov_b32 s11, 0xf000
371 ; SI-NEXT: s_mov_b32 s10, -1
372 ; SI-NEXT: s_mov_b32 s14, s10
373 ; SI-NEXT: s_mov_b32 s15, s11
374 ; SI-NEXT: s_mov_b32 s2, s10
375 ; SI-NEXT: s_mov_b32 s3, s11
376 ; SI-NEXT: s_waitcnt lgkmcnt(0)
377 ; SI-NEXT: s_mov_b32 s12, s6
378 ; SI-NEXT: s_mov_b32 s13, s7
379 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
380 ; SI-NEXT: s_waitcnt vmcnt(0)
381 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
382 ; SI-NEXT: s_waitcnt vmcnt(0)
383 ; SI-NEXT: s_mov_b32 s8, s4
384 ; SI-NEXT: s_mov_b32 s9, s5
385 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
386 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
387 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
388 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
389 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
392 ; VI-LABEL: fcmp_f16_gt:
393 ; VI: ; %bb.0: ; %entry
394 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
395 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
396 ; VI-NEXT: s_mov_b32 s3, 0xf000
397 ; VI-NEXT: s_mov_b32 s2, -1
398 ; VI-NEXT: s_mov_b32 s14, s2
399 ; VI-NEXT: s_waitcnt lgkmcnt(0)
400 ; VI-NEXT: s_mov_b32 s12, s6
401 ; VI-NEXT: s_mov_b32 s13, s7
402 ; VI-NEXT: s_mov_b32 s15, s3
403 ; VI-NEXT: s_mov_b32 s10, s2
404 ; VI-NEXT: s_mov_b32 s11, s3
405 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
406 ; VI-NEXT: s_waitcnt vmcnt(0)
407 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
408 ; VI-NEXT: s_waitcnt vmcnt(0)
409 ; VI-NEXT: s_mov_b32 s0, s4
410 ; VI-NEXT: s_mov_b32 s1, s5
411 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
412 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
413 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
416 ; GFX11-LABEL: fcmp_f16_gt:
417 ; GFX11: ; %bb.0: ; %entry
418 ; GFX11-NEXT: s_clause 0x1
419 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
420 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
421 ; GFX11-NEXT: s_mov_b32 s10, -1
422 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
423 ; GFX11-NEXT: s_mov_b32 s14, s10
424 ; GFX11-NEXT: s_mov_b32 s15, s11
425 ; GFX11-NEXT: s_mov_b32 s2, s10
426 ; GFX11-NEXT: s_mov_b32 s3, s11
427 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
428 ; GFX11-NEXT: s_mov_b32 s12, s6
429 ; GFX11-NEXT: s_mov_b32 s13, s7
430 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
431 ; GFX11-NEXT: s_waitcnt vmcnt(0)
432 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
433 ; GFX11-NEXT: s_waitcnt vmcnt(0)
434 ; GFX11-NEXT: s_mov_b32 s8, s4
435 ; GFX11-NEXT: s_mov_b32 s9, s5
436 ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
437 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
438 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
439 ; GFX11-NEXT: s_nop 0
440 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
441 ; GFX11-NEXT: s_endpgm
444 ptr addrspace(1) %b) {
446 %a.val = load volatile half, ptr addrspace(1) %a
447 %b.val = load volatile half, ptr addrspace(1) %b
448 %r.val = fcmp ogt half %a.val, %b.val
449 %r.val.sext = sext i1 %r.val to i32
450 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_lg: ordered not-equal (fcmp one) on two volatile-loaded half values;
; the i1 result is sign-extended to i32 and stored through %r.
; Expected codegen per the checks below: the `one` predicate is matched as the
; "lg" compare -- v_cmp_lg_f32 after v_cvt_f32_f16 on SI, v_cmp_lg_f16 directly
; on VI and GFX11. The sext is checked as v_cndmask_b32 selecting 0 / -1.
; NOTE(review): only %b is declared in the visible signature although %r and %a
; are used, and no ret / closing brace is visible here -- confirm against the
; complete file.
454 define amdgpu_kernel void @fcmp_f16_lg(
455 ; SI-LABEL: fcmp_f16_lg:
456 ; SI: ; %bb.0: ; %entry
457 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
458 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
459 ; SI-NEXT: s_mov_b32 s11, 0xf000
460 ; SI-NEXT: s_mov_b32 s10, -1
461 ; SI-NEXT: s_mov_b32 s14, s10
462 ; SI-NEXT: s_mov_b32 s15, s11
463 ; SI-NEXT: s_mov_b32 s2, s10
464 ; SI-NEXT: s_mov_b32 s3, s11
465 ; SI-NEXT: s_waitcnt lgkmcnt(0)
466 ; SI-NEXT: s_mov_b32 s12, s6
467 ; SI-NEXT: s_mov_b32 s13, s7
468 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
469 ; SI-NEXT: s_waitcnt vmcnt(0)
470 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
471 ; SI-NEXT: s_waitcnt vmcnt(0)
472 ; SI-NEXT: s_mov_b32 s8, s4
473 ; SI-NEXT: s_mov_b32 s9, s5
474 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
475 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
476 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
477 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
478 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
481 ; VI-LABEL: fcmp_f16_lg:
482 ; VI: ; %bb.0: ; %entry
483 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
484 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
485 ; VI-NEXT: s_mov_b32 s3, 0xf000
486 ; VI-NEXT: s_mov_b32 s2, -1
487 ; VI-NEXT: s_mov_b32 s14, s2
488 ; VI-NEXT: s_waitcnt lgkmcnt(0)
489 ; VI-NEXT: s_mov_b32 s12, s6
490 ; VI-NEXT: s_mov_b32 s13, s7
491 ; VI-NEXT: s_mov_b32 s15, s3
492 ; VI-NEXT: s_mov_b32 s10, s2
493 ; VI-NEXT: s_mov_b32 s11, s3
494 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
495 ; VI-NEXT: s_waitcnt vmcnt(0)
496 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
497 ; VI-NEXT: s_waitcnt vmcnt(0)
498 ; VI-NEXT: s_mov_b32 s0, s4
499 ; VI-NEXT: s_mov_b32 s1, s5
500 ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1
501 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
502 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
505 ; GFX11-LABEL: fcmp_f16_lg:
506 ; GFX11: ; %bb.0: ; %entry
507 ; GFX11-NEXT: s_clause 0x1
508 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
509 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
510 ; GFX11-NEXT: s_mov_b32 s10, -1
511 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
512 ; GFX11-NEXT: s_mov_b32 s14, s10
513 ; GFX11-NEXT: s_mov_b32 s15, s11
514 ; GFX11-NEXT: s_mov_b32 s2, s10
515 ; GFX11-NEXT: s_mov_b32 s3, s11
516 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
517 ; GFX11-NEXT: s_mov_b32 s12, s6
518 ; GFX11-NEXT: s_mov_b32 s13, s7
519 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
520 ; GFX11-NEXT: s_waitcnt vmcnt(0)
521 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
522 ; GFX11-NEXT: s_waitcnt vmcnt(0)
523 ; GFX11-NEXT: s_mov_b32 s8, s4
524 ; GFX11-NEXT: s_mov_b32 s9, s5
525 ; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
526 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
527 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
528 ; GFX11-NEXT: s_nop 0
529 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
530 ; GFX11-NEXT: s_endpgm
533 ptr addrspace(1) %b) {
535 %a.val = load volatile half, ptr addrspace(1) %a
536 %b.val = load volatile half, ptr addrspace(1) %b
537 %r.val = fcmp one half %a.val, %b.val
538 %r.val.sext = sext i1 %r.val to i32
539 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_ge: ordered greater-or-equal (fcmp oge) on two volatile-loaded half
; values; the i1 result is sign-extended to i32 and stored through %r.
; Expected codegen per the checks below: SI extends both operands with
; v_cvt_f32_f16 and compares with v_cmp_ge_f32; VI and GFX11 compare directly
; with v_cmp_ge_f16. The sext is checked as v_cndmask_b32 selecting 0 / -1.
; NOTE(review): only %b is declared in the visible signature although %r and %a
; are used, and no ret / closing brace is visible here -- confirm against the
; complete file.
543 define amdgpu_kernel void @fcmp_f16_ge(
544 ; SI-LABEL: fcmp_f16_ge:
545 ; SI: ; %bb.0: ; %entry
546 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
547 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
548 ; SI-NEXT: s_mov_b32 s11, 0xf000
549 ; SI-NEXT: s_mov_b32 s10, -1
550 ; SI-NEXT: s_mov_b32 s14, s10
551 ; SI-NEXT: s_mov_b32 s15, s11
552 ; SI-NEXT: s_mov_b32 s2, s10
553 ; SI-NEXT: s_mov_b32 s3, s11
554 ; SI-NEXT: s_waitcnt lgkmcnt(0)
555 ; SI-NEXT: s_mov_b32 s12, s6
556 ; SI-NEXT: s_mov_b32 s13, s7
557 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
558 ; SI-NEXT: s_waitcnt vmcnt(0)
559 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
560 ; SI-NEXT: s_waitcnt vmcnt(0)
561 ; SI-NEXT: s_mov_b32 s8, s4
562 ; SI-NEXT: s_mov_b32 s9, s5
563 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
564 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
565 ; SI-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
566 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
567 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
570 ; VI-LABEL: fcmp_f16_ge:
571 ; VI: ; %bb.0: ; %entry
572 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
573 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
574 ; VI-NEXT: s_mov_b32 s3, 0xf000
575 ; VI-NEXT: s_mov_b32 s2, -1
576 ; VI-NEXT: s_mov_b32 s14, s2
577 ; VI-NEXT: s_waitcnt lgkmcnt(0)
578 ; VI-NEXT: s_mov_b32 s12, s6
579 ; VI-NEXT: s_mov_b32 s13, s7
580 ; VI-NEXT: s_mov_b32 s15, s3
581 ; VI-NEXT: s_mov_b32 s10, s2
582 ; VI-NEXT: s_mov_b32 s11, s3
583 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
584 ; VI-NEXT: s_waitcnt vmcnt(0)
585 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
586 ; VI-NEXT: s_waitcnt vmcnt(0)
587 ; VI-NEXT: s_mov_b32 s0, s4
588 ; VI-NEXT: s_mov_b32 s1, s5
589 ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1
590 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
591 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
594 ; GFX11-LABEL: fcmp_f16_ge:
595 ; GFX11: ; %bb.0: ; %entry
596 ; GFX11-NEXT: s_clause 0x1
597 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
598 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
599 ; GFX11-NEXT: s_mov_b32 s10, -1
600 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
601 ; GFX11-NEXT: s_mov_b32 s14, s10
602 ; GFX11-NEXT: s_mov_b32 s15, s11
603 ; GFX11-NEXT: s_mov_b32 s2, s10
604 ; GFX11-NEXT: s_mov_b32 s3, s11
605 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
606 ; GFX11-NEXT: s_mov_b32 s12, s6
607 ; GFX11-NEXT: s_mov_b32 s13, s7
608 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
609 ; GFX11-NEXT: s_waitcnt vmcnt(0)
610 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
611 ; GFX11-NEXT: s_waitcnt vmcnt(0)
612 ; GFX11-NEXT: s_mov_b32 s8, s4
613 ; GFX11-NEXT: s_mov_b32 s9, s5
614 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
615 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
616 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
617 ; GFX11-NEXT: s_nop 0
618 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
619 ; GFX11-NEXT: s_endpgm
622 ptr addrspace(1) %b) {
624 %a.val = load volatile half, ptr addrspace(1) %a
625 %b.val = load volatile half, ptr addrspace(1) %b
626 %r.val = fcmp oge half %a.val, %b.val
627 %r.val.sext = sext i1 %r.val to i32
628 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_o: "ordered" test (fcmp ord) on two volatile-loaded half values;
; the i1 result is sign-extended to i32 and stored through %r.
; Expected codegen per the checks below: SI extends both operands with
; v_cvt_f32_f16 and compares with v_cmp_o_f32; VI and GFX11 compare directly
; with v_cmp_o_f16. The sext is checked as v_cndmask_b32 selecting 0 / -1.
; NOTE(review): only %b is declared in the visible signature although %r and %a
; are used, and no ret / closing brace is visible here -- confirm against the
; complete file.
632 define amdgpu_kernel void @fcmp_f16_o(
633 ; SI-LABEL: fcmp_f16_o:
634 ; SI: ; %bb.0: ; %entry
635 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
636 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
637 ; SI-NEXT: s_mov_b32 s11, 0xf000
638 ; SI-NEXT: s_mov_b32 s10, -1
639 ; SI-NEXT: s_mov_b32 s14, s10
640 ; SI-NEXT: s_mov_b32 s15, s11
641 ; SI-NEXT: s_mov_b32 s2, s10
642 ; SI-NEXT: s_mov_b32 s3, s11
643 ; SI-NEXT: s_waitcnt lgkmcnt(0)
644 ; SI-NEXT: s_mov_b32 s12, s6
645 ; SI-NEXT: s_mov_b32 s13, s7
646 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
647 ; SI-NEXT: s_waitcnt vmcnt(0)
648 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
649 ; SI-NEXT: s_waitcnt vmcnt(0)
650 ; SI-NEXT: s_mov_b32 s8, s4
651 ; SI-NEXT: s_mov_b32 s9, s5
652 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
653 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
654 ; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
655 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
656 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
659 ; VI-LABEL: fcmp_f16_o:
660 ; VI: ; %bb.0: ; %entry
661 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
662 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
663 ; VI-NEXT: s_mov_b32 s3, 0xf000
664 ; VI-NEXT: s_mov_b32 s2, -1
665 ; VI-NEXT: s_mov_b32 s14, s2
666 ; VI-NEXT: s_waitcnt lgkmcnt(0)
667 ; VI-NEXT: s_mov_b32 s12, s6
668 ; VI-NEXT: s_mov_b32 s13, s7
669 ; VI-NEXT: s_mov_b32 s15, s3
670 ; VI-NEXT: s_mov_b32 s10, s2
671 ; VI-NEXT: s_mov_b32 s11, s3
672 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
673 ; VI-NEXT: s_waitcnt vmcnt(0)
674 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
675 ; VI-NEXT: s_waitcnt vmcnt(0)
676 ; VI-NEXT: s_mov_b32 s0, s4
677 ; VI-NEXT: s_mov_b32 s1, s5
678 ; VI-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
679 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
680 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
683 ; GFX11-LABEL: fcmp_f16_o:
684 ; GFX11: ; %bb.0: ; %entry
685 ; GFX11-NEXT: s_clause 0x1
686 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
687 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
688 ; GFX11-NEXT: s_mov_b32 s10, -1
689 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
690 ; GFX11-NEXT: s_mov_b32 s14, s10
691 ; GFX11-NEXT: s_mov_b32 s15, s11
692 ; GFX11-NEXT: s_mov_b32 s2, s10
693 ; GFX11-NEXT: s_mov_b32 s3, s11
694 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
695 ; GFX11-NEXT: s_mov_b32 s12, s6
696 ; GFX11-NEXT: s_mov_b32 s13, s7
697 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
698 ; GFX11-NEXT: s_waitcnt vmcnt(0)
699 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
700 ; GFX11-NEXT: s_waitcnt vmcnt(0)
701 ; GFX11-NEXT: s_mov_b32 s8, s4
702 ; GFX11-NEXT: s_mov_b32 s9, s5
703 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
704 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
705 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
706 ; GFX11-NEXT: s_nop 0
707 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
708 ; GFX11-NEXT: s_endpgm
711 ptr addrspace(1) %b) {
713 %a.val = load volatile half, ptr addrspace(1) %a
714 %b.val = load volatile half, ptr addrspace(1) %b
715 %r.val = fcmp ord half %a.val, %b.val
716 %r.val.sext = sext i1 %r.val to i32
717 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_u: "unordered" test (fcmp uno) on two volatile-loaded half values;
; the i1 result is sign-extended to i32 and stored through %r.
; Expected codegen per the checks below: SI extends both operands with
; v_cvt_f32_f16 and compares with v_cmp_u_f32; VI and GFX11 compare directly
; with v_cmp_u_f16. The sext is checked as v_cndmask_b32 selecting 0 / -1.
; NOTE(review): only %b is declared in the visible signature although %r and %a
; are used, and no ret / closing brace is visible here -- confirm against the
; complete file.
721 define amdgpu_kernel void @fcmp_f16_u(
722 ; SI-LABEL: fcmp_f16_u:
723 ; SI: ; %bb.0: ; %entry
724 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
725 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
726 ; SI-NEXT: s_mov_b32 s11, 0xf000
727 ; SI-NEXT: s_mov_b32 s10, -1
728 ; SI-NEXT: s_mov_b32 s14, s10
729 ; SI-NEXT: s_mov_b32 s15, s11
730 ; SI-NEXT: s_mov_b32 s2, s10
731 ; SI-NEXT: s_mov_b32 s3, s11
732 ; SI-NEXT: s_waitcnt lgkmcnt(0)
733 ; SI-NEXT: s_mov_b32 s12, s6
734 ; SI-NEXT: s_mov_b32 s13, s7
735 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
736 ; SI-NEXT: s_waitcnt vmcnt(0)
737 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
738 ; SI-NEXT: s_waitcnt vmcnt(0)
739 ; SI-NEXT: s_mov_b32 s8, s4
740 ; SI-NEXT: s_mov_b32 s9, s5
741 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
742 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
743 ; SI-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
744 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
745 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
748 ; VI-LABEL: fcmp_f16_u:
749 ; VI: ; %bb.0: ; %entry
750 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
751 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
752 ; VI-NEXT: s_mov_b32 s3, 0xf000
753 ; VI-NEXT: s_mov_b32 s2, -1
754 ; VI-NEXT: s_mov_b32 s14, s2
755 ; VI-NEXT: s_waitcnt lgkmcnt(0)
756 ; VI-NEXT: s_mov_b32 s12, s6
757 ; VI-NEXT: s_mov_b32 s13, s7
758 ; VI-NEXT: s_mov_b32 s15, s3
759 ; VI-NEXT: s_mov_b32 s10, s2
760 ; VI-NEXT: s_mov_b32 s11, s3
761 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
762 ; VI-NEXT: s_waitcnt vmcnt(0)
763 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
764 ; VI-NEXT: s_waitcnt vmcnt(0)
765 ; VI-NEXT: s_mov_b32 s0, s4
766 ; VI-NEXT: s_mov_b32 s1, s5
767 ; VI-NEXT: v_cmp_u_f16_e32 vcc, v0, v1
768 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
769 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
772 ; GFX11-LABEL: fcmp_f16_u:
773 ; GFX11: ; %bb.0: ; %entry
774 ; GFX11-NEXT: s_clause 0x1
775 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
776 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
777 ; GFX11-NEXT: s_mov_b32 s10, -1
778 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
779 ; GFX11-NEXT: s_mov_b32 s14, s10
780 ; GFX11-NEXT: s_mov_b32 s15, s11
781 ; GFX11-NEXT: s_mov_b32 s2, s10
782 ; GFX11-NEXT: s_mov_b32 s3, s11
783 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
784 ; GFX11-NEXT: s_mov_b32 s12, s6
785 ; GFX11-NEXT: s_mov_b32 s13, s7
786 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
787 ; GFX11-NEXT: s_waitcnt vmcnt(0)
788 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
789 ; GFX11-NEXT: s_waitcnt vmcnt(0)
790 ; GFX11-NEXT: s_mov_b32 s8, s4
791 ; GFX11-NEXT: s_mov_b32 s9, s5
792 ; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
793 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
794 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
795 ; GFX11-NEXT: s_nop 0
796 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
797 ; GFX11-NEXT: s_endpgm
800 ptr addrspace(1) %b) {
802 %a.val = load volatile half, ptr addrspace(1) %a
803 %b.val = load volatile half, ptr addrspace(1) %b
804 %r.val = fcmp uno half %a.val, %b.val
805 %r.val.sext = sext i1 %r.val to i32
806 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_nge: unordered-or-less-than (fcmp ult) on two volatile-loaded half
; values; the i1 result is sign-extended to i32 and stored through %r.
; Expected codegen per the checks below: the `ult` predicate is matched as the
; "nge" compare -- v_cmp_nge_f32 after v_cvt_f32_f16 on SI, v_cmp_nge_f16
; directly on VI and GFX11. The sext is checked as v_cndmask_b32 selecting
; 0 / -1.
; NOTE(review): only %b is declared in the visible signature although %r and %a
; are used, and no ret / closing brace is visible here -- confirm against the
; complete file.
810 define amdgpu_kernel void @fcmp_f16_nge(
811 ; SI-LABEL: fcmp_f16_nge:
812 ; SI: ; %bb.0: ; %entry
813 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
814 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
815 ; SI-NEXT: s_mov_b32 s11, 0xf000
816 ; SI-NEXT: s_mov_b32 s10, -1
817 ; SI-NEXT: s_mov_b32 s14, s10
818 ; SI-NEXT: s_mov_b32 s15, s11
819 ; SI-NEXT: s_mov_b32 s2, s10
820 ; SI-NEXT: s_mov_b32 s3, s11
821 ; SI-NEXT: s_waitcnt lgkmcnt(0)
822 ; SI-NEXT: s_mov_b32 s12, s6
823 ; SI-NEXT: s_mov_b32 s13, s7
824 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
825 ; SI-NEXT: s_waitcnt vmcnt(0)
826 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
827 ; SI-NEXT: s_waitcnt vmcnt(0)
828 ; SI-NEXT: s_mov_b32 s8, s4
829 ; SI-NEXT: s_mov_b32 s9, s5
830 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
831 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
832 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
833 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
834 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
837 ; VI-LABEL: fcmp_f16_nge:
838 ; VI: ; %bb.0: ; %entry
839 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
840 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
841 ; VI-NEXT: s_mov_b32 s3, 0xf000
842 ; VI-NEXT: s_mov_b32 s2, -1
843 ; VI-NEXT: s_mov_b32 s14, s2
844 ; VI-NEXT: s_waitcnt lgkmcnt(0)
845 ; VI-NEXT: s_mov_b32 s12, s6
846 ; VI-NEXT: s_mov_b32 s13, s7
847 ; VI-NEXT: s_mov_b32 s15, s3
848 ; VI-NEXT: s_mov_b32 s10, s2
849 ; VI-NEXT: s_mov_b32 s11, s3
850 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
851 ; VI-NEXT: s_waitcnt vmcnt(0)
852 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
853 ; VI-NEXT: s_waitcnt vmcnt(0)
854 ; VI-NEXT: s_mov_b32 s0, s4
855 ; VI-NEXT: s_mov_b32 s1, s5
856 ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1
857 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
858 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
861 ; GFX11-LABEL: fcmp_f16_nge:
862 ; GFX11: ; %bb.0: ; %entry
863 ; GFX11-NEXT: s_clause 0x1
864 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
865 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
866 ; GFX11-NEXT: s_mov_b32 s10, -1
867 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
868 ; GFX11-NEXT: s_mov_b32 s14, s10
869 ; GFX11-NEXT: s_mov_b32 s15, s11
870 ; GFX11-NEXT: s_mov_b32 s2, s10
871 ; GFX11-NEXT: s_mov_b32 s3, s11
872 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
873 ; GFX11-NEXT: s_mov_b32 s12, s6
874 ; GFX11-NEXT: s_mov_b32 s13, s7
875 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
876 ; GFX11-NEXT: s_waitcnt vmcnt(0)
877 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
878 ; GFX11-NEXT: s_waitcnt vmcnt(0)
879 ; GFX11-NEXT: s_mov_b32 s8, s4
880 ; GFX11-NEXT: s_mov_b32 s9, s5
881 ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
882 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
883 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
884 ; GFX11-NEXT: s_nop 0
885 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
886 ; GFX11-NEXT: s_endpgm
889 ptr addrspace(1) %b) {
891 %a.val = load volatile half, ptr addrspace(1) %a
892 %b.val = load volatile half, ptr addrspace(1) %b
893 %r.val = fcmp ult half %a.val, %b.val
894 %r.val.sext = sext i1 %r.val to i32
895 store i32 %r.val.sext, ptr addrspace(1) %r
; Scalar half "not less-or-greater" compare test.
; Loads two volatile halves from %a and %b, compares them with `fcmp ueq`,
; sign-extends the i1 result to i32 and stores it to %r.
; Per the autogenerated checks below, the SI run converts both operands to f32
; and uses v_cmp_nlg_f32, while the VI and GFX11 runs use v_cmp_nlg_f16 directly.
899 define amdgpu_kernel void @fcmp_f16_nlg(
900 ; SI-LABEL: fcmp_f16_nlg:
901 ; SI: ; %bb.0: ; %entry
902 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
903 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
904 ; SI-NEXT: s_mov_b32 s11, 0xf000
905 ; SI-NEXT: s_mov_b32 s10, -1
906 ; SI-NEXT: s_mov_b32 s14, s10
907 ; SI-NEXT: s_mov_b32 s15, s11
908 ; SI-NEXT: s_mov_b32 s2, s10
909 ; SI-NEXT: s_mov_b32 s3, s11
910 ; SI-NEXT: s_waitcnt lgkmcnt(0)
911 ; SI-NEXT: s_mov_b32 s12, s6
912 ; SI-NEXT: s_mov_b32 s13, s7
913 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
914 ; SI-NEXT: s_waitcnt vmcnt(0)
915 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
916 ; SI-NEXT: s_waitcnt vmcnt(0)
917 ; SI-NEXT: s_mov_b32 s8, s4
918 ; SI-NEXT: s_mov_b32 s9, s5
919 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
920 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
921 ; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
922 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
923 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
926 ; VI-LABEL: fcmp_f16_nlg:
927 ; VI: ; %bb.0: ; %entry
928 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
929 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
930 ; VI-NEXT: s_mov_b32 s3, 0xf000
931 ; VI-NEXT: s_mov_b32 s2, -1
932 ; VI-NEXT: s_mov_b32 s14, s2
933 ; VI-NEXT: s_waitcnt lgkmcnt(0)
934 ; VI-NEXT: s_mov_b32 s12, s6
935 ; VI-NEXT: s_mov_b32 s13, s7
936 ; VI-NEXT: s_mov_b32 s15, s3
937 ; VI-NEXT: s_mov_b32 s10, s2
938 ; VI-NEXT: s_mov_b32 s11, s3
939 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
940 ; VI-NEXT: s_waitcnt vmcnt(0)
941 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
942 ; VI-NEXT: s_waitcnt vmcnt(0)
943 ; VI-NEXT: s_mov_b32 s0, s4
944 ; VI-NEXT: s_mov_b32 s1, s5
945 ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1
946 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
947 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
950 ; GFX11-LABEL: fcmp_f16_nlg:
951 ; GFX11: ; %bb.0: ; %entry
952 ; GFX11-NEXT: s_clause 0x1
953 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
954 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
955 ; GFX11-NEXT: s_mov_b32 s10, -1
956 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
957 ; GFX11-NEXT: s_mov_b32 s14, s10
958 ; GFX11-NEXT: s_mov_b32 s15, s11
959 ; GFX11-NEXT: s_mov_b32 s2, s10
960 ; GFX11-NEXT: s_mov_b32 s3, s11
961 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
962 ; GFX11-NEXT: s_mov_b32 s12, s6
963 ; GFX11-NEXT: s_mov_b32 s13, s7
964 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
965 ; GFX11-NEXT: s_waitcnt vmcnt(0)
966 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
967 ; GFX11-NEXT: s_waitcnt vmcnt(0)
968 ; GFX11-NEXT: s_mov_b32 s8, s4
969 ; GFX11-NEXT: s_mov_b32 s9, s5
970 ; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
971 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
972 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
973 ; GFX11-NEXT: s_nop 0
974 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
975 ; GFX11-NEXT: s_endpgm
978 ptr addrspace(1) %b) {
980 %a.val = load volatile half, ptr addrspace(1) %a
981 %b.val = load volatile half, ptr addrspace(1) %b
; ueq: true when the operands compare equal or either one is a NaN.
982 %r.val = fcmp ueq half %a.val, %b.val
983 %r.val.sext = sext i1 %r.val to i32
984 store i32 %r.val.sext, ptr addrspace(1) %r
; Scalar half "not greater-than" compare test.
; Loads two volatile halves from %a and %b, compares them with `fcmp ule`,
; sign-extends the i1 result to i32 and stores it to %r.
; Per the autogenerated checks below, the SI run converts both operands to f32
; and uses v_cmp_ngt_f32, while the VI and GFX11 runs use v_cmp_ngt_f16 directly.
988 define amdgpu_kernel void @fcmp_f16_ngt(
989 ; SI-LABEL: fcmp_f16_ngt:
990 ; SI: ; %bb.0: ; %entry
991 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
992 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
993 ; SI-NEXT: s_mov_b32 s11, 0xf000
994 ; SI-NEXT: s_mov_b32 s10, -1
995 ; SI-NEXT: s_mov_b32 s14, s10
996 ; SI-NEXT: s_mov_b32 s15, s11
997 ; SI-NEXT: s_mov_b32 s2, s10
998 ; SI-NEXT: s_mov_b32 s3, s11
999 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1000 ; SI-NEXT: s_mov_b32 s12, s6
1001 ; SI-NEXT: s_mov_b32 s13, s7
1002 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1003 ; SI-NEXT: s_waitcnt vmcnt(0)
1004 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
1005 ; SI-NEXT: s_waitcnt vmcnt(0)
1006 ; SI-NEXT: s_mov_b32 s8, s4
1007 ; SI-NEXT: s_mov_b32 s9, s5
1008 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1009 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1010 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
1011 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1012 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1015 ; VI-LABEL: fcmp_f16_ngt:
1016 ; VI: ; %bb.0: ; %entry
1017 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1018 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1019 ; VI-NEXT: s_mov_b32 s3, 0xf000
1020 ; VI-NEXT: s_mov_b32 s2, -1
1021 ; VI-NEXT: s_mov_b32 s14, s2
1022 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1023 ; VI-NEXT: s_mov_b32 s12, s6
1024 ; VI-NEXT: s_mov_b32 s13, s7
1025 ; VI-NEXT: s_mov_b32 s15, s3
1026 ; VI-NEXT: s_mov_b32 s10, s2
1027 ; VI-NEXT: s_mov_b32 s11, s3
1028 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1029 ; VI-NEXT: s_waitcnt vmcnt(0)
1030 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
1031 ; VI-NEXT: s_waitcnt vmcnt(0)
1032 ; VI-NEXT: s_mov_b32 s0, s4
1033 ; VI-NEXT: s_mov_b32 s1, s5
1034 ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1
1035 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1036 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1039 ; GFX11-LABEL: fcmp_f16_ngt:
1040 ; GFX11: ; %bb.0: ; %entry
1041 ; GFX11-NEXT: s_clause 0x1
1042 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1043 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1044 ; GFX11-NEXT: s_mov_b32 s10, -1
1045 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1046 ; GFX11-NEXT: s_mov_b32 s14, s10
1047 ; GFX11-NEXT: s_mov_b32 s15, s11
1048 ; GFX11-NEXT: s_mov_b32 s2, s10
1049 ; GFX11-NEXT: s_mov_b32 s3, s11
1050 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1051 ; GFX11-NEXT: s_mov_b32 s12, s6
1052 ; GFX11-NEXT: s_mov_b32 s13, s7
1053 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1054 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1055 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
1056 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1057 ; GFX11-NEXT: s_mov_b32 s8, s4
1058 ; GFX11-NEXT: s_mov_b32 s9, s5
1059 ; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
1060 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1061 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1062 ; GFX11-NEXT: s_nop 0
1063 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1064 ; GFX11-NEXT: s_endpgm
1065 ptr addrspace(1) %r,
1066 ptr addrspace(1) %a,
1067 ptr addrspace(1) %b) {
1069 %a.val = load volatile half, ptr addrspace(1) %a
1070 %b.val = load volatile half, ptr addrspace(1) %b
; ule: true when %a.val <= %b.val or either operand is a NaN.
1071 %r.val = fcmp ule half %a.val, %b.val
1072 %r.val.sext = sext i1 %r.val to i32
1073 store i32 %r.val.sext, ptr addrspace(1) %r
; Scalar half "not less-or-equal" compare test.
; Loads two volatile halves from %a and %b, compares them with `fcmp ugt`,
; sign-extends the i1 result to i32 and stores it to %r.
; Per the autogenerated checks below, the SI run converts both operands to f32
; and uses v_cmp_nle_f32, while the VI and GFX11 runs use v_cmp_nle_f16 directly.
1077 define amdgpu_kernel void @fcmp_f16_nle(
1078 ; SI-LABEL: fcmp_f16_nle:
1079 ; SI: ; %bb.0: ; %entry
1080 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1081 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1082 ; SI-NEXT: s_mov_b32 s11, 0xf000
1083 ; SI-NEXT: s_mov_b32 s10, -1
1084 ; SI-NEXT: s_mov_b32 s14, s10
1085 ; SI-NEXT: s_mov_b32 s15, s11
1086 ; SI-NEXT: s_mov_b32 s2, s10
1087 ; SI-NEXT: s_mov_b32 s3, s11
1088 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1089 ; SI-NEXT: s_mov_b32 s12, s6
1090 ; SI-NEXT: s_mov_b32 s13, s7
1091 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1092 ; SI-NEXT: s_waitcnt vmcnt(0)
1093 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
1094 ; SI-NEXT: s_waitcnt vmcnt(0)
1095 ; SI-NEXT: s_mov_b32 s8, s4
1096 ; SI-NEXT: s_mov_b32 s9, s5
1097 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1098 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1099 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
1100 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1101 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1104 ; VI-LABEL: fcmp_f16_nle:
1105 ; VI: ; %bb.0: ; %entry
1106 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1107 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1108 ; VI-NEXT: s_mov_b32 s3, 0xf000
1109 ; VI-NEXT: s_mov_b32 s2, -1
1110 ; VI-NEXT: s_mov_b32 s14, s2
1111 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1112 ; VI-NEXT: s_mov_b32 s12, s6
1113 ; VI-NEXT: s_mov_b32 s13, s7
1114 ; VI-NEXT: s_mov_b32 s15, s3
1115 ; VI-NEXT: s_mov_b32 s10, s2
1116 ; VI-NEXT: s_mov_b32 s11, s3
1117 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1118 ; VI-NEXT: s_waitcnt vmcnt(0)
1119 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
1120 ; VI-NEXT: s_waitcnt vmcnt(0)
1121 ; VI-NEXT: s_mov_b32 s0, s4
1122 ; VI-NEXT: s_mov_b32 s1, s5
1123 ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1
1124 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1125 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1128 ; GFX11-LABEL: fcmp_f16_nle:
1129 ; GFX11: ; %bb.0: ; %entry
1130 ; GFX11-NEXT: s_clause 0x1
1131 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1132 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1133 ; GFX11-NEXT: s_mov_b32 s10, -1
1134 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1135 ; GFX11-NEXT: s_mov_b32 s14, s10
1136 ; GFX11-NEXT: s_mov_b32 s15, s11
1137 ; GFX11-NEXT: s_mov_b32 s2, s10
1138 ; GFX11-NEXT: s_mov_b32 s3, s11
1139 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1140 ; GFX11-NEXT: s_mov_b32 s12, s6
1141 ; GFX11-NEXT: s_mov_b32 s13, s7
1142 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1143 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1144 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
1145 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1146 ; GFX11-NEXT: s_mov_b32 s8, s4
1147 ; GFX11-NEXT: s_mov_b32 s9, s5
1148 ; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
1149 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1150 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1151 ; GFX11-NEXT: s_nop 0
1152 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1153 ; GFX11-NEXT: s_endpgm
1154 ptr addrspace(1) %r,
1155 ptr addrspace(1) %a,
1156 ptr addrspace(1) %b) {
1158 %a.val = load volatile half, ptr addrspace(1) %a
1159 %b.val = load volatile half, ptr addrspace(1) %b
; ugt: true when %a.val > %b.val or either operand is a NaN.
1160 %r.val = fcmp ugt half %a.val, %b.val
1161 %r.val.sext = sext i1 %r.val to i32
1162 store i32 %r.val.sext, ptr addrspace(1) %r
; Scalar half "not equal" compare test.
; Loads two volatile halves from %a and %b, compares them with `fcmp une`,
; sign-extends the i1 result to i32 and stores it to %r.
; Per the autogenerated checks below, the SI run converts both operands to f32
; and uses v_cmp_neq_f32, while the VI and GFX11 runs use v_cmp_neq_f16 directly.
1166 define amdgpu_kernel void @fcmp_f16_neq(
1167 ; SI-LABEL: fcmp_f16_neq:
1168 ; SI: ; %bb.0: ; %entry
1169 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1170 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1171 ; SI-NEXT: s_mov_b32 s11, 0xf000
1172 ; SI-NEXT: s_mov_b32 s10, -1
1173 ; SI-NEXT: s_mov_b32 s14, s10
1174 ; SI-NEXT: s_mov_b32 s15, s11
1175 ; SI-NEXT: s_mov_b32 s2, s10
1176 ; SI-NEXT: s_mov_b32 s3, s11
1177 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1178 ; SI-NEXT: s_mov_b32 s12, s6
1179 ; SI-NEXT: s_mov_b32 s13, s7
1180 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1181 ; SI-NEXT: s_waitcnt vmcnt(0)
1182 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
1183 ; SI-NEXT: s_waitcnt vmcnt(0)
1184 ; SI-NEXT: s_mov_b32 s8, s4
1185 ; SI-NEXT: s_mov_b32 s9, s5
1186 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1187 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1188 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
1189 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1190 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1193 ; VI-LABEL: fcmp_f16_neq:
1194 ; VI: ; %bb.0: ; %entry
1195 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1196 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1197 ; VI-NEXT: s_mov_b32 s3, 0xf000
1198 ; VI-NEXT: s_mov_b32 s2, -1
1199 ; VI-NEXT: s_mov_b32 s14, s2
1200 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1201 ; VI-NEXT: s_mov_b32 s12, s6
1202 ; VI-NEXT: s_mov_b32 s13, s7
1203 ; VI-NEXT: s_mov_b32 s15, s3
1204 ; VI-NEXT: s_mov_b32 s10, s2
1205 ; VI-NEXT: s_mov_b32 s11, s3
1206 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1207 ; VI-NEXT: s_waitcnt vmcnt(0)
1208 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
1209 ; VI-NEXT: s_waitcnt vmcnt(0)
1210 ; VI-NEXT: s_mov_b32 s0, s4
1211 ; VI-NEXT: s_mov_b32 s1, s5
1212 ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1
1213 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1214 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1217 ; GFX11-LABEL: fcmp_f16_neq:
1218 ; GFX11: ; %bb.0: ; %entry
1219 ; GFX11-NEXT: s_clause 0x1
1220 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1221 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1222 ; GFX11-NEXT: s_mov_b32 s10, -1
1223 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1224 ; GFX11-NEXT: s_mov_b32 s14, s10
1225 ; GFX11-NEXT: s_mov_b32 s15, s11
1226 ; GFX11-NEXT: s_mov_b32 s2, s10
1227 ; GFX11-NEXT: s_mov_b32 s3, s11
1228 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1229 ; GFX11-NEXT: s_mov_b32 s12, s6
1230 ; GFX11-NEXT: s_mov_b32 s13, s7
1231 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1232 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1233 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
1234 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1235 ; GFX11-NEXT: s_mov_b32 s8, s4
1236 ; GFX11-NEXT: s_mov_b32 s9, s5
1237 ; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
1238 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1239 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1240 ; GFX11-NEXT: s_nop 0
1241 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1242 ; GFX11-NEXT: s_endpgm
1243 ptr addrspace(1) %r,
1244 ptr addrspace(1) %a,
1245 ptr addrspace(1) %b) {
1247 %a.val = load volatile half, ptr addrspace(1) %a
1248 %b.val = load volatile half, ptr addrspace(1) %b
; une: true when the operands are not equal or either one is a NaN.
1249 %r.val = fcmp une half %a.val, %b.val
1250 %r.val.sext = sext i1 %r.val to i32
1251 store i32 %r.val.sext, ptr addrspace(1) %r
; Scalar half "not less-than" compare test.
; Loads two volatile halves from %a and %b, compares them with `fcmp uge`,
; sign-extends the i1 result to i32 and stores it to %r.
; Per the autogenerated checks below, the SI run converts both operands to f32
; and uses v_cmp_nlt_f32, while the VI and GFX11 runs use v_cmp_nlt_f16 directly.
1255 define amdgpu_kernel void @fcmp_f16_nlt(
1256 ; SI-LABEL: fcmp_f16_nlt:
1257 ; SI: ; %bb.0: ; %entry
1258 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1259 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1260 ; SI-NEXT: s_mov_b32 s11, 0xf000
1261 ; SI-NEXT: s_mov_b32 s10, -1
1262 ; SI-NEXT: s_mov_b32 s14, s10
1263 ; SI-NEXT: s_mov_b32 s15, s11
1264 ; SI-NEXT: s_mov_b32 s2, s10
1265 ; SI-NEXT: s_mov_b32 s3, s11
1266 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1267 ; SI-NEXT: s_mov_b32 s12, s6
1268 ; SI-NEXT: s_mov_b32 s13, s7
1269 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1270 ; SI-NEXT: s_waitcnt vmcnt(0)
1271 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
1272 ; SI-NEXT: s_waitcnt vmcnt(0)
1273 ; SI-NEXT: s_mov_b32 s8, s4
1274 ; SI-NEXT: s_mov_b32 s9, s5
1275 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1276 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1277 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
1278 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1279 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1282 ; VI-LABEL: fcmp_f16_nlt:
1283 ; VI: ; %bb.0: ; %entry
1284 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1285 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1286 ; VI-NEXT: s_mov_b32 s3, 0xf000
1287 ; VI-NEXT: s_mov_b32 s2, -1
1288 ; VI-NEXT: s_mov_b32 s14, s2
1289 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1290 ; VI-NEXT: s_mov_b32 s12, s6
1291 ; VI-NEXT: s_mov_b32 s13, s7
1292 ; VI-NEXT: s_mov_b32 s15, s3
1293 ; VI-NEXT: s_mov_b32 s10, s2
1294 ; VI-NEXT: s_mov_b32 s11, s3
1295 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1296 ; VI-NEXT: s_waitcnt vmcnt(0)
1297 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
1298 ; VI-NEXT: s_waitcnt vmcnt(0)
1299 ; VI-NEXT: s_mov_b32 s0, s4
1300 ; VI-NEXT: s_mov_b32 s1, s5
1301 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
1302 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1303 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1306 ; GFX11-LABEL: fcmp_f16_nlt:
1307 ; GFX11: ; %bb.0: ; %entry
1308 ; GFX11-NEXT: s_clause 0x1
1309 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1310 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1311 ; GFX11-NEXT: s_mov_b32 s10, -1
1312 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1313 ; GFX11-NEXT: s_mov_b32 s14, s10
1314 ; GFX11-NEXT: s_mov_b32 s15, s11
1315 ; GFX11-NEXT: s_mov_b32 s2, s10
1316 ; GFX11-NEXT: s_mov_b32 s3, s11
1317 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1318 ; GFX11-NEXT: s_mov_b32 s12, s6
1319 ; GFX11-NEXT: s_mov_b32 s13, s7
1320 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1321 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1322 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
1323 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1324 ; GFX11-NEXT: s_mov_b32 s8, s4
1325 ; GFX11-NEXT: s_mov_b32 s9, s5
1326 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
1327 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1328 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1329 ; GFX11-NEXT: s_nop 0
1330 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1331 ; GFX11-NEXT: s_endpgm
1332 ptr addrspace(1) %r,
1333 ptr addrspace(1) %a,
1334 ptr addrspace(1) %b) {
1336 %a.val = load volatile half, ptr addrspace(1) %a
1337 %b.val = load volatile half, ptr addrspace(1) %b
; uge: true when %a.val >= %b.val or either operand is a NaN.
1338 %r.val = fcmp uge half %a.val, %b.val
1339 %r.val.sext = sext i1 %r.val to i32
1340 store i32 %r.val.sext, ptr addrspace(1) %r
; <2 x half> ordered "less-than" compare test.
; Loads <2 x half> values from %a and %b (non-volatile), compares them
; element-wise with `fcmp olt`, sign-extends the <2 x i1> result to <2 x i32>
; and stores it to %r.
; Per the autogenerated checks below, every run emits one scalar compare per
; element, using a 16-bit right shift to reach the high half of each loaded
; dword. The SI run converts each half to f32 and uses v_cmp_lt_f32; the VI and
; GFX11 runs use v_cmp_lt_f16.
1344 define amdgpu_kernel void @fcmp_v2f16_lt(
1345 ; SI-LABEL: fcmp_v2f16_lt:
1346 ; SI: ; %bb.0: ; %entry
1347 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1348 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1349 ; SI-NEXT: s_mov_b32 s11, 0xf000
1350 ; SI-NEXT: s_mov_b32 s10, -1
1351 ; SI-NEXT: s_mov_b32 s14, s10
1352 ; SI-NEXT: s_mov_b32 s15, s11
1353 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1354 ; SI-NEXT: s_mov_b32 s12, s6
1355 ; SI-NEXT: s_mov_b32 s13, s7
1356 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
1357 ; SI-NEXT: s_mov_b32 s2, s10
1358 ; SI-NEXT: s_mov_b32 s3, s11
1359 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
1360 ; SI-NEXT: s_mov_b32 s8, s4
1361 ; SI-NEXT: s_mov_b32 s9, s5
1362 ; SI-NEXT: s_waitcnt vmcnt(1)
1363 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
1364 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1365 ; SI-NEXT: s_waitcnt vmcnt(0)
1366 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
1367 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1368 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
1369 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1370 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3
1371 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1372 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v1
1373 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1374 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1377 ; VI-LABEL: fcmp_v2f16_lt:
1378 ; VI: ; %bb.0: ; %entry
1379 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1380 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1381 ; VI-NEXT: s_mov_b32 s3, 0xf000
1382 ; VI-NEXT: s_mov_b32 s2, -1
1383 ; VI-NEXT: s_mov_b32 s10, s2
1384 ; VI-NEXT: s_mov_b32 s11, s3
1385 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1386 ; VI-NEXT: s_mov_b32 s12, s6
1387 ; VI-NEXT: s_mov_b32 s13, s7
1388 ; VI-NEXT: s_mov_b32 s14, s2
1389 ; VI-NEXT: s_mov_b32 s15, s3
1390 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1391 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
1392 ; VI-NEXT: s_mov_b32 s0, s4
1393 ; VI-NEXT: s_mov_b32 s1, s5
1394 ; VI-NEXT: s_waitcnt vmcnt(1)
1395 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1396 ; VI-NEXT: s_waitcnt vmcnt(0)
1397 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1398 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0
1399 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1400 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2
1401 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1402 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1405 ; GFX11-LABEL: fcmp_v2f16_lt:
1406 ; GFX11: ; %bb.0: ; %entry
1407 ; GFX11-NEXT: s_clause 0x1
1408 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1409 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1410 ; GFX11-NEXT: s_mov_b32 s10, -1
1411 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1412 ; GFX11-NEXT: s_mov_b32 s2, s10
1413 ; GFX11-NEXT: s_mov_b32 s3, s11
1414 ; GFX11-NEXT: s_mov_b32 s14, s10
1415 ; GFX11-NEXT: s_mov_b32 s15, s11
1416 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1417 ; GFX11-NEXT: s_mov_b32 s12, s6
1418 ; GFX11-NEXT: s_mov_b32 s13, s7
1419 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
1420 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
1421 ; GFX11-NEXT: s_mov_b32 s8, s4
1422 ; GFX11-NEXT: s_mov_b32 s9, s5
1423 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1424 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1425 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1426 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1427 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0
1428 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1429 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
1430 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2
1431 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1432 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
1433 ; GFX11-NEXT: s_nop 0
1434 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1435 ; GFX11-NEXT: s_endpgm
1436 ptr addrspace(1) %r,
1437 ptr addrspace(1) %a,
1438 ptr addrspace(1) %b) {
1440 %a.val = load <2 x half>, ptr addrspace(1) %a
1441 %b.val = load <2 x half>, ptr addrspace(1) %b
; olt: per element, true only when neither operand is a NaN and a < b.
1442 %r.val = fcmp olt <2 x half> %a.val, %b.val
1443 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
1444 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; <2 x half> ordered "equal" compare test.
; Loads <2 x half> values from %a and %b (non-volatile), compares them
; element-wise with `fcmp oeq`, sign-extends the <2 x i1> result to <2 x i32>
; and stores it to %r.
; Per the autogenerated checks below, every run emits one scalar compare per
; element, using a 16-bit right shift to reach the high half of each loaded
; dword. The SI run converts each half to f32 and uses v_cmp_eq_f32; the VI and
; GFX11 runs use v_cmp_eq_f16.
1449 define amdgpu_kernel void @fcmp_v2f16_eq(
1450 ; SI-LABEL: fcmp_v2f16_eq:
1451 ; SI: ; %bb.0: ; %entry
1452 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1453 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1454 ; SI-NEXT: s_mov_b32 s11, 0xf000
1455 ; SI-NEXT: s_mov_b32 s10, -1
1456 ; SI-NEXT: s_mov_b32 s14, s10
1457 ; SI-NEXT: s_mov_b32 s15, s11
1458 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1459 ; SI-NEXT: s_mov_b32 s12, s6
1460 ; SI-NEXT: s_mov_b32 s13, s7
1461 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
1462 ; SI-NEXT: s_mov_b32 s2, s10
1463 ; SI-NEXT: s_mov_b32 s3, s11
1464 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
1465 ; SI-NEXT: s_mov_b32 s8, s4
1466 ; SI-NEXT: s_mov_b32 s9, s5
1467 ; SI-NEXT: s_waitcnt vmcnt(1)
1468 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
1469 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1470 ; SI-NEXT: s_waitcnt vmcnt(0)
1471 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
1472 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1473 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
1474 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1475 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v3
1476 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1477 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v1
1478 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1479 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1482 ; VI-LABEL: fcmp_v2f16_eq:
1483 ; VI: ; %bb.0: ; %entry
1484 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1485 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1486 ; VI-NEXT: s_mov_b32 s3, 0xf000
1487 ; VI-NEXT: s_mov_b32 s2, -1
1488 ; VI-NEXT: s_mov_b32 s10, s2
1489 ; VI-NEXT: s_mov_b32 s11, s3
1490 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1491 ; VI-NEXT: s_mov_b32 s12, s6
1492 ; VI-NEXT: s_mov_b32 s13, s7
1493 ; VI-NEXT: s_mov_b32 s14, s2
1494 ; VI-NEXT: s_mov_b32 s15, s3
1495 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1496 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
1497 ; VI-NEXT: s_mov_b32 s0, s4
1498 ; VI-NEXT: s_mov_b32 s1, s5
1499 ; VI-NEXT: s_waitcnt vmcnt(1)
1500 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1501 ; VI-NEXT: s_waitcnt vmcnt(0)
1502 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1503 ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v1, v0
1504 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1505 ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v3, v2
1506 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1507 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1510 ; GFX11-LABEL: fcmp_v2f16_eq:
1511 ; GFX11: ; %bb.0: ; %entry
1512 ; GFX11-NEXT: s_clause 0x1
1513 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1514 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1515 ; GFX11-NEXT: s_mov_b32 s10, -1
1516 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1517 ; GFX11-NEXT: s_mov_b32 s2, s10
1518 ; GFX11-NEXT: s_mov_b32 s3, s11
1519 ; GFX11-NEXT: s_mov_b32 s14, s10
1520 ; GFX11-NEXT: s_mov_b32 s15, s11
1521 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1522 ; GFX11-NEXT: s_mov_b32 s12, s6
1523 ; GFX11-NEXT: s_mov_b32 s13, s7
1524 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
1525 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
1526 ; GFX11-NEXT: s_mov_b32 s8, s4
1527 ; GFX11-NEXT: s_mov_b32 s9, s5
1528 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1529 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1530 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1531 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1532 ; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v1, v0
1533 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1534 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
1535 ; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2
1536 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1537 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
1538 ; GFX11-NEXT: s_nop 0
1539 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1540 ; GFX11-NEXT: s_endpgm
1541 ptr addrspace(1) %r,
1542 ptr addrspace(1) %a,
1543 ptr addrspace(1) %b) {
1545 %a.val = load <2 x half>, ptr addrspace(1) %a
1546 %b.val = load <2 x half>, ptr addrspace(1) %b
; oeq: per element, true only when neither operand is a NaN and a == b.
1547 %r.val = fcmp oeq <2 x half> %a.val, %b.val
1548 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
1549 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; <2 x half> ordered "less-or-equal" compare test.
; Loads <2 x half> values from %a and %b (non-volatile), compares them
; element-wise with `fcmp ole`, sign-extends the <2 x i1> result to <2 x i32>
; and stores it to %r.
; Per the autogenerated checks below, every run emits one scalar compare per
; element, using a 16-bit right shift to reach the high half of each loaded
; dword. The SI run converts each half to f32 and uses v_cmp_le_f32; the VI and
; GFX11 runs use v_cmp_le_f16.
1553 define amdgpu_kernel void @fcmp_v2f16_le(
1554 ; SI-LABEL: fcmp_v2f16_le:
1555 ; SI: ; %bb.0: ; %entry
1556 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1557 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1558 ; SI-NEXT: s_mov_b32 s11, 0xf000
1559 ; SI-NEXT: s_mov_b32 s10, -1
1560 ; SI-NEXT: s_mov_b32 s14, s10
1561 ; SI-NEXT: s_mov_b32 s15, s11
1562 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1563 ; SI-NEXT: s_mov_b32 s12, s6
1564 ; SI-NEXT: s_mov_b32 s13, s7
1565 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
1566 ; SI-NEXT: s_mov_b32 s2, s10
1567 ; SI-NEXT: s_mov_b32 s3, s11
1568 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
1569 ; SI-NEXT: s_mov_b32 s8, s4
1570 ; SI-NEXT: s_mov_b32 s9, s5
1571 ; SI-NEXT: s_waitcnt vmcnt(1)
1572 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
1573 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1574 ; SI-NEXT: s_waitcnt vmcnt(0)
1575 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
1576 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1577 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
1578 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1579 ; SI-NEXT: v_cmp_le_f32_e32 vcc, v2, v3
1580 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1581 ; SI-NEXT: v_cmp_le_f32_e32 vcc, v4, v1
1582 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1583 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1586 ; VI-LABEL: fcmp_v2f16_le:
1587 ; VI: ; %bb.0: ; %entry
1588 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1589 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1590 ; VI-NEXT: s_mov_b32 s3, 0xf000
1591 ; VI-NEXT: s_mov_b32 s2, -1
1592 ; VI-NEXT: s_mov_b32 s10, s2
1593 ; VI-NEXT: s_mov_b32 s11, s3
1594 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1595 ; VI-NEXT: s_mov_b32 s12, s6
1596 ; VI-NEXT: s_mov_b32 s13, s7
1597 ; VI-NEXT: s_mov_b32 s14, s2
1598 ; VI-NEXT: s_mov_b32 s15, s3
1599 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1600 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
1601 ; VI-NEXT: s_mov_b32 s0, s4
1602 ; VI-NEXT: s_mov_b32 s1, s5
1603 ; VI-NEXT: s_waitcnt vmcnt(1)
1604 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1605 ; VI-NEXT: s_waitcnt vmcnt(0)
1606 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1607 ; VI-NEXT: v_cmp_le_f16_e32 vcc, v1, v0
1608 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1609 ; VI-NEXT: v_cmp_le_f16_e32 vcc, v3, v2
1610 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1611 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1614 ; GFX11-LABEL: fcmp_v2f16_le:
1615 ; GFX11: ; %bb.0: ; %entry
1616 ; GFX11-NEXT: s_clause 0x1
1617 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1618 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1619 ; GFX11-NEXT: s_mov_b32 s10, -1
1620 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1621 ; GFX11-NEXT: s_mov_b32 s2, s10
1622 ; GFX11-NEXT: s_mov_b32 s3, s11
1623 ; GFX11-NEXT: s_mov_b32 s14, s10
1624 ; GFX11-NEXT: s_mov_b32 s15, s11
1625 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1626 ; GFX11-NEXT: s_mov_b32 s12, s6
1627 ; GFX11-NEXT: s_mov_b32 s13, s7
1628 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
1629 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
1630 ; GFX11-NEXT: s_mov_b32 s8, s4
1631 ; GFX11-NEXT: s_mov_b32 s9, s5
1632 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1633 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1634 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1635 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1636 ; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v1, v0
1637 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1638 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
1639 ; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2
1640 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1641 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
1642 ; GFX11-NEXT: s_nop 0
1643 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1644 ; GFX11-NEXT: s_endpgm
1645 ptr addrspace(1) %r,
1646 ptr addrspace(1) %a,
1647 ptr addrspace(1) %b) {
1649 %a.val = load <2 x half>, ptr addrspace(1) %a
1650 %b.val = load <2 x half>, ptr addrspace(1) %b
; ole: per element, true only when neither operand is a NaN and a <= b.
1651 %r.val = fcmp ole <2 x half> %a.val, %b.val
1652 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
1653 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; <2 x half> ordered "greater-than" compare test.
; Loads <2 x half> values from %a and %b (non-volatile), compares them
; element-wise with `fcmp ogt`, sign-extends the <2 x i1> result to <2 x i32>
; and stores it to %r.
; Per the autogenerated checks below, every run emits one scalar compare per
; element, using a 16-bit right shift to reach the high half of each loaded
; dword. The SI run converts each half to f32 and uses v_cmp_gt_f32; the VI and
; GFX11 runs use v_cmp_gt_f16.
1657 define amdgpu_kernel void @fcmp_v2f16_gt(
1658 ; SI-LABEL: fcmp_v2f16_gt:
1659 ; SI: ; %bb.0: ; %entry
1660 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1661 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1662 ; SI-NEXT: s_mov_b32 s11, 0xf000
1663 ; SI-NEXT: s_mov_b32 s10, -1
1664 ; SI-NEXT: s_mov_b32 s14, s10
1665 ; SI-NEXT: s_mov_b32 s15, s11
1666 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1667 ; SI-NEXT: s_mov_b32 s12, s6
1668 ; SI-NEXT: s_mov_b32 s13, s7
1669 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
1670 ; SI-NEXT: s_mov_b32 s2, s10
1671 ; SI-NEXT: s_mov_b32 s3, s11
1672 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
1673 ; SI-NEXT: s_mov_b32 s8, s4
1674 ; SI-NEXT: s_mov_b32 s9, s5
1675 ; SI-NEXT: s_waitcnt vmcnt(1)
1676 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
1677 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1678 ; SI-NEXT: s_waitcnt vmcnt(0)
1679 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
1680 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1681 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
1682 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1683 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3
1684 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1685 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, v4, v1
1686 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1687 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1690 ; VI-LABEL: fcmp_v2f16_gt:
1691 ; VI: ; %bb.0: ; %entry
1692 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1693 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1694 ; VI-NEXT: s_mov_b32 s3, 0xf000
1695 ; VI-NEXT: s_mov_b32 s2, -1
1696 ; VI-NEXT: s_mov_b32 s10, s2
1697 ; VI-NEXT: s_mov_b32 s11, s3
1698 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1699 ; VI-NEXT: s_mov_b32 s12, s6
1700 ; VI-NEXT: s_mov_b32 s13, s7
1701 ; VI-NEXT: s_mov_b32 s14, s2
1702 ; VI-NEXT: s_mov_b32 s15, s3
1703 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1704 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
1705 ; VI-NEXT: s_mov_b32 s0, s4
1706 ; VI-NEXT: s_mov_b32 s1, s5
1707 ; VI-NEXT: s_waitcnt vmcnt(1)
1708 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1709 ; VI-NEXT: s_waitcnt vmcnt(0)
1710 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1711 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v1, v0
1712 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1713 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
1714 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1715 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1718 ; GFX11-LABEL: fcmp_v2f16_gt:
1719 ; GFX11: ; %bb.0: ; %entry
1720 ; GFX11-NEXT: s_clause 0x1
1721 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1722 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1723 ; GFX11-NEXT: s_mov_b32 s10, -1
1724 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1725 ; GFX11-NEXT: s_mov_b32 s2, s10
1726 ; GFX11-NEXT: s_mov_b32 s3, s11
1727 ; GFX11-NEXT: s_mov_b32 s14, s10
1728 ; GFX11-NEXT: s_mov_b32 s15, s11
1729 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1730 ; GFX11-NEXT: s_mov_b32 s12, s6
1731 ; GFX11-NEXT: s_mov_b32 s13, s7
1732 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
1733 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
1734 ; GFX11-NEXT: s_mov_b32 s8, s4
1735 ; GFX11-NEXT: s_mov_b32 s9, s5
1736 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1737 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1738 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1739 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1740 ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v1, v0
1741 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1742 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
1743 ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2
1744 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1745 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
1746 ; GFX11-NEXT: s_nop 0
1747 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1748 ; GFX11-NEXT: s_endpgm
1749 ptr addrspace(1) %r,
1750 ptr addrspace(1) %a,
1751 ptr addrspace(1) %b) {
1753 %a.val = load <2 x half>, ptr addrspace(1) %a
1754 %b.val = load <2 x half>, ptr addrspace(1) %b
; ogt: per element, true only when neither operand is a NaN and a > b.
1755 %r.val = fcmp ogt <2 x half> %a.val, %b.val
1756 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
1757 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel fcmp_v2f16_lg loads a <2 x half> from %a and from %b, compares them
; lane-wise with `fcmp one`, sign-extends the <2 x i1> result to <2 x i32>
; (a true lane is stored as -1, a false lane as 0) and stores it to %r.
; Per the expected output below, the SI run widens every half with
; v_cvt_f32_f16 and compares with v_cmp_lg_f32, while the VI and GFX11 runs
; compare with v_cmp_lg_f16, using a 16-bit right shift to reach the upper lane.
1762 define amdgpu_kernel void @fcmp_v2f16_lg(
1763 ; SI-LABEL: fcmp_v2f16_lg:
1764 ; SI: ; %bb.0: ; %entry
1765 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1766 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1767 ; SI-NEXT: s_mov_b32 s11, 0xf000
1768 ; SI-NEXT: s_mov_b32 s10, -1
1769 ; SI-NEXT: s_mov_b32 s14, s10
1770 ; SI-NEXT: s_mov_b32 s15, s11
1771 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1772 ; SI-NEXT: s_mov_b32 s12, s6
1773 ; SI-NEXT: s_mov_b32 s13, s7
1774 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
1775 ; SI-NEXT: s_mov_b32 s2, s10
1776 ; SI-NEXT: s_mov_b32 s3, s11
1777 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
1778 ; SI-NEXT: s_mov_b32 s8, s4
1779 ; SI-NEXT: s_mov_b32 s9, s5
1780 ; SI-NEXT: s_waitcnt vmcnt(1)
1781 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
1782 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1783 ; SI-NEXT: s_waitcnt vmcnt(0)
1784 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
1785 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1786 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
1787 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1788 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, v2, v3
1789 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1790 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, v4, v1
1791 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1792 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1795 ; VI-LABEL: fcmp_v2f16_lg:
1796 ; VI: ; %bb.0: ; %entry
1797 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1798 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1799 ; VI-NEXT: s_mov_b32 s3, 0xf000
1800 ; VI-NEXT: s_mov_b32 s2, -1
1801 ; VI-NEXT: s_mov_b32 s10, s2
1802 ; VI-NEXT: s_mov_b32 s11, s3
1803 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1804 ; VI-NEXT: s_mov_b32 s12, s6
1805 ; VI-NEXT: s_mov_b32 s13, s7
1806 ; VI-NEXT: s_mov_b32 s14, s2
1807 ; VI-NEXT: s_mov_b32 s15, s3
1808 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1809 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
1810 ; VI-NEXT: s_mov_b32 s0, s4
1811 ; VI-NEXT: s_mov_b32 s1, s5
1812 ; VI-NEXT: s_waitcnt vmcnt(1)
1813 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1814 ; VI-NEXT: s_waitcnt vmcnt(0)
1815 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1816 ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v1, v0
1817 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1818 ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v3, v2
1819 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1820 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1823 ; GFX11-LABEL: fcmp_v2f16_lg:
1824 ; GFX11: ; %bb.0: ; %entry
1825 ; GFX11-NEXT: s_clause 0x1
1826 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1827 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1828 ; GFX11-NEXT: s_mov_b32 s10, -1
1829 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1830 ; GFX11-NEXT: s_mov_b32 s2, s10
1831 ; GFX11-NEXT: s_mov_b32 s3, s11
1832 ; GFX11-NEXT: s_mov_b32 s14, s10
1833 ; GFX11-NEXT: s_mov_b32 s15, s11
1834 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1835 ; GFX11-NEXT: s_mov_b32 s12, s6
1836 ; GFX11-NEXT: s_mov_b32 s13, s7
1837 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
1838 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
1839 ; GFX11-NEXT: s_mov_b32 s8, s4
1840 ; GFX11-NEXT: s_mov_b32 s9, s5
1841 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1842 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1843 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1844 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1845 ; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v1, v0
1846 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1847 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
1848 ; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2
1849 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1850 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
1851 ; GFX11-NEXT: s_nop 0
1852 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1853 ; GFX11-NEXT: s_endpgm
1854 ptr addrspace(1) %r,
1855 ptr addrspace(1) %a,
1856 ptr addrspace(1) %b) {
1858 %a.val = load <2 x half>, ptr addrspace(1) %a
1859 %b.val = load <2 x half>, ptr addrspace(1) %b
; "one" is the ordered not-equal predicate (true only when neither operand is
; NaN and the operands differ); the checks above expect the "lg" compare for it.
1860 %r.val = fcmp one <2 x half> %a.val, %b.val
1861 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
1862 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel fcmp_v2f16_ge loads a <2 x half> from %a and from %b, compares them
; lane-wise with `fcmp oge`, sign-extends the <2 x i1> result to <2 x i32>
; (a true lane is stored as -1, a false lane as 0) and stores it to %r.
; Per the expected output below, the SI run widens every half with
; v_cvt_f32_f16 and compares with v_cmp_ge_f32, while the VI and GFX11 runs
; compare with v_cmp_ge_f16, using a 16-bit right shift to reach the upper lane.
1867 define amdgpu_kernel void @fcmp_v2f16_ge(
1868 ; SI-LABEL: fcmp_v2f16_ge:
1869 ; SI: ; %bb.0: ; %entry
1870 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1871 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1872 ; SI-NEXT: s_mov_b32 s11, 0xf000
1873 ; SI-NEXT: s_mov_b32 s10, -1
1874 ; SI-NEXT: s_mov_b32 s14, s10
1875 ; SI-NEXT: s_mov_b32 s15, s11
1876 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1877 ; SI-NEXT: s_mov_b32 s12, s6
1878 ; SI-NEXT: s_mov_b32 s13, s7
1879 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
1880 ; SI-NEXT: s_mov_b32 s2, s10
1881 ; SI-NEXT: s_mov_b32 s3, s11
1882 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
1883 ; SI-NEXT: s_mov_b32 s8, s4
1884 ; SI-NEXT: s_mov_b32 s9, s5
1885 ; SI-NEXT: s_waitcnt vmcnt(1)
1886 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
1887 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1888 ; SI-NEXT: s_waitcnt vmcnt(0)
1889 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
1890 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1891 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
1892 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1893 ; SI-NEXT: v_cmp_ge_f32_e32 vcc, v2, v3
1894 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1895 ; SI-NEXT: v_cmp_ge_f32_e32 vcc, v4, v1
1896 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1897 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1900 ; VI-LABEL: fcmp_v2f16_ge:
1901 ; VI: ; %bb.0: ; %entry
1902 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1903 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1904 ; VI-NEXT: s_mov_b32 s3, 0xf000
1905 ; VI-NEXT: s_mov_b32 s2, -1
1906 ; VI-NEXT: s_mov_b32 s10, s2
1907 ; VI-NEXT: s_mov_b32 s11, s3
1908 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1909 ; VI-NEXT: s_mov_b32 s12, s6
1910 ; VI-NEXT: s_mov_b32 s13, s7
1911 ; VI-NEXT: s_mov_b32 s14, s2
1912 ; VI-NEXT: s_mov_b32 s15, s3
1913 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1914 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
1915 ; VI-NEXT: s_mov_b32 s0, s4
1916 ; VI-NEXT: s_mov_b32 s1, s5
1917 ; VI-NEXT: s_waitcnt vmcnt(1)
1918 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1919 ; VI-NEXT: s_waitcnt vmcnt(0)
1920 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1921 ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v1, v0
1922 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1923 ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v3, v2
1924 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
1925 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1928 ; GFX11-LABEL: fcmp_v2f16_ge:
1929 ; GFX11: ; %bb.0: ; %entry
1930 ; GFX11-NEXT: s_clause 0x1
1931 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1932 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1933 ; GFX11-NEXT: s_mov_b32 s10, -1
1934 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1935 ; GFX11-NEXT: s_mov_b32 s2, s10
1936 ; GFX11-NEXT: s_mov_b32 s3, s11
1937 ; GFX11-NEXT: s_mov_b32 s14, s10
1938 ; GFX11-NEXT: s_mov_b32 s15, s11
1939 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1940 ; GFX11-NEXT: s_mov_b32 s12, s6
1941 ; GFX11-NEXT: s_mov_b32 s13, s7
1942 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
1943 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
1944 ; GFX11-NEXT: s_mov_b32 s8, s4
1945 ; GFX11-NEXT: s_mov_b32 s9, s5
1946 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1947 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1948 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1949 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1950 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v1, v0
1951 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1952 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
1953 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2
1954 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1955 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
1956 ; GFX11-NEXT: s_nop 0
1957 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1958 ; GFX11-NEXT: s_endpgm
1959 ptr addrspace(1) %r,
1960 ptr addrspace(1) %a,
1961 ptr addrspace(1) %b) {
1963 %a.val = load <2 x half>, ptr addrspace(1) %a
1964 %b.val = load <2 x half>, ptr addrspace(1) %b
; "oge" is the ordered greater-than-or-equal predicate (false when either
; operand is NaN); the checks above expect the "ge" compare for it.
1965 %r.val = fcmp oge <2 x half> %a.val, %b.val
1966 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
1967 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel fcmp_v2f16_o loads a <2 x half> from %a and from %b, compares them
; lane-wise with `fcmp ord`, sign-extends the <2 x i1> result to <2 x i32>
; (a true lane is stored as -1, a false lane as 0) and stores it to %r.
; Per the expected output below, the SI run widens every half with
; v_cvt_f32_f16 and compares with v_cmp_o_f32, while the VI and GFX11 runs
; compare with v_cmp_o_f16, using a 16-bit right shift to reach the upper lane.
1972 define amdgpu_kernel void @fcmp_v2f16_o(
1973 ; SI-LABEL: fcmp_v2f16_o:
1974 ; SI: ; %bb.0: ; %entry
1975 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1976 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1977 ; SI-NEXT: s_mov_b32 s11, 0xf000
1978 ; SI-NEXT: s_mov_b32 s10, -1
1979 ; SI-NEXT: s_mov_b32 s14, s10
1980 ; SI-NEXT: s_mov_b32 s15, s11
1981 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1982 ; SI-NEXT: s_mov_b32 s12, s6
1983 ; SI-NEXT: s_mov_b32 s13, s7
1984 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
1985 ; SI-NEXT: s_mov_b32 s2, s10
1986 ; SI-NEXT: s_mov_b32 s3, s11
1987 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
1988 ; SI-NEXT: s_mov_b32 s8, s4
1989 ; SI-NEXT: s_mov_b32 s9, s5
1990 ; SI-NEXT: s_waitcnt vmcnt(1)
1991 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
1992 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1993 ; SI-NEXT: s_waitcnt vmcnt(0)
1994 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
1995 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1996 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
1997 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1998 ; SI-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
1999 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2000 ; SI-NEXT: v_cmp_o_f32_e32 vcc, v4, v1
2001 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2002 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2005 ; VI-LABEL: fcmp_v2f16_o:
2006 ; VI: ; %bb.0: ; %entry
2007 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2008 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2009 ; VI-NEXT: s_mov_b32 s3, 0xf000
2010 ; VI-NEXT: s_mov_b32 s2, -1
2011 ; VI-NEXT: s_mov_b32 s10, s2
2012 ; VI-NEXT: s_mov_b32 s11, s3
2013 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2014 ; VI-NEXT: s_mov_b32 s12, s6
2015 ; VI-NEXT: s_mov_b32 s13, s7
2016 ; VI-NEXT: s_mov_b32 s14, s2
2017 ; VI-NEXT: s_mov_b32 s15, s3
2018 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2019 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2020 ; VI-NEXT: s_mov_b32 s0, s4
2021 ; VI-NEXT: s_mov_b32 s1, s5
2022 ; VI-NEXT: s_waitcnt vmcnt(1)
2023 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2024 ; VI-NEXT: s_waitcnt vmcnt(0)
2025 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2026 ; VI-NEXT: v_cmp_o_f16_e32 vcc, v1, v0
2027 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2028 ; VI-NEXT: v_cmp_o_f16_e32 vcc, v3, v2
2029 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2030 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2033 ; GFX11-LABEL: fcmp_v2f16_o:
2034 ; GFX11: ; %bb.0: ; %entry
2035 ; GFX11-NEXT: s_clause 0x1
2036 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2037 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2038 ; GFX11-NEXT: s_mov_b32 s10, -1
2039 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
2040 ; GFX11-NEXT: s_mov_b32 s2, s10
2041 ; GFX11-NEXT: s_mov_b32 s3, s11
2042 ; GFX11-NEXT: s_mov_b32 s14, s10
2043 ; GFX11-NEXT: s_mov_b32 s15, s11
2044 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2045 ; GFX11-NEXT: s_mov_b32 s12, s6
2046 ; GFX11-NEXT: s_mov_b32 s13, s7
2047 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
2048 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2049 ; GFX11-NEXT: s_mov_b32 s8, s4
2050 ; GFX11-NEXT: s_mov_b32 s9, s5
2051 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2052 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2053 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2054 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2055 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v0
2056 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2057 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
2058 ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2
2059 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2060 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2061 ; GFX11-NEXT: s_nop 0
2062 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2063 ; GFX11-NEXT: s_endpgm
2064 ptr addrspace(1) %r,
2065 ptr addrspace(1) %a,
2066 ptr addrspace(1) %b) {
2068 %a.val = load <2 x half>, ptr addrspace(1) %a
2069 %b.val = load <2 x half>, ptr addrspace(1) %b
; "ord" is true only when neither operand is NaN; the checks above expect the
; "o" (ordered) compare for it.
2070 %r.val = fcmp ord <2 x half> %a.val, %b.val
2071 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2072 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel fcmp_v2f16_u loads a <2 x half> from %a and from %b, compares them
; lane-wise with `fcmp uno`, sign-extends the <2 x i1> result to <2 x i32>
; (a true lane is stored as -1, a false lane as 0) and stores it to %r.
; Per the expected output below, the SI run widens every half with
; v_cvt_f32_f16 and compares with v_cmp_u_f32, while the VI and GFX11 runs
; compare with v_cmp_u_f16, using a 16-bit right shift to reach the upper lane.
2077 define amdgpu_kernel void @fcmp_v2f16_u(
2078 ; SI-LABEL: fcmp_v2f16_u:
2079 ; SI: ; %bb.0: ; %entry
2080 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2081 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2082 ; SI-NEXT: s_mov_b32 s11, 0xf000
2083 ; SI-NEXT: s_mov_b32 s10, -1
2084 ; SI-NEXT: s_mov_b32 s14, s10
2085 ; SI-NEXT: s_mov_b32 s15, s11
2086 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2087 ; SI-NEXT: s_mov_b32 s12, s6
2088 ; SI-NEXT: s_mov_b32 s13, s7
2089 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2090 ; SI-NEXT: s_mov_b32 s2, s10
2091 ; SI-NEXT: s_mov_b32 s3, s11
2092 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
2093 ; SI-NEXT: s_mov_b32 s8, s4
2094 ; SI-NEXT: s_mov_b32 s9, s5
2095 ; SI-NEXT: s_waitcnt vmcnt(1)
2096 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2097 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2098 ; SI-NEXT: s_waitcnt vmcnt(0)
2099 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2100 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2101 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2102 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2103 ; SI-NEXT: v_cmp_u_f32_e32 vcc, v2, v3
2104 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2105 ; SI-NEXT: v_cmp_u_f32_e32 vcc, v4, v1
2106 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2107 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2110 ; VI-LABEL: fcmp_v2f16_u:
2111 ; VI: ; %bb.0: ; %entry
2112 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2113 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2114 ; VI-NEXT: s_mov_b32 s3, 0xf000
2115 ; VI-NEXT: s_mov_b32 s2, -1
2116 ; VI-NEXT: s_mov_b32 s10, s2
2117 ; VI-NEXT: s_mov_b32 s11, s3
2118 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2119 ; VI-NEXT: s_mov_b32 s12, s6
2120 ; VI-NEXT: s_mov_b32 s13, s7
2121 ; VI-NEXT: s_mov_b32 s14, s2
2122 ; VI-NEXT: s_mov_b32 s15, s3
2123 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2124 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2125 ; VI-NEXT: s_mov_b32 s0, s4
2126 ; VI-NEXT: s_mov_b32 s1, s5
2127 ; VI-NEXT: s_waitcnt vmcnt(1)
2128 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2129 ; VI-NEXT: s_waitcnt vmcnt(0)
2130 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2131 ; VI-NEXT: v_cmp_u_f16_e32 vcc, v1, v0
2132 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2133 ; VI-NEXT: v_cmp_u_f16_e32 vcc, v3, v2
2134 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2135 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2138 ; GFX11-LABEL: fcmp_v2f16_u:
2139 ; GFX11: ; %bb.0: ; %entry
2140 ; GFX11-NEXT: s_clause 0x1
2141 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2142 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2143 ; GFX11-NEXT: s_mov_b32 s10, -1
2144 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
2145 ; GFX11-NEXT: s_mov_b32 s2, s10
2146 ; GFX11-NEXT: s_mov_b32 s3, s11
2147 ; GFX11-NEXT: s_mov_b32 s14, s10
2148 ; GFX11-NEXT: s_mov_b32 s15, s11
2149 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2150 ; GFX11-NEXT: s_mov_b32 s12, s6
2151 ; GFX11-NEXT: s_mov_b32 s13, s7
2152 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
2153 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2154 ; GFX11-NEXT: s_mov_b32 s8, s4
2155 ; GFX11-NEXT: s_mov_b32 s9, s5
2156 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2157 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2158 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2159 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2160 ; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v0
2161 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2162 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
2163 ; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2
2164 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2165 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2166 ; GFX11-NEXT: s_nop 0
2167 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2168 ; GFX11-NEXT: s_endpgm
2169 ptr addrspace(1) %r,
2170 ptr addrspace(1) %a,
2171 ptr addrspace(1) %b) {
2173 %a.val = load <2 x half>, ptr addrspace(1) %a
2174 %b.val = load <2 x half>, ptr addrspace(1) %b
; "uno" is true when at least one operand is NaN; the checks above expect the
; "u" (unordered) compare for it.
2175 %r.val = fcmp uno <2 x half> %a.val, %b.val
2176 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2177 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel fcmp_v2f16_nge loads a <2 x half> from %a and from %b, compares them
; lane-wise with `fcmp ult`, sign-extends the <2 x i1> result to <2 x i32>
; (a true lane is stored as -1, a false lane as 0) and stores it to %r.
; Per the expected output below, the SI run widens every half with
; v_cvt_f32_f16 and compares with v_cmp_nge_f32, while the VI and GFX11 runs
; compare with v_cmp_nge_f16, using a 16-bit right shift to reach the upper lane.
2181 define amdgpu_kernel void @fcmp_v2f16_nge(
2182 ; SI-LABEL: fcmp_v2f16_nge:
2183 ; SI: ; %bb.0: ; %entry
2184 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2185 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2186 ; SI-NEXT: s_mov_b32 s11, 0xf000
2187 ; SI-NEXT: s_mov_b32 s10, -1
2188 ; SI-NEXT: s_mov_b32 s14, s10
2189 ; SI-NEXT: s_mov_b32 s15, s11
2190 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2191 ; SI-NEXT: s_mov_b32 s12, s6
2192 ; SI-NEXT: s_mov_b32 s13, s7
2193 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2194 ; SI-NEXT: s_mov_b32 s2, s10
2195 ; SI-NEXT: s_mov_b32 s3, s11
2196 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
2197 ; SI-NEXT: s_mov_b32 s8, s4
2198 ; SI-NEXT: s_mov_b32 s9, s5
2199 ; SI-NEXT: s_waitcnt vmcnt(1)
2200 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2201 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2202 ; SI-NEXT: s_waitcnt vmcnt(0)
2203 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2204 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2205 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2206 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2207 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, v2, v3
2208 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2209 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, v4, v1
2210 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2211 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2214 ; VI-LABEL: fcmp_v2f16_nge:
2215 ; VI: ; %bb.0: ; %entry
2216 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2217 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2218 ; VI-NEXT: s_mov_b32 s3, 0xf000
2219 ; VI-NEXT: s_mov_b32 s2, -1
2220 ; VI-NEXT: s_mov_b32 s10, s2
2221 ; VI-NEXT: s_mov_b32 s11, s3
2222 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2223 ; VI-NEXT: s_mov_b32 s12, s6
2224 ; VI-NEXT: s_mov_b32 s13, s7
2225 ; VI-NEXT: s_mov_b32 s14, s2
2226 ; VI-NEXT: s_mov_b32 s15, s3
2227 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2228 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2229 ; VI-NEXT: s_mov_b32 s0, s4
2230 ; VI-NEXT: s_mov_b32 s1, s5
2231 ; VI-NEXT: s_waitcnt vmcnt(1)
2232 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2233 ; VI-NEXT: s_waitcnt vmcnt(0)
2234 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2235 ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v1, v0
2236 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2237 ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v3, v2
2238 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2239 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2242 ; GFX11-LABEL: fcmp_v2f16_nge:
2243 ; GFX11: ; %bb.0: ; %entry
2244 ; GFX11-NEXT: s_clause 0x1
2245 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2246 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2247 ; GFX11-NEXT: s_mov_b32 s10, -1
2248 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
2249 ; GFX11-NEXT: s_mov_b32 s2, s10
2250 ; GFX11-NEXT: s_mov_b32 s3, s11
2251 ; GFX11-NEXT: s_mov_b32 s14, s10
2252 ; GFX11-NEXT: s_mov_b32 s15, s11
2253 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2254 ; GFX11-NEXT: s_mov_b32 s12, s6
2255 ; GFX11-NEXT: s_mov_b32 s13, s7
2256 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
2257 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2258 ; GFX11-NEXT: s_mov_b32 s8, s4
2259 ; GFX11-NEXT: s_mov_b32 s9, s5
2260 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2261 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2262 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2263 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2264 ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v1, v0
2265 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2266 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
2267 ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2
2268 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2269 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2270 ; GFX11-NEXT: s_nop 0
2271 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2272 ; GFX11-NEXT: s_endpgm
2273 ptr addrspace(1) %r,
2274 ptr addrspace(1) %a,
2275 ptr addrspace(1) %b) {
2277 %a.val = load <2 x half>, ptr addrspace(1) %a
2278 %b.val = load <2 x half>, ptr addrspace(1) %b
; "ult" (unordered or less than) is the logical negation of an ordered
; greater-than-or-equal, which is why the checks above expect the "nge" compare.
2279 %r.val = fcmp ult <2 x half> %a.val, %b.val
2280 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2281 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel fcmp_v2f16_nlg loads a <2 x half> from %a and from %b, compares them
; lane-wise with `fcmp ueq`, sign-extends the <2 x i1> result to <2 x i32>
; (a true lane is stored as -1, a false lane as 0) and stores it to %r.
; Per the expected output below, the SI run widens every half with
; v_cvt_f32_f16 and compares with v_cmp_nlg_f32, while the VI and GFX11 runs
; compare with v_cmp_nlg_f16, using a 16-bit right shift to reach the upper lane.
2285 define amdgpu_kernel void @fcmp_v2f16_nlg(
2286 ; SI-LABEL: fcmp_v2f16_nlg:
2287 ; SI: ; %bb.0: ; %entry
2288 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2289 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2290 ; SI-NEXT: s_mov_b32 s11, 0xf000
2291 ; SI-NEXT: s_mov_b32 s10, -1
2292 ; SI-NEXT: s_mov_b32 s14, s10
2293 ; SI-NEXT: s_mov_b32 s15, s11
2294 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2295 ; SI-NEXT: s_mov_b32 s12, s6
2296 ; SI-NEXT: s_mov_b32 s13, s7
2297 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2298 ; SI-NEXT: s_mov_b32 s2, s10
2299 ; SI-NEXT: s_mov_b32 s3, s11
2300 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
2301 ; SI-NEXT: s_mov_b32 s8, s4
2302 ; SI-NEXT: s_mov_b32 s9, s5
2303 ; SI-NEXT: s_waitcnt vmcnt(1)
2304 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2305 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2306 ; SI-NEXT: s_waitcnt vmcnt(0)
2307 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2308 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2309 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2310 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2311 ; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3
2312 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2313 ; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v1
2314 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2315 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2318 ; VI-LABEL: fcmp_v2f16_nlg:
2319 ; VI: ; %bb.0: ; %entry
2320 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2321 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2322 ; VI-NEXT: s_mov_b32 s3, 0xf000
2323 ; VI-NEXT: s_mov_b32 s2, -1
2324 ; VI-NEXT: s_mov_b32 s10, s2
2325 ; VI-NEXT: s_mov_b32 s11, s3
2326 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2327 ; VI-NEXT: s_mov_b32 s12, s6
2328 ; VI-NEXT: s_mov_b32 s13, s7
2329 ; VI-NEXT: s_mov_b32 s14, s2
2330 ; VI-NEXT: s_mov_b32 s15, s3
2331 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2332 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2333 ; VI-NEXT: s_mov_b32 s0, s4
2334 ; VI-NEXT: s_mov_b32 s1, s5
2335 ; VI-NEXT: s_waitcnt vmcnt(1)
2336 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2337 ; VI-NEXT: s_waitcnt vmcnt(0)
2338 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2339 ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v1, v0
2340 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2341 ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v3, v2
2342 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2343 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2346 ; GFX11-LABEL: fcmp_v2f16_nlg:
2347 ; GFX11: ; %bb.0: ; %entry
2348 ; GFX11-NEXT: s_clause 0x1
2349 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2350 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2351 ; GFX11-NEXT: s_mov_b32 s10, -1
2352 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
2353 ; GFX11-NEXT: s_mov_b32 s2, s10
2354 ; GFX11-NEXT: s_mov_b32 s3, s11
2355 ; GFX11-NEXT: s_mov_b32 s14, s10
2356 ; GFX11-NEXT: s_mov_b32 s15, s11
2357 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2358 ; GFX11-NEXT: s_mov_b32 s12, s6
2359 ; GFX11-NEXT: s_mov_b32 s13, s7
2360 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
2361 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2362 ; GFX11-NEXT: s_mov_b32 s8, s4
2363 ; GFX11-NEXT: s_mov_b32 s9, s5
2364 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2365 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2366 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2367 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2368 ; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v1, v0
2369 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2370 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
2371 ; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2
2372 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2373 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2374 ; GFX11-NEXT: s_nop 0
2375 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2376 ; GFX11-NEXT: s_endpgm
2377 ptr addrspace(1) %r,
2378 ptr addrspace(1) %a,
2379 ptr addrspace(1) %b) {
2381 %a.val = load <2 x half>, ptr addrspace(1) %a
2382 %b.val = load <2 x half>, ptr addrspace(1) %b
; "ueq" (unordered or equal) is the logical negation of an ordered not-equal,
; which is why the checks above expect the "nlg" compare.
2383 %r.val = fcmp ueq <2 x half> %a.val, %b.val
2384 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2385 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel fcmp_v2f16_ngt loads a <2 x half> from %a and from %b, compares them
; lane-wise with `fcmp ule`, sign-extends the <2 x i1> result to <2 x i32>
; (a true lane is stored as -1, a false lane as 0) and stores it to %r.
; Per the expected output below, the SI run widens every half with
; v_cvt_f32_f16 and compares with v_cmp_ngt_f32, while the VI and GFX11 runs
; compare with v_cmp_ngt_f16, using a 16-bit right shift to reach the upper lane.
2390 define amdgpu_kernel void @fcmp_v2f16_ngt(
2391 ; SI-LABEL: fcmp_v2f16_ngt:
2392 ; SI: ; %bb.0: ; %entry
2393 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2394 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2395 ; SI-NEXT: s_mov_b32 s11, 0xf000
2396 ; SI-NEXT: s_mov_b32 s10, -1
2397 ; SI-NEXT: s_mov_b32 s14, s10
2398 ; SI-NEXT: s_mov_b32 s15, s11
2399 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2400 ; SI-NEXT: s_mov_b32 s12, s6
2401 ; SI-NEXT: s_mov_b32 s13, s7
2402 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2403 ; SI-NEXT: s_mov_b32 s2, s10
2404 ; SI-NEXT: s_mov_b32 s3, s11
2405 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
2406 ; SI-NEXT: s_mov_b32 s8, s4
2407 ; SI-NEXT: s_mov_b32 s9, s5
2408 ; SI-NEXT: s_waitcnt vmcnt(1)
2409 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2410 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2411 ; SI-NEXT: s_waitcnt vmcnt(0)
2412 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2413 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2414 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2415 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2416 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v3
2417 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2418 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v1
2419 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2420 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2423 ; VI-LABEL: fcmp_v2f16_ngt:
2424 ; VI: ; %bb.0: ; %entry
2425 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2426 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2427 ; VI-NEXT: s_mov_b32 s3, 0xf000
2428 ; VI-NEXT: s_mov_b32 s2, -1
2429 ; VI-NEXT: s_mov_b32 s10, s2
2430 ; VI-NEXT: s_mov_b32 s11, s3
2431 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2432 ; VI-NEXT: s_mov_b32 s12, s6
2433 ; VI-NEXT: s_mov_b32 s13, s7
2434 ; VI-NEXT: s_mov_b32 s14, s2
2435 ; VI-NEXT: s_mov_b32 s15, s3
2436 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2437 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2438 ; VI-NEXT: s_mov_b32 s0, s4
2439 ; VI-NEXT: s_mov_b32 s1, s5
2440 ; VI-NEXT: s_waitcnt vmcnt(1)
2441 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2442 ; VI-NEXT: s_waitcnt vmcnt(0)
2443 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2444 ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v0
2445 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2446 ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2
2447 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2448 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2451 ; GFX11-LABEL: fcmp_v2f16_ngt:
2452 ; GFX11: ; %bb.0: ; %entry
2453 ; GFX11-NEXT: s_clause 0x1
2454 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2455 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2456 ; GFX11-NEXT: s_mov_b32 s10, -1
2457 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
2458 ; GFX11-NEXT: s_mov_b32 s2, s10
2459 ; GFX11-NEXT: s_mov_b32 s3, s11
2460 ; GFX11-NEXT: s_mov_b32 s14, s10
2461 ; GFX11-NEXT: s_mov_b32 s15, s11
2462 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2463 ; GFX11-NEXT: s_mov_b32 s12, s6
2464 ; GFX11-NEXT: s_mov_b32 s13, s7
2465 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
2466 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2467 ; GFX11-NEXT: s_mov_b32 s8, s4
2468 ; GFX11-NEXT: s_mov_b32 s9, s5
2469 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2470 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2471 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2472 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2473 ; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v0
2474 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2475 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
2476 ; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
2477 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2478 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2479 ; GFX11-NEXT: s_nop 0
2480 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2481 ; GFX11-NEXT: s_endpgm
2482 ptr addrspace(1) %r,
2483 ptr addrspace(1) %a,
2484 ptr addrspace(1) %b) {
2486 %a.val = load <2 x half>, ptr addrspace(1) %a
2487 %b.val = load <2 x half>, ptr addrspace(1) %b
; "ule" (unordered or less than or equal) is the logical negation of an ordered
; greater-than, which is why the checks above expect the "ngt" compare.
2488 %r.val = fcmp ule <2 x half> %a.val, %b.val
2489 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2490 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel fcmp_v2f16_nle loads a <2 x half> from %a and from %b, compares them
; lane-wise with `fcmp ugt`, sign-extends the <2 x i1> result to <2 x i32>
; (a true lane is stored as -1, a false lane as 0) and stores it to %r.
; Per the expected output below, the SI run widens every half with
; v_cvt_f32_f16 and compares with v_cmp_nle_f32, while the VI and GFX11 runs
; compare with v_cmp_nle_f16, using a 16-bit right shift to reach the upper lane.
2494 define amdgpu_kernel void @fcmp_v2f16_nle(
2495 ; SI-LABEL: fcmp_v2f16_nle:
2496 ; SI: ; %bb.0: ; %entry
2497 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2498 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2499 ; SI-NEXT: s_mov_b32 s11, 0xf000
2500 ; SI-NEXT: s_mov_b32 s10, -1
2501 ; SI-NEXT: s_mov_b32 s14, s10
2502 ; SI-NEXT: s_mov_b32 s15, s11
2503 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2504 ; SI-NEXT: s_mov_b32 s12, s6
2505 ; SI-NEXT: s_mov_b32 s13, s7
2506 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2507 ; SI-NEXT: s_mov_b32 s2, s10
2508 ; SI-NEXT: s_mov_b32 s3, s11
2509 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
2510 ; SI-NEXT: s_mov_b32 s8, s4
2511 ; SI-NEXT: s_mov_b32 s9, s5
2512 ; SI-NEXT: s_waitcnt vmcnt(1)
2513 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2514 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2515 ; SI-NEXT: s_waitcnt vmcnt(0)
2516 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2517 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2518 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2519 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2520 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, v2, v3
2521 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2522 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, v4, v1
2523 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2524 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2527 ; VI-LABEL: fcmp_v2f16_nle:
2528 ; VI: ; %bb.0: ; %entry
2529 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2530 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2531 ; VI-NEXT: s_mov_b32 s3, 0xf000
2532 ; VI-NEXT: s_mov_b32 s2, -1
2533 ; VI-NEXT: s_mov_b32 s10, s2
2534 ; VI-NEXT: s_mov_b32 s11, s3
2535 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2536 ; VI-NEXT: s_mov_b32 s12, s6
2537 ; VI-NEXT: s_mov_b32 s13, s7
2538 ; VI-NEXT: s_mov_b32 s14, s2
2539 ; VI-NEXT: s_mov_b32 s15, s3
2540 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2541 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2542 ; VI-NEXT: s_mov_b32 s0, s4
2543 ; VI-NEXT: s_mov_b32 s1, s5
2544 ; VI-NEXT: s_waitcnt vmcnt(1)
2545 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2546 ; VI-NEXT: s_waitcnt vmcnt(0)
2547 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2548 ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v0
2549 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2550 ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2
2551 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2552 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2555 ; GFX11-LABEL: fcmp_v2f16_nle:
2556 ; GFX11: ; %bb.0: ; %entry
2557 ; GFX11-NEXT: s_clause 0x1
2558 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2559 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2560 ; GFX11-NEXT: s_mov_b32 s10, -1
2561 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
2562 ; GFX11-NEXT: s_mov_b32 s2, s10
2563 ; GFX11-NEXT: s_mov_b32 s3, s11
2564 ; GFX11-NEXT: s_mov_b32 s14, s10
2565 ; GFX11-NEXT: s_mov_b32 s15, s11
2566 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2567 ; GFX11-NEXT: s_mov_b32 s12, s6
2568 ; GFX11-NEXT: s_mov_b32 s13, s7
2569 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
2570 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2571 ; GFX11-NEXT: s_mov_b32 s8, s4
2572 ; GFX11-NEXT: s_mov_b32 s9, s5
2573 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2574 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2575 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2576 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2577 ; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v0
2578 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2579 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
2580 ; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
2581 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2582 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2583 ; GFX11-NEXT: s_nop 0
2584 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2585 ; GFX11-NEXT: s_endpgm
2586 ptr addrspace(1) %r,
2587 ptr addrspace(1) %a,
2588 ptr addrspace(1) %b) {
2590 %a.val = load <2 x half>, ptr addrspace(1) %a
2591 %b.val = load <2 x half>, ptr addrspace(1) %b
; "ugt" (unordered or greater than) is the logical negation of an ordered
; less-than-or-equal, which is why the checks above expect the "nle" compare.
2592 %r.val = fcmp ugt <2 x half> %a.val, %b.val
2593 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2594 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel under test for an "unordered or not equal" (une) comparison of two
; <2 x half> vectors. It loads one vector from %a and one from %b, compares
; them lane by lane, sign-extends the <2 x i1> result to <2 x i32> and stores
; it to %r.
; The SI checks expect each lane to be converted to f32 (v_cvt_f32_f16) and
; compared with v_cmp_neq_f32, while the VI and GFX11 checks expect a direct
; v_cmp_neq_f16 per lane. In all three, the high lane is obtained with a
; 16-bit right shift of the loaded dword.
2598 define amdgpu_kernel void @fcmp_v2f16_neq(
2599 ; SI-LABEL: fcmp_v2f16_neq:
2600 ; SI: ; %bb.0: ; %entry
2601 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2602 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2603 ; SI-NEXT: s_mov_b32 s11, 0xf000
2604 ; SI-NEXT: s_mov_b32 s10, -1
2605 ; SI-NEXT: s_mov_b32 s14, s10
2606 ; SI-NEXT: s_mov_b32 s15, s11
2607 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2608 ; SI-NEXT: s_mov_b32 s12, s6
2609 ; SI-NEXT: s_mov_b32 s13, s7
2610 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2611 ; SI-NEXT: s_mov_b32 s2, s10
2612 ; SI-NEXT: s_mov_b32 s3, s11
2613 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
2614 ; SI-NEXT: s_mov_b32 s8, s4
2615 ; SI-NEXT: s_mov_b32 s9, s5
2616 ; SI-NEXT: s_waitcnt vmcnt(1)
2617 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2618 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2619 ; SI-NEXT: s_waitcnt vmcnt(0)
2620 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2621 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2622 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2623 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2624 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, v2, v3
2625 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2626 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, v4, v1
2627 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2628 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2631 ; VI-LABEL: fcmp_v2f16_neq:
2632 ; VI: ; %bb.0: ; %entry
2633 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2634 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2635 ; VI-NEXT: s_mov_b32 s3, 0xf000
2636 ; VI-NEXT: s_mov_b32 s2, -1
2637 ; VI-NEXT: s_mov_b32 s10, s2
2638 ; VI-NEXT: s_mov_b32 s11, s3
2639 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2640 ; VI-NEXT: s_mov_b32 s12, s6
2641 ; VI-NEXT: s_mov_b32 s13, s7
2642 ; VI-NEXT: s_mov_b32 s14, s2
2643 ; VI-NEXT: s_mov_b32 s15, s3
2644 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2645 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2646 ; VI-NEXT: s_mov_b32 s0, s4
2647 ; VI-NEXT: s_mov_b32 s1, s5
2648 ; VI-NEXT: s_waitcnt vmcnt(1)
2649 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2650 ; VI-NEXT: s_waitcnt vmcnt(0)
2651 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2652 ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v1, v0
2653 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2654 ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v3, v2
2655 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2656 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2659 ; GFX11-LABEL: fcmp_v2f16_neq:
2660 ; GFX11: ; %bb.0: ; %entry
2661 ; GFX11-NEXT: s_clause 0x1
2662 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2663 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2664 ; GFX11-NEXT: s_mov_b32 s10, -1
2665 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
2666 ; GFX11-NEXT: s_mov_b32 s2, s10
2667 ; GFX11-NEXT: s_mov_b32 s3, s11
2668 ; GFX11-NEXT: s_mov_b32 s14, s10
2669 ; GFX11-NEXT: s_mov_b32 s15, s11
2670 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2671 ; GFX11-NEXT: s_mov_b32 s12, s6
2672 ; GFX11-NEXT: s_mov_b32 s13, s7
2673 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
2674 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2675 ; GFX11-NEXT: s_mov_b32 s8, s4
2676 ; GFX11-NEXT: s_mov_b32 s9, s5
2677 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2678 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2679 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2680 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2681 ; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v1, v0
2682 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2683 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
2684 ; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2
2685 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2686 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2687 ; GFX11-NEXT: s_nop 0
2688 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2689 ; GFX11-NEXT: s_endpgm
2690 ptr addrspace(1) %r,
2691 ptr addrspace(1) %a,
2692 ptr addrspace(1) %b) {
2694 %a.val = load <2 x half>, ptr addrspace(1) %a
2695 %b.val = load <2 x half>, ptr addrspace(1) %b
; une is true for a lane when either operand is a NaN or the operands differ.
2696 %r.val = fcmp une <2 x half> %a.val, %b.val
; sext turns each i1 lane into 0 or -1, matching the 0 / -1 selects expected above.
2697 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2698 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel under test for an "unordered or greater than or equal" (uge)
; comparison of two <2 x half> vectors, which the checks below expect to be
; selected as a "not less than" (nlt) compare. It loads one vector from %a and
; one from %b, compares them lane by lane, sign-extends the <2 x i1> result to
; <2 x i32> and stores it to %r.
; The SI checks expect each lane to be converted to f32 (v_cvt_f32_f16) and
; compared with v_cmp_nlt_f32, while the VI and GFX11 checks expect a direct
; v_cmp_nlt_f16 per lane. In all three, the high lane is obtained with a
; 16-bit right shift of the loaded dword.
2702 define amdgpu_kernel void @fcmp_v2f16_nlt(
2703 ; SI-LABEL: fcmp_v2f16_nlt:
2704 ; SI: ; %bb.0: ; %entry
2705 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2706 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2707 ; SI-NEXT: s_mov_b32 s11, 0xf000
2708 ; SI-NEXT: s_mov_b32 s10, -1
2709 ; SI-NEXT: s_mov_b32 s14, s10
2710 ; SI-NEXT: s_mov_b32 s15, s11
2711 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2712 ; SI-NEXT: s_mov_b32 s12, s6
2713 ; SI-NEXT: s_mov_b32 s13, s7
2714 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2715 ; SI-NEXT: s_mov_b32 s2, s10
2716 ; SI-NEXT: s_mov_b32 s3, s11
2717 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
2718 ; SI-NEXT: s_mov_b32 s8, s4
2719 ; SI-NEXT: s_mov_b32 s9, s5
2720 ; SI-NEXT: s_waitcnt vmcnt(1)
2721 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2722 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2723 ; SI-NEXT: s_waitcnt vmcnt(0)
2724 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2725 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2726 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2727 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2728 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v3
2729 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2730 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1
2731 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2732 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2735 ; VI-LABEL: fcmp_v2f16_nlt:
2736 ; VI: ; %bb.0: ; %entry
2737 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2738 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2739 ; VI-NEXT: s_mov_b32 s3, 0xf000
2740 ; VI-NEXT: s_mov_b32 s2, -1
2741 ; VI-NEXT: s_mov_b32 s10, s2
2742 ; VI-NEXT: s_mov_b32 s11, s3
2743 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2744 ; VI-NEXT: s_mov_b32 s12, s6
2745 ; VI-NEXT: s_mov_b32 s13, s7
2746 ; VI-NEXT: s_mov_b32 s14, s2
2747 ; VI-NEXT: s_mov_b32 s15, s3
2748 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2749 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2750 ; VI-NEXT: s_mov_b32 s0, s4
2751 ; VI-NEXT: s_mov_b32 s1, s5
2752 ; VI-NEXT: s_waitcnt vmcnt(1)
2753 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2754 ; VI-NEXT: s_waitcnt vmcnt(0)
2755 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2756 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0
2757 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2758 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2
2759 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2760 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2763 ; GFX11-LABEL: fcmp_v2f16_nlt:
2764 ; GFX11: ; %bb.0: ; %entry
2765 ; GFX11-NEXT: s_clause 0x1
2766 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2767 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2768 ; GFX11-NEXT: s_mov_b32 s10, -1
2769 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
2770 ; GFX11-NEXT: s_mov_b32 s2, s10
2771 ; GFX11-NEXT: s_mov_b32 s3, s11
2772 ; GFX11-NEXT: s_mov_b32 s14, s10
2773 ; GFX11-NEXT: s_mov_b32 s15, s11
2774 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2775 ; GFX11-NEXT: s_mov_b32 s12, s6
2776 ; GFX11-NEXT: s_mov_b32 s13, s7
2777 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
2778 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2779 ; GFX11-NEXT: s_mov_b32 s8, s4
2780 ; GFX11-NEXT: s_mov_b32 s9, s5
2781 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2782 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2783 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2784 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2785 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0
2786 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2787 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
2788 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2
2789 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2790 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2791 ; GFX11-NEXT: s_nop 0
2792 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2793 ; GFX11-NEXT: s_endpgm
2794 ptr addrspace(1) %r,
2795 ptr addrspace(1) %a,
2796 ptr addrspace(1) %b) {
2798 %a.val = load <2 x half>, ptr addrspace(1) %a
2799 %b.val = load <2 x half>, ptr addrspace(1) %b
; uge is true for a lane when either operand is a NaN or %a >= %b.
2800 %r.val = fcmp uge <2 x half> %a.val, %b.val
; sext turns each i1 lane into 0 or -1, matching the 0 / -1 selects expected above.
2801 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2802 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Declaration of the half-precision fabs intrinsic. None of the kernels
; visible in this part of the file call it; presumably it is used by tests
; earlier in the file (confirm before removing).
2806 declare half @llvm.fabs.f16(half) #1
; Attribute groups. #1 is the group attached to the intrinsic declaration
; above; the kernels visible in this part of the file do not reference #0.
2808 attributes #0 = { nounwind }
2809 attributes #1 = { nounwind readnone }