1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11-TRUE16 %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11-FAKE16 %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX12 %s
; fcmp_f16_lt - kernel that volatile-loads one half from %a and one from %b,
; evaluates the ordered less-than compare (fcmp olt), sign-extends the i1
; result to i32 and stores it through %r.
; The check lines in this function are autogenerated (see the note on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the checks show: the SI run widens both operands with v_cvt_f32_f16 and
; compares them as f32; every other run compares directly as f16, and the
; true16 run first moves the second operand into v0.h so that the compare reads
; v0.l and v0.h.
; NOTE(review): the declarations of %r and %a are not visible in this excerpt -
; presumably they are the leading kernel arguments; confirm against the full file.
8 define amdgpu_kernel void @fcmp_f16_lt(
9 ; SI-LABEL: fcmp_f16_lt:
10 ; SI: ; %bb.0: ; %entry
11 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
12 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
13 ; SI-NEXT: s_mov_b32 s11, 0xf000
14 ; SI-NEXT: s_mov_b32 s10, -1
15 ; SI-NEXT: s_mov_b32 s14, s10
16 ; SI-NEXT: s_mov_b32 s15, s11
17 ; SI-NEXT: s_mov_b32 s6, s10
18 ; SI-NEXT: s_mov_b32 s7, s11
19 ; SI-NEXT: s_waitcnt lgkmcnt(0)
20 ; SI-NEXT: s_mov_b32 s12, s2
21 ; SI-NEXT: s_mov_b32 s13, s3
22 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
23 ; SI-NEXT: s_waitcnt vmcnt(0)
24 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
25 ; SI-NEXT: s_waitcnt vmcnt(0)
26 ; SI-NEXT: s_mov_b32 s8, s0
27 ; SI-NEXT: s_mov_b32 s9, s1
28 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
29 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
30 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
31 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
32 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
35 ; VI-LABEL: fcmp_f16_lt:
36 ; VI: ; %bb.0: ; %entry
37 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
38 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
39 ; VI-NEXT: s_mov_b32 s7, 0xf000
40 ; VI-NEXT: s_mov_b32 s6, -1
41 ; VI-NEXT: s_mov_b32 s14, s6
42 ; VI-NEXT: s_waitcnt lgkmcnt(0)
43 ; VI-NEXT: s_mov_b32 s12, s2
44 ; VI-NEXT: s_mov_b32 s13, s3
45 ; VI-NEXT: s_mov_b32 s15, s7
46 ; VI-NEXT: s_mov_b32 s10, s6
47 ; VI-NEXT: s_mov_b32 s11, s7
48 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
49 ; VI-NEXT: s_waitcnt vmcnt(0)
50 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
51 ; VI-NEXT: s_waitcnt vmcnt(0)
52 ; VI-NEXT: s_mov_b32 s4, s0
53 ; VI-NEXT: s_mov_b32 s5, s1
54 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
55 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
56 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
59 ; GFX11-TRUE16-LABEL: fcmp_f16_lt:
60 ; GFX11-TRUE16: ; %bb.0: ; %entry
61 ; GFX11-TRUE16-NEXT: s_clause 0x1
62 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
63 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
64 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
65 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
66 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
67 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
68 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
69 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
70 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
71 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
72 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
73 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
74 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
75 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
76 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
77 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
78 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
79 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
80 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
81 ; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v0.h
82 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
83 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
84 ; GFX11-TRUE16-NEXT: s_endpgm
86 ; GFX11-FAKE16-LABEL: fcmp_f16_lt:
87 ; GFX11-FAKE16: ; %bb.0: ; %entry
88 ; GFX11-FAKE16-NEXT: s_clause 0x1
89 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
90 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
91 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
92 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
93 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
94 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
95 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
96 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
97 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
98 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
99 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
100 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
101 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
102 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
103 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
104 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
105 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
106 ; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
107 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
108 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
109 ; GFX11-FAKE16-NEXT: s_endpgm
111 ; GFX12-LABEL: fcmp_f16_lt:
112 ; GFX12: ; %bb.0: ; %entry
113 ; GFX12-NEXT: s_clause 0x1
114 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
115 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
116 ; GFX12-NEXT: s_mov_b32 s10, -1
117 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
118 ; GFX12-NEXT: s_mov_b32 s14, s10
119 ; GFX12-NEXT: s_mov_b32 s15, s11
120 ; GFX12-NEXT: s_mov_b32 s6, s10
121 ; GFX12-NEXT: s_mov_b32 s7, s11
122 ; GFX12-NEXT: s_wait_kmcnt 0x0
123 ; GFX12-NEXT: s_mov_b32 s12, s2
124 ; GFX12-NEXT: s_mov_b32 s13, s3
125 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
126 ; GFX12-NEXT: s_wait_loadcnt 0x0
127 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
128 ; GFX12-NEXT: s_wait_loadcnt 0x0
129 ; GFX12-NEXT: s_mov_b32 s8, s0
130 ; GFX12-NEXT: s_mov_b32 s9, s1
131 ; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
132 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
133 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
134 ; GFX12-NEXT: s_endpgm
137 ptr addrspace(1) %b) {
; Both operand loads are volatile; in the checks above each one appears as its
; own 16-bit buffer load followed by a wait on the load counter.
139 %a.val = load volatile half, ptr addrspace(1) %a
140 %b.val = load volatile half, ptr addrspace(1) %b
141 %r.val = fcmp olt half %a.val, %b.val
; sext turns a true i1 into -1 (all ones); the checks select between 0 and -1
; with v_cndmask_b32 accordingly.
142 %r.val.sext = sext i1 %r.val to i32
143 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_lt_abs - same shape as the plain less-than kernel, but both loaded
; half values go through @llvm.fabs.f16 before the ordered less-than compare
; (fcmp olt). The i1 result is sign-extended to i32 and stored through %r.
; The check lines in this function are autogenerated (see the note on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the checks show: the SI run applies the |..| modifier on the operands of
; v_cvt_f32_f16_e64 and then compares as f32; the VI and both GFX11 runs apply
; |..| directly on the operands of v_cmp_lt_f16_e64 and write the result to an
; SGPR (s[0:1] or s2) instead of vcc.
; NOTE(review): the GFX12 checks differ from that pattern - they clear the sign
; bit of each operand with v_and_b32 0x7fff and then use the e32 compare into
; vcc_lo. Confirm this is current llc output by regenerating the checks.
; NOTE(review): the declarations of %r and %a are not visible in this excerpt -
; presumably they are the leading kernel arguments; confirm against the full file.
147 define amdgpu_kernel void @fcmp_f16_lt_abs(
148 ; SI-LABEL: fcmp_f16_lt_abs:
149 ; SI: ; %bb.0: ; %entry
150 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
151 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
152 ; SI-NEXT: s_mov_b32 s11, 0xf000
153 ; SI-NEXT: s_mov_b32 s10, -1
154 ; SI-NEXT: s_mov_b32 s14, s10
155 ; SI-NEXT: s_mov_b32 s15, s11
156 ; SI-NEXT: s_mov_b32 s6, s10
157 ; SI-NEXT: s_mov_b32 s7, s11
158 ; SI-NEXT: s_waitcnt lgkmcnt(0)
159 ; SI-NEXT: s_mov_b32 s12, s2
160 ; SI-NEXT: s_mov_b32 s13, s3
161 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
162 ; SI-NEXT: s_waitcnt vmcnt(0)
163 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
164 ; SI-NEXT: s_waitcnt vmcnt(0)
165 ; SI-NEXT: s_mov_b32 s8, s0
166 ; SI-NEXT: s_mov_b32 s9, s1
167 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
168 ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
169 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
170 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
171 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
174 ; VI-LABEL: fcmp_f16_lt_abs:
175 ; VI: ; %bb.0: ; %entry
176 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
177 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
178 ; VI-NEXT: s_mov_b32 s7, 0xf000
179 ; VI-NEXT: s_mov_b32 s6, -1
180 ; VI-NEXT: s_mov_b32 s14, s6
181 ; VI-NEXT: s_waitcnt lgkmcnt(0)
182 ; VI-NEXT: s_mov_b32 s12, s2
183 ; VI-NEXT: s_mov_b32 s13, s3
184 ; VI-NEXT: s_mov_b32 s15, s7
185 ; VI-NEXT: s_mov_b32 s10, s6
186 ; VI-NEXT: s_mov_b32 s11, s7
187 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
188 ; VI-NEXT: s_waitcnt vmcnt(0)
189 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
190 ; VI-NEXT: s_waitcnt vmcnt(0)
191 ; VI-NEXT: s_mov_b32 s4, s0
192 ; VI-NEXT: s_mov_b32 s5, s1
193 ; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], |v0|, |v1|
194 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
195 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
198 ; GFX11-TRUE16-LABEL: fcmp_f16_lt_abs:
199 ; GFX11-TRUE16: ; %bb.0: ; %entry
200 ; GFX11-TRUE16-NEXT: s_clause 0x1
201 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
202 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
203 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
204 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
205 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
206 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
207 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
208 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
209 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
210 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
211 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
212 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
213 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
214 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
215 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
216 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
217 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
218 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
219 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
220 ; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s2, |v0.l|, |v0.h|
221 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2
222 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
223 ; GFX11-TRUE16-NEXT: s_endpgm
225 ; GFX11-FAKE16-LABEL: fcmp_f16_lt_abs:
226 ; GFX11-FAKE16: ; %bb.0: ; %entry
227 ; GFX11-FAKE16-NEXT: s_clause 0x1
228 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
229 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
230 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
231 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
232 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
233 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
234 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
235 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
236 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
237 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
238 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
239 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
240 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
241 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
242 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
243 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
244 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
245 ; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e64 s2, |v0|, |v1|
246 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
247 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2
248 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
249 ; GFX11-FAKE16-NEXT: s_endpgm
251 ; GFX12-LABEL: fcmp_f16_lt_abs:
252 ; GFX12: ; %bb.0: ; %entry
253 ; GFX12-NEXT: s_clause 0x1
254 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
255 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
256 ; GFX12-NEXT: s_mov_b32 s10, -1
257 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
258 ; GFX12-NEXT: s_mov_b32 s14, s10
259 ; GFX12-NEXT: s_mov_b32 s15, s11
260 ; GFX12-NEXT: s_mov_b32 s6, s10
261 ; GFX12-NEXT: s_mov_b32 s7, s11
262 ; GFX12-NEXT: s_wait_kmcnt 0x0
263 ; GFX12-NEXT: s_mov_b32 s12, s2
264 ; GFX12-NEXT: s_mov_b32 s13, s3
265 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
266 ; GFX12-NEXT: s_wait_loadcnt 0x0
267 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
268 ; GFX12-NEXT: s_wait_loadcnt 0x0
269 ; GFX12-NEXT: s_mov_b32 s8, s0
270 ; GFX12-NEXT: s_mov_b32 s9, s1
271 ; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff, v0
272 ; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff, v1
273 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
274 ; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
275 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
276 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
277 ; GFX12-NEXT: s_endpgm
280 ptr addrspace(1) %b) {
; Both operand loads are volatile; in the checks above each one appears as its
; own 16-bit buffer load followed by a wait on the load counter.
282 %a.val = load volatile half, ptr addrspace(1) %a
283 %b.val = load volatile half, ptr addrspace(1) %b
; The compare consumes the absolute values, not the raw loads.
284 %a.abs = call half @llvm.fabs.f16(half %a.val)
285 %b.abs = call half @llvm.fabs.f16(half %b.val)
286 %r.val = fcmp olt half %a.abs, %b.abs
; sext turns a true i1 into -1 (all ones); the checks select between 0 and -1
; with v_cndmask_b32 accordingly.
287 %r.val.sext = sext i1 %r.val to i32
288 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_eq - kernel that volatile-loads one half from %a and one from %b,
; evaluates the ordered equality compare (fcmp oeq), sign-extends the i1 result
; to i32 and stores it through %r.
; The check lines in this function are autogenerated (see the note on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the checks show: the SI run widens both operands with v_cvt_f32_f16 and
; uses v_cmp_eq_f32; every other run uses v_cmp_eq_f16, and the true16 run first
; moves the second operand into v0.h so that the compare reads v0.l and v0.h.
; NOTE(review): the declarations of %r and %a are not visible in this excerpt -
; presumably they are the leading kernel arguments; confirm against the full file.
292 define amdgpu_kernel void @fcmp_f16_eq(
293 ; SI-LABEL: fcmp_f16_eq:
294 ; SI: ; %bb.0: ; %entry
295 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
296 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
297 ; SI-NEXT: s_mov_b32 s11, 0xf000
298 ; SI-NEXT: s_mov_b32 s10, -1
299 ; SI-NEXT: s_mov_b32 s14, s10
300 ; SI-NEXT: s_mov_b32 s15, s11
301 ; SI-NEXT: s_mov_b32 s6, s10
302 ; SI-NEXT: s_mov_b32 s7, s11
303 ; SI-NEXT: s_waitcnt lgkmcnt(0)
304 ; SI-NEXT: s_mov_b32 s12, s2
305 ; SI-NEXT: s_mov_b32 s13, s3
306 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
307 ; SI-NEXT: s_waitcnt vmcnt(0)
308 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
309 ; SI-NEXT: s_waitcnt vmcnt(0)
310 ; SI-NEXT: s_mov_b32 s8, s0
311 ; SI-NEXT: s_mov_b32 s9, s1
312 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
313 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
314 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
315 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
316 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
319 ; VI-LABEL: fcmp_f16_eq:
320 ; VI: ; %bb.0: ; %entry
321 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
322 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
323 ; VI-NEXT: s_mov_b32 s7, 0xf000
324 ; VI-NEXT: s_mov_b32 s6, -1
325 ; VI-NEXT: s_mov_b32 s14, s6
326 ; VI-NEXT: s_waitcnt lgkmcnt(0)
327 ; VI-NEXT: s_mov_b32 s12, s2
328 ; VI-NEXT: s_mov_b32 s13, s3
329 ; VI-NEXT: s_mov_b32 s15, s7
330 ; VI-NEXT: s_mov_b32 s10, s6
331 ; VI-NEXT: s_mov_b32 s11, s7
332 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
333 ; VI-NEXT: s_waitcnt vmcnt(0)
334 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
335 ; VI-NEXT: s_waitcnt vmcnt(0)
336 ; VI-NEXT: s_mov_b32 s4, s0
337 ; VI-NEXT: s_mov_b32 s5, s1
338 ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1
339 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
340 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
343 ; GFX11-TRUE16-LABEL: fcmp_f16_eq:
344 ; GFX11-TRUE16: ; %bb.0: ; %entry
345 ; GFX11-TRUE16-NEXT: s_clause 0x1
346 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
347 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
348 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
349 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
350 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
351 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
352 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
353 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
354 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
355 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
356 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
357 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
358 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
359 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
360 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
361 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
362 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
363 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
364 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
365 ; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0.l, v0.h
366 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
367 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
368 ; GFX11-TRUE16-NEXT: s_endpgm
370 ; GFX11-FAKE16-LABEL: fcmp_f16_eq:
371 ; GFX11-FAKE16: ; %bb.0: ; %entry
372 ; GFX11-FAKE16-NEXT: s_clause 0x1
373 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
374 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
375 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
376 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
377 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
378 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
379 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
380 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
381 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
382 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
383 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
384 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
385 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
386 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
387 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
388 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
389 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
390 ; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
391 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
392 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
393 ; GFX11-FAKE16-NEXT: s_endpgm
395 ; GFX12-LABEL: fcmp_f16_eq:
396 ; GFX12: ; %bb.0: ; %entry
397 ; GFX12-NEXT: s_clause 0x1
398 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
399 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
400 ; GFX12-NEXT: s_mov_b32 s10, -1
401 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
402 ; GFX12-NEXT: s_mov_b32 s14, s10
403 ; GFX12-NEXT: s_mov_b32 s15, s11
404 ; GFX12-NEXT: s_mov_b32 s6, s10
405 ; GFX12-NEXT: s_mov_b32 s7, s11
406 ; GFX12-NEXT: s_wait_kmcnt 0x0
407 ; GFX12-NEXT: s_mov_b32 s12, s2
408 ; GFX12-NEXT: s_mov_b32 s13, s3
409 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
410 ; GFX12-NEXT: s_wait_loadcnt 0x0
411 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
412 ; GFX12-NEXT: s_wait_loadcnt 0x0
413 ; GFX12-NEXT: s_mov_b32 s8, s0
414 ; GFX12-NEXT: s_mov_b32 s9, s1
415 ; GFX12-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
416 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
417 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
418 ; GFX12-NEXT: s_endpgm
421 ptr addrspace(1) %b) {
; Both operand loads are volatile; in the checks above each one appears as its
; own 16-bit buffer load followed by a wait on the load counter.
423 %a.val = load volatile half, ptr addrspace(1) %a
424 %b.val = load volatile half, ptr addrspace(1) %b
425 %r.val = fcmp oeq half %a.val, %b.val
; sext turns a true i1 into -1 (all ones); the checks select between 0 and -1
; with v_cndmask_b32 accordingly.
426 %r.val.sext = sext i1 %r.val to i32
427 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_le - kernel that volatile-loads one half from %a and one from %b,
; evaluates the ordered less-or-equal compare (fcmp ole), sign-extends the i1
; result to i32 and stores it through %r.
; The check lines in this function are autogenerated (see the note on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the checks show: the SI run widens both operands with v_cvt_f32_f16 and
; uses v_cmp_le_f32; every other run uses v_cmp_le_f16, and the true16 run first
; moves the second operand into v0.h so that the compare reads v0.l and v0.h.
; NOTE(review): the declarations of %r and %a are not visible in this excerpt -
; presumably they are the leading kernel arguments; confirm against the full file.
431 define amdgpu_kernel void @fcmp_f16_le(
432 ; SI-LABEL: fcmp_f16_le:
433 ; SI: ; %bb.0: ; %entry
434 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
435 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
436 ; SI-NEXT: s_mov_b32 s11, 0xf000
437 ; SI-NEXT: s_mov_b32 s10, -1
438 ; SI-NEXT: s_mov_b32 s14, s10
439 ; SI-NEXT: s_mov_b32 s15, s11
440 ; SI-NEXT: s_mov_b32 s6, s10
441 ; SI-NEXT: s_mov_b32 s7, s11
442 ; SI-NEXT: s_waitcnt lgkmcnt(0)
443 ; SI-NEXT: s_mov_b32 s12, s2
444 ; SI-NEXT: s_mov_b32 s13, s3
445 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
446 ; SI-NEXT: s_waitcnt vmcnt(0)
447 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
448 ; SI-NEXT: s_waitcnt vmcnt(0)
449 ; SI-NEXT: s_mov_b32 s8, s0
450 ; SI-NEXT: s_mov_b32 s9, s1
451 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
452 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
453 ; SI-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
454 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
455 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
458 ; VI-LABEL: fcmp_f16_le:
459 ; VI: ; %bb.0: ; %entry
460 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
461 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
462 ; VI-NEXT: s_mov_b32 s7, 0xf000
463 ; VI-NEXT: s_mov_b32 s6, -1
464 ; VI-NEXT: s_mov_b32 s14, s6
465 ; VI-NEXT: s_waitcnt lgkmcnt(0)
466 ; VI-NEXT: s_mov_b32 s12, s2
467 ; VI-NEXT: s_mov_b32 s13, s3
468 ; VI-NEXT: s_mov_b32 s15, s7
469 ; VI-NEXT: s_mov_b32 s10, s6
470 ; VI-NEXT: s_mov_b32 s11, s7
471 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
472 ; VI-NEXT: s_waitcnt vmcnt(0)
473 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
474 ; VI-NEXT: s_waitcnt vmcnt(0)
475 ; VI-NEXT: s_mov_b32 s4, s0
476 ; VI-NEXT: s_mov_b32 s5, s1
477 ; VI-NEXT: v_cmp_le_f16_e32 vcc, v0, v1
478 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
479 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
482 ; GFX11-TRUE16-LABEL: fcmp_f16_le:
483 ; GFX11-TRUE16: ; %bb.0: ; %entry
484 ; GFX11-TRUE16-NEXT: s_clause 0x1
485 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
486 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
487 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
488 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
489 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
490 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
491 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
492 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
493 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
494 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
495 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
496 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
497 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
498 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
499 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
500 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
501 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
502 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
503 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
504 ; GFX11-TRUE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v0.l, v0.h
505 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
506 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
507 ; GFX11-TRUE16-NEXT: s_endpgm
509 ; GFX11-FAKE16-LABEL: fcmp_f16_le:
510 ; GFX11-FAKE16: ; %bb.0: ; %entry
511 ; GFX11-FAKE16-NEXT: s_clause 0x1
512 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
513 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
514 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
515 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
516 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
517 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
518 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
519 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
520 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
521 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
522 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
523 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
524 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
525 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
526 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
527 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
528 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
529 ; GFX11-FAKE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
530 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
531 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
532 ; GFX11-FAKE16-NEXT: s_endpgm
534 ; GFX12-LABEL: fcmp_f16_le:
535 ; GFX12: ; %bb.0: ; %entry
536 ; GFX12-NEXT: s_clause 0x1
537 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
538 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
539 ; GFX12-NEXT: s_mov_b32 s10, -1
540 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
541 ; GFX12-NEXT: s_mov_b32 s14, s10
542 ; GFX12-NEXT: s_mov_b32 s15, s11
543 ; GFX12-NEXT: s_mov_b32 s6, s10
544 ; GFX12-NEXT: s_mov_b32 s7, s11
545 ; GFX12-NEXT: s_wait_kmcnt 0x0
546 ; GFX12-NEXT: s_mov_b32 s12, s2
547 ; GFX12-NEXT: s_mov_b32 s13, s3
548 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
549 ; GFX12-NEXT: s_wait_loadcnt 0x0
550 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
551 ; GFX12-NEXT: s_wait_loadcnt 0x0
552 ; GFX12-NEXT: s_mov_b32 s8, s0
553 ; GFX12-NEXT: s_mov_b32 s9, s1
554 ; GFX12-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
555 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
556 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
557 ; GFX12-NEXT: s_endpgm
560 ptr addrspace(1) %b) {
; Both operand loads are volatile; in the checks above each one appears as its
; own 16-bit buffer load followed by a wait on the load counter.
562 %a.val = load volatile half, ptr addrspace(1) %a
563 %b.val = load volatile half, ptr addrspace(1) %b
564 %r.val = fcmp ole half %a.val, %b.val
; sext turns a true i1 into -1 (all ones); the checks select between 0 and -1
; with v_cndmask_b32 accordingly.
565 %r.val.sext = sext i1 %r.val to i32
566 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_gt - kernel that volatile-loads one half from %a and one from %b,
; evaluates the ordered greater-than compare (fcmp ogt), sign-extends the i1
; result to i32 and stores it through %r.
; The check lines in this function are autogenerated (see the note on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the checks show: the SI run widens both operands with v_cvt_f32_f16 and
; uses v_cmp_gt_f32; every other run uses v_cmp_gt_f16, and the true16 run first
; moves the second operand into v0.h so that the compare reads v0.l and v0.h.
; NOTE(review): the declarations of %r and %a are not visible in this excerpt -
; presumably they are the leading kernel arguments; confirm against the full file.
570 define amdgpu_kernel void @fcmp_f16_gt(
571 ; SI-LABEL: fcmp_f16_gt:
572 ; SI: ; %bb.0: ; %entry
573 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
574 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
575 ; SI-NEXT: s_mov_b32 s11, 0xf000
576 ; SI-NEXT: s_mov_b32 s10, -1
577 ; SI-NEXT: s_mov_b32 s14, s10
578 ; SI-NEXT: s_mov_b32 s15, s11
579 ; SI-NEXT: s_mov_b32 s6, s10
580 ; SI-NEXT: s_mov_b32 s7, s11
581 ; SI-NEXT: s_waitcnt lgkmcnt(0)
582 ; SI-NEXT: s_mov_b32 s12, s2
583 ; SI-NEXT: s_mov_b32 s13, s3
584 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
585 ; SI-NEXT: s_waitcnt vmcnt(0)
586 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
587 ; SI-NEXT: s_waitcnt vmcnt(0)
588 ; SI-NEXT: s_mov_b32 s8, s0
589 ; SI-NEXT: s_mov_b32 s9, s1
590 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
591 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
592 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
593 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
594 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
597 ; VI-LABEL: fcmp_f16_gt:
598 ; VI: ; %bb.0: ; %entry
599 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
600 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
601 ; VI-NEXT: s_mov_b32 s7, 0xf000
602 ; VI-NEXT: s_mov_b32 s6, -1
603 ; VI-NEXT: s_mov_b32 s14, s6
604 ; VI-NEXT: s_waitcnt lgkmcnt(0)
605 ; VI-NEXT: s_mov_b32 s12, s2
606 ; VI-NEXT: s_mov_b32 s13, s3
607 ; VI-NEXT: s_mov_b32 s15, s7
608 ; VI-NEXT: s_mov_b32 s10, s6
609 ; VI-NEXT: s_mov_b32 s11, s7
610 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
611 ; VI-NEXT: s_waitcnt vmcnt(0)
612 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
613 ; VI-NEXT: s_waitcnt vmcnt(0)
614 ; VI-NEXT: s_mov_b32 s4, s0
615 ; VI-NEXT: s_mov_b32 s5, s1
616 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
617 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
618 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
621 ; GFX11-TRUE16-LABEL: fcmp_f16_gt:
622 ; GFX11-TRUE16: ; %bb.0: ; %entry
623 ; GFX11-TRUE16-NEXT: s_clause 0x1
624 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
625 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
626 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
627 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
628 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
629 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
630 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
631 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
632 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
633 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
634 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
635 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
636 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
637 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
638 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
639 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
640 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
641 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
642 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
643 ; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0.l, v0.h
644 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
645 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
646 ; GFX11-TRUE16-NEXT: s_endpgm
648 ; GFX11-FAKE16-LABEL: fcmp_f16_gt:
649 ; GFX11-FAKE16: ; %bb.0: ; %entry
650 ; GFX11-FAKE16-NEXT: s_clause 0x1
651 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
652 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
653 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
654 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
655 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
656 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
657 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
658 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
659 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
660 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
661 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
662 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
663 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
664 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
665 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
666 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
667 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
668 ; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
669 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
670 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
671 ; GFX11-FAKE16-NEXT: s_endpgm
673 ; GFX12-LABEL: fcmp_f16_gt:
674 ; GFX12: ; %bb.0: ; %entry
675 ; GFX12-NEXT: s_clause 0x1
676 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
677 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
678 ; GFX12-NEXT: s_mov_b32 s10, -1
679 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
680 ; GFX12-NEXT: s_mov_b32 s14, s10
681 ; GFX12-NEXT: s_mov_b32 s15, s11
682 ; GFX12-NEXT: s_mov_b32 s6, s10
683 ; GFX12-NEXT: s_mov_b32 s7, s11
684 ; GFX12-NEXT: s_wait_kmcnt 0x0
685 ; GFX12-NEXT: s_mov_b32 s12, s2
686 ; GFX12-NEXT: s_mov_b32 s13, s3
687 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
688 ; GFX12-NEXT: s_wait_loadcnt 0x0
689 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
690 ; GFX12-NEXT: s_wait_loadcnt 0x0
691 ; GFX12-NEXT: s_mov_b32 s8, s0
692 ; GFX12-NEXT: s_mov_b32 s9, s1
693 ; GFX12-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
694 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
695 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
696 ; GFX12-NEXT: s_endpgm
699 ptr addrspace(1) %b) {
; Both operand loads are volatile; in the checks above each one appears as its
; own 16-bit buffer load followed by a wait on the load counter.
701 %a.val = load volatile half, ptr addrspace(1) %a
702 %b.val = load volatile half, ptr addrspace(1) %b
703 %r.val = fcmp ogt half %a.val, %b.val
; sext turns a true i1 into -1 (all ones); the checks select between 0 and -1
; with v_cndmask_b32 accordingly.
704 %r.val.sext = sext i1 %r.val to i32
705 store i32 %r.val.sext, ptr addrspace(1) %r
709 define amdgpu_kernel void @fcmp_f16_lg(
710 ; SI-LABEL: fcmp_f16_lg:
711 ; SI: ; %bb.0: ; %entry
712 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
713 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
714 ; SI-NEXT: s_mov_b32 s11, 0xf000
715 ; SI-NEXT: s_mov_b32 s10, -1
716 ; SI-NEXT: s_mov_b32 s14, s10
717 ; SI-NEXT: s_mov_b32 s15, s11
718 ; SI-NEXT: s_mov_b32 s6, s10
719 ; SI-NEXT: s_mov_b32 s7, s11
720 ; SI-NEXT: s_waitcnt lgkmcnt(0)
721 ; SI-NEXT: s_mov_b32 s12, s2
722 ; SI-NEXT: s_mov_b32 s13, s3
723 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
724 ; SI-NEXT: s_waitcnt vmcnt(0)
725 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
726 ; SI-NEXT: s_waitcnt vmcnt(0)
727 ; SI-NEXT: s_mov_b32 s8, s0
728 ; SI-NEXT: s_mov_b32 s9, s1
729 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
730 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
731 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
732 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
733 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
736 ; VI-LABEL: fcmp_f16_lg:
737 ; VI: ; %bb.0: ; %entry
738 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
739 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
740 ; VI-NEXT: s_mov_b32 s7, 0xf000
741 ; VI-NEXT: s_mov_b32 s6, -1
742 ; VI-NEXT: s_mov_b32 s14, s6
743 ; VI-NEXT: s_waitcnt lgkmcnt(0)
744 ; VI-NEXT: s_mov_b32 s12, s2
745 ; VI-NEXT: s_mov_b32 s13, s3
746 ; VI-NEXT: s_mov_b32 s15, s7
747 ; VI-NEXT: s_mov_b32 s10, s6
748 ; VI-NEXT: s_mov_b32 s11, s7
749 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
750 ; VI-NEXT: s_waitcnt vmcnt(0)
751 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
752 ; VI-NEXT: s_waitcnt vmcnt(0)
753 ; VI-NEXT: s_mov_b32 s4, s0
754 ; VI-NEXT: s_mov_b32 s5, s1
755 ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1
756 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
757 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
760 ; GFX11-TRUE16-LABEL: fcmp_f16_lg:
761 ; GFX11-TRUE16: ; %bb.0: ; %entry
762 ; GFX11-TRUE16-NEXT: s_clause 0x1
763 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
764 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
765 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
766 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
767 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
768 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
769 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
770 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
771 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
772 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
773 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
774 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
775 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
776 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
777 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
778 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
779 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
780 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
781 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
782 ; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0.l, v0.h
783 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
784 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
785 ; GFX11-TRUE16-NEXT: s_endpgm
787 ; GFX11-FAKE16-LABEL: fcmp_f16_lg:
788 ; GFX11-FAKE16: ; %bb.0: ; %entry
789 ; GFX11-FAKE16-NEXT: s_clause 0x1
790 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
791 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
792 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
793 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
794 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
795 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
796 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
797 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
798 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
799 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
800 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
801 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
802 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
803 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
804 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
805 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
806 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
807 ; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
808 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
809 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
810 ; GFX11-FAKE16-NEXT: s_endpgm
812 ; GFX12-LABEL: fcmp_f16_lg:
813 ; GFX12: ; %bb.0: ; %entry
814 ; GFX12-NEXT: s_clause 0x1
815 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
816 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
817 ; GFX12-NEXT: s_mov_b32 s10, -1
818 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
819 ; GFX12-NEXT: s_mov_b32 s14, s10
820 ; GFX12-NEXT: s_mov_b32 s15, s11
821 ; GFX12-NEXT: s_mov_b32 s6, s10
822 ; GFX12-NEXT: s_mov_b32 s7, s11
823 ; GFX12-NEXT: s_wait_kmcnt 0x0
824 ; GFX12-NEXT: s_mov_b32 s12, s2
825 ; GFX12-NEXT: s_mov_b32 s13, s3
826 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
827 ; GFX12-NEXT: s_wait_loadcnt 0x0
828 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
829 ; GFX12-NEXT: s_wait_loadcnt 0x0
830 ; GFX12-NEXT: s_mov_b32 s8, s0
831 ; GFX12-NEXT: s_mov_b32 s9, s1
832 ; GFX12-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
833 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
834 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
835 ; GFX12-NEXT: s_endpgm
838 ptr addrspace(1) %b) {
840 %a.val = load volatile half, ptr addrspace(1) %a
841 %b.val = load volatile half, ptr addrspace(1) %b
842 %r.val = fcmp one half %a.val, %b.val
843 %r.val.sext = sext i1 %r.val to i32
844 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_ge: `fcmp oge half` on two volatile addrspace(1) loads (%a, %b); the
; i1 result is sign-extended to i32 and stored through %r.
; Expected selection: the SI checks convert both operands with v_cvt_f32_f16 and
; compare with v_cmp_ge_f32, while the VI, GFX11 and GFX12 checks compare with
; v_cmp_ge_f16 (the GFX11 true16 checks first copy the second operand into v0.h).
; In every variant v_cndmask_b32 selects between 0 and -1 before the dword store.
848 define amdgpu_kernel void @fcmp_f16_ge(
849 ; SI-LABEL: fcmp_f16_ge:
850 ; SI: ; %bb.0: ; %entry
851 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
852 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
853 ; SI-NEXT: s_mov_b32 s11, 0xf000
854 ; SI-NEXT: s_mov_b32 s10, -1
855 ; SI-NEXT: s_mov_b32 s14, s10
856 ; SI-NEXT: s_mov_b32 s15, s11
857 ; SI-NEXT: s_mov_b32 s6, s10
858 ; SI-NEXT: s_mov_b32 s7, s11
859 ; SI-NEXT: s_waitcnt lgkmcnt(0)
860 ; SI-NEXT: s_mov_b32 s12, s2
861 ; SI-NEXT: s_mov_b32 s13, s3
862 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
863 ; SI-NEXT: s_waitcnt vmcnt(0)
864 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
865 ; SI-NEXT: s_waitcnt vmcnt(0)
866 ; SI-NEXT: s_mov_b32 s8, s0
867 ; SI-NEXT: s_mov_b32 s9, s1
868 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
869 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
870 ; SI-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
871 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
872 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
875 ; VI-LABEL: fcmp_f16_ge:
876 ; VI: ; %bb.0: ; %entry
877 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
878 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
879 ; VI-NEXT: s_mov_b32 s7, 0xf000
880 ; VI-NEXT: s_mov_b32 s6, -1
881 ; VI-NEXT: s_mov_b32 s14, s6
882 ; VI-NEXT: s_waitcnt lgkmcnt(0)
883 ; VI-NEXT: s_mov_b32 s12, s2
884 ; VI-NEXT: s_mov_b32 s13, s3
885 ; VI-NEXT: s_mov_b32 s15, s7
886 ; VI-NEXT: s_mov_b32 s10, s6
887 ; VI-NEXT: s_mov_b32 s11, s7
888 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
889 ; VI-NEXT: s_waitcnt vmcnt(0)
890 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
891 ; VI-NEXT: s_waitcnt vmcnt(0)
892 ; VI-NEXT: s_mov_b32 s4, s0
893 ; VI-NEXT: s_mov_b32 s5, s1
894 ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1
895 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
896 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
899 ; GFX11-TRUE16-LABEL: fcmp_f16_ge:
900 ; GFX11-TRUE16: ; %bb.0: ; %entry
901 ; GFX11-TRUE16-NEXT: s_clause 0x1
902 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
903 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
904 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
905 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
906 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
907 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
908 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
909 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
910 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
911 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
912 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
913 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
914 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
915 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
916 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
917 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
918 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
919 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
920 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
921 ; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0.l, v0.h
922 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
923 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
924 ; GFX11-TRUE16-NEXT: s_endpgm
926 ; GFX11-FAKE16-LABEL: fcmp_f16_ge:
927 ; GFX11-FAKE16: ; %bb.0: ; %entry
928 ; GFX11-FAKE16-NEXT: s_clause 0x1
929 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
930 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
931 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
932 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
933 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
934 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
935 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
936 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
937 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
938 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
939 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
940 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
941 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
942 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
943 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
944 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
945 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
946 ; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
947 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
948 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
949 ; GFX11-FAKE16-NEXT: s_endpgm
951 ; GFX12-LABEL: fcmp_f16_ge:
952 ; GFX12: ; %bb.0: ; %entry
953 ; GFX12-NEXT: s_clause 0x1
954 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
955 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
956 ; GFX12-NEXT: s_mov_b32 s10, -1
957 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
958 ; GFX12-NEXT: s_mov_b32 s14, s10
959 ; GFX12-NEXT: s_mov_b32 s15, s11
960 ; GFX12-NEXT: s_mov_b32 s6, s10
961 ; GFX12-NEXT: s_mov_b32 s7, s11
962 ; GFX12-NEXT: s_wait_kmcnt 0x0
963 ; GFX12-NEXT: s_mov_b32 s12, s2
964 ; GFX12-NEXT: s_mov_b32 s13, s3
965 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
966 ; GFX12-NEXT: s_wait_loadcnt 0x0
967 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
968 ; GFX12-NEXT: s_wait_loadcnt 0x0
969 ; GFX12-NEXT: s_mov_b32 s8, s0
970 ; GFX12-NEXT: s_mov_b32 s9, s1
971 ; GFX12-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
972 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
973 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
974 ; GFX12-NEXT: s_endpgm
977 ptr addrspace(1) %b) {
979 %a.val = load volatile half, ptr addrspace(1) %a
980 %b.val = load volatile half, ptr addrspace(1) %b
981 %r.val = fcmp oge half %a.val, %b.val
982 %r.val.sext = sext i1 %r.val to i32
983 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_o: `fcmp ord half` on two volatile addrspace(1) loads (%a, %b); the
; i1 result is sign-extended to i32 and stored through %r.
; Expected selection: the SI checks convert both operands with v_cvt_f32_f16 and
; compare with v_cmp_o_f32, while the VI, GFX11 and GFX12 checks compare with
; v_cmp_o_f16 (the GFX11 true16 checks first copy the second operand into v0.h).
; In every variant v_cndmask_b32 selects between 0 and -1 before the dword store.
987 define amdgpu_kernel void @fcmp_f16_o(
988 ; SI-LABEL: fcmp_f16_o:
989 ; SI: ; %bb.0: ; %entry
990 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
991 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
992 ; SI-NEXT: s_mov_b32 s11, 0xf000
993 ; SI-NEXT: s_mov_b32 s10, -1
994 ; SI-NEXT: s_mov_b32 s14, s10
995 ; SI-NEXT: s_mov_b32 s15, s11
996 ; SI-NEXT: s_mov_b32 s6, s10
997 ; SI-NEXT: s_mov_b32 s7, s11
998 ; SI-NEXT: s_waitcnt lgkmcnt(0)
999 ; SI-NEXT: s_mov_b32 s12, s2
1000 ; SI-NEXT: s_mov_b32 s13, s3
1001 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1002 ; SI-NEXT: s_waitcnt vmcnt(0)
1003 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
1004 ; SI-NEXT: s_waitcnt vmcnt(0)
1005 ; SI-NEXT: s_mov_b32 s8, s0
1006 ; SI-NEXT: s_mov_b32 s9, s1
1007 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1008 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1009 ; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
1010 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1011 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1014 ; VI-LABEL: fcmp_f16_o:
1015 ; VI: ; %bb.0: ; %entry
1016 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1017 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
1018 ; VI-NEXT: s_mov_b32 s7, 0xf000
1019 ; VI-NEXT: s_mov_b32 s6, -1
1020 ; VI-NEXT: s_mov_b32 s14, s6
1021 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1022 ; VI-NEXT: s_mov_b32 s12, s2
1023 ; VI-NEXT: s_mov_b32 s13, s3
1024 ; VI-NEXT: s_mov_b32 s15, s7
1025 ; VI-NEXT: s_mov_b32 s10, s6
1026 ; VI-NEXT: s_mov_b32 s11, s7
1027 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1028 ; VI-NEXT: s_waitcnt vmcnt(0)
1029 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
1030 ; VI-NEXT: s_waitcnt vmcnt(0)
1031 ; VI-NEXT: s_mov_b32 s4, s0
1032 ; VI-NEXT: s_mov_b32 s5, s1
1033 ; VI-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
1034 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1035 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1038 ; GFX11-TRUE16-LABEL: fcmp_f16_o:
1039 ; GFX11-TRUE16: ; %bb.0: ; %entry
1040 ; GFX11-TRUE16-NEXT: s_clause 0x1
1041 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1042 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1043 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
1044 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
1045 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
1046 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
1047 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
1048 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
1049 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1050 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
1051 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
1052 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1053 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1054 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1055 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1056 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
1057 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
1058 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
1059 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1060 ; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
1061 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1062 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1063 ; GFX11-TRUE16-NEXT: s_endpgm
1065 ; GFX11-FAKE16-LABEL: fcmp_f16_o:
1066 ; GFX11-FAKE16: ; %bb.0: ; %entry
1067 ; GFX11-FAKE16-NEXT: s_clause 0x1
1068 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1069 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1070 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
1071 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
1072 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
1073 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
1074 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
1075 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
1076 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1077 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
1078 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
1079 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1080 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1081 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1082 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1083 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
1084 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
1085 ; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
1086 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1087 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1088 ; GFX11-FAKE16-NEXT: s_endpgm
1090 ; GFX12-LABEL: fcmp_f16_o:
1091 ; GFX12: ; %bb.0: ; %entry
1092 ; GFX12-NEXT: s_clause 0x1
1093 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1094 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1095 ; GFX12-NEXT: s_mov_b32 s10, -1
1096 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
1097 ; GFX12-NEXT: s_mov_b32 s14, s10
1098 ; GFX12-NEXT: s_mov_b32 s15, s11
1099 ; GFX12-NEXT: s_mov_b32 s6, s10
1100 ; GFX12-NEXT: s_mov_b32 s7, s11
1101 ; GFX12-NEXT: s_wait_kmcnt 0x0
1102 ; GFX12-NEXT: s_mov_b32 s12, s2
1103 ; GFX12-NEXT: s_mov_b32 s13, s3
1104 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1105 ; GFX12-NEXT: s_wait_loadcnt 0x0
1106 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1107 ; GFX12-NEXT: s_wait_loadcnt 0x0
1108 ; GFX12-NEXT: s_mov_b32 s8, s0
1109 ; GFX12-NEXT: s_mov_b32 s9, s1
1110 ; GFX12-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
1111 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1112 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
1113 ; GFX12-NEXT: s_endpgm
1114 ptr addrspace(1) %r,
1115 ptr addrspace(1) %a,
1116 ptr addrspace(1) %b) {
1118 %a.val = load volatile half, ptr addrspace(1) %a
1119 %b.val = load volatile half, ptr addrspace(1) %b
1120 %r.val = fcmp ord half %a.val, %b.val
1121 %r.val.sext = sext i1 %r.val to i32
1122 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_u: `fcmp uno half` on two volatile addrspace(1) loads (%a, %b); the
; i1 result is sign-extended to i32 and stored through %r.
; Expected selection: the SI checks convert both operands with v_cvt_f32_f16 and
; compare with v_cmp_u_f32, while the VI, GFX11 and GFX12 checks compare with
; v_cmp_u_f16 (the GFX11 true16 checks first copy the second operand into v0.h).
; In every variant v_cndmask_b32 selects between 0 and -1 before the dword store.
1126 define amdgpu_kernel void @fcmp_f16_u(
1127 ; SI-LABEL: fcmp_f16_u:
1128 ; SI: ; %bb.0: ; %entry
1129 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1130 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1131 ; SI-NEXT: s_mov_b32 s11, 0xf000
1132 ; SI-NEXT: s_mov_b32 s10, -1
1133 ; SI-NEXT: s_mov_b32 s14, s10
1134 ; SI-NEXT: s_mov_b32 s15, s11
1135 ; SI-NEXT: s_mov_b32 s6, s10
1136 ; SI-NEXT: s_mov_b32 s7, s11
1137 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1138 ; SI-NEXT: s_mov_b32 s12, s2
1139 ; SI-NEXT: s_mov_b32 s13, s3
1140 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1141 ; SI-NEXT: s_waitcnt vmcnt(0)
1142 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
1143 ; SI-NEXT: s_waitcnt vmcnt(0)
1144 ; SI-NEXT: s_mov_b32 s8, s0
1145 ; SI-NEXT: s_mov_b32 s9, s1
1146 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1147 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1148 ; SI-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
1149 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1150 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1153 ; VI-LABEL: fcmp_f16_u:
1154 ; VI: ; %bb.0: ; %entry
1155 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1156 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
1157 ; VI-NEXT: s_mov_b32 s7, 0xf000
1158 ; VI-NEXT: s_mov_b32 s6, -1
1159 ; VI-NEXT: s_mov_b32 s14, s6
1160 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1161 ; VI-NEXT: s_mov_b32 s12, s2
1162 ; VI-NEXT: s_mov_b32 s13, s3
1163 ; VI-NEXT: s_mov_b32 s15, s7
1164 ; VI-NEXT: s_mov_b32 s10, s6
1165 ; VI-NEXT: s_mov_b32 s11, s7
1166 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1167 ; VI-NEXT: s_waitcnt vmcnt(0)
1168 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
1169 ; VI-NEXT: s_waitcnt vmcnt(0)
1170 ; VI-NEXT: s_mov_b32 s4, s0
1171 ; VI-NEXT: s_mov_b32 s5, s1
1172 ; VI-NEXT: v_cmp_u_f16_e32 vcc, v0, v1
1173 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1174 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1177 ; GFX11-TRUE16-LABEL: fcmp_f16_u:
1178 ; GFX11-TRUE16: ; %bb.0: ; %entry
1179 ; GFX11-TRUE16-NEXT: s_clause 0x1
1180 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1181 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1182 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
1183 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
1184 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
1185 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
1186 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
1187 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
1188 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1189 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
1190 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
1191 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1192 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1193 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1194 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1195 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
1196 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
1197 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
1198 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1199 ; GFX11-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0.l, v0.h
1200 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1201 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1202 ; GFX11-TRUE16-NEXT: s_endpgm
1204 ; GFX11-FAKE16-LABEL: fcmp_f16_u:
1205 ; GFX11-FAKE16: ; %bb.0: ; %entry
1206 ; GFX11-FAKE16-NEXT: s_clause 0x1
1207 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1208 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1209 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
1210 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
1211 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
1212 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
1213 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
1214 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
1215 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1216 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
1217 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
1218 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1219 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1220 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1221 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1222 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
1223 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
1224 ; GFX11-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
1225 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1226 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1227 ; GFX11-FAKE16-NEXT: s_endpgm
1229 ; GFX12-LABEL: fcmp_f16_u:
1230 ; GFX12: ; %bb.0: ; %entry
1231 ; GFX12-NEXT: s_clause 0x1
1232 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1233 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1234 ; GFX12-NEXT: s_mov_b32 s10, -1
1235 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
1236 ; GFX12-NEXT: s_mov_b32 s14, s10
1237 ; GFX12-NEXT: s_mov_b32 s15, s11
1238 ; GFX12-NEXT: s_mov_b32 s6, s10
1239 ; GFX12-NEXT: s_mov_b32 s7, s11
1240 ; GFX12-NEXT: s_wait_kmcnt 0x0
1241 ; GFX12-NEXT: s_mov_b32 s12, s2
1242 ; GFX12-NEXT: s_mov_b32 s13, s3
1243 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1244 ; GFX12-NEXT: s_wait_loadcnt 0x0
1245 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1246 ; GFX12-NEXT: s_wait_loadcnt 0x0
1247 ; GFX12-NEXT: s_mov_b32 s8, s0
1248 ; GFX12-NEXT: s_mov_b32 s9, s1
1249 ; GFX12-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
1250 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1251 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
1252 ; GFX12-NEXT: s_endpgm
1253 ptr addrspace(1) %r,
1254 ptr addrspace(1) %a,
1255 ptr addrspace(1) %b) {
1257 %a.val = load volatile half, ptr addrspace(1) %a
1258 %b.val = load volatile half, ptr addrspace(1) %b
1259 %r.val = fcmp uno half %a.val, %b.val
1260 %r.val.sext = sext i1 %r.val to i32
1261 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_nge: `fcmp ult half` on two volatile addrspace(1) loads (%a, %b); the
; i1 result is sign-extended to i32 and stored through %r.
; Expected selection: the SI checks convert both operands with v_cvt_f32_f16 and
; compare with v_cmp_nge_f32, while the VI, GFX11 and GFX12 checks compare with
; v_cmp_nge_f16 (the GFX11 true16 checks first copy the second operand into v0.h).
; In every variant v_cndmask_b32 selects between 0 and -1 before the dword store.
1265 define amdgpu_kernel void @fcmp_f16_nge(
1266 ; SI-LABEL: fcmp_f16_nge:
1267 ; SI: ; %bb.0: ; %entry
1268 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1269 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1270 ; SI-NEXT: s_mov_b32 s11, 0xf000
1271 ; SI-NEXT: s_mov_b32 s10, -1
1272 ; SI-NEXT: s_mov_b32 s14, s10
1273 ; SI-NEXT: s_mov_b32 s15, s11
1274 ; SI-NEXT: s_mov_b32 s6, s10
1275 ; SI-NEXT: s_mov_b32 s7, s11
1276 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1277 ; SI-NEXT: s_mov_b32 s12, s2
1278 ; SI-NEXT: s_mov_b32 s13, s3
1279 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1280 ; SI-NEXT: s_waitcnt vmcnt(0)
1281 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
1282 ; SI-NEXT: s_waitcnt vmcnt(0)
1283 ; SI-NEXT: s_mov_b32 s8, s0
1284 ; SI-NEXT: s_mov_b32 s9, s1
1285 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1286 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1287 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
1288 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1289 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1292 ; VI-LABEL: fcmp_f16_nge:
1293 ; VI: ; %bb.0: ; %entry
1294 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1295 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
1296 ; VI-NEXT: s_mov_b32 s7, 0xf000
1297 ; VI-NEXT: s_mov_b32 s6, -1
1298 ; VI-NEXT: s_mov_b32 s14, s6
1299 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1300 ; VI-NEXT: s_mov_b32 s12, s2
1301 ; VI-NEXT: s_mov_b32 s13, s3
1302 ; VI-NEXT: s_mov_b32 s15, s7
1303 ; VI-NEXT: s_mov_b32 s10, s6
1304 ; VI-NEXT: s_mov_b32 s11, s7
1305 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1306 ; VI-NEXT: s_waitcnt vmcnt(0)
1307 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
1308 ; VI-NEXT: s_waitcnt vmcnt(0)
1309 ; VI-NEXT: s_mov_b32 s4, s0
1310 ; VI-NEXT: s_mov_b32 s5, s1
1311 ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1
1312 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1313 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1316 ; GFX11-TRUE16-LABEL: fcmp_f16_nge:
1317 ; GFX11-TRUE16: ; %bb.0: ; %entry
1318 ; GFX11-TRUE16-NEXT: s_clause 0x1
1319 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1320 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1321 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
1322 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
1323 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
1324 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
1325 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
1326 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
1327 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1328 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
1329 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
1330 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1331 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1332 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1333 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1334 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
1335 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
1336 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
1337 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1338 ; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0.l, v0.h
1339 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1340 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1341 ; GFX11-TRUE16-NEXT: s_endpgm
1343 ; GFX11-FAKE16-LABEL: fcmp_f16_nge:
1344 ; GFX11-FAKE16: ; %bb.0: ; %entry
1345 ; GFX11-FAKE16-NEXT: s_clause 0x1
1346 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1347 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1348 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
1349 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
1350 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
1351 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
1352 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
1353 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
1354 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1355 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
1356 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
1357 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1358 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1359 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1360 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1361 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
1362 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
1363 ; GFX11-FAKE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
1364 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1365 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1366 ; GFX11-FAKE16-NEXT: s_endpgm
1368 ; GFX12-LABEL: fcmp_f16_nge:
1369 ; GFX12: ; %bb.0: ; %entry
1370 ; GFX12-NEXT: s_clause 0x1
1371 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1372 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1373 ; GFX12-NEXT: s_mov_b32 s10, -1
1374 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
1375 ; GFX12-NEXT: s_mov_b32 s14, s10
1376 ; GFX12-NEXT: s_mov_b32 s15, s11
1377 ; GFX12-NEXT: s_mov_b32 s6, s10
1378 ; GFX12-NEXT: s_mov_b32 s7, s11
1379 ; GFX12-NEXT: s_wait_kmcnt 0x0
1380 ; GFX12-NEXT: s_mov_b32 s12, s2
1381 ; GFX12-NEXT: s_mov_b32 s13, s3
1382 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1383 ; GFX12-NEXT: s_wait_loadcnt 0x0
1384 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1385 ; GFX12-NEXT: s_wait_loadcnt 0x0
1386 ; GFX12-NEXT: s_mov_b32 s8, s0
1387 ; GFX12-NEXT: s_mov_b32 s9, s1
1388 ; GFX12-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
1389 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1390 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
1391 ; GFX12-NEXT: s_endpgm
1392 ptr addrspace(1) %r,
1393 ptr addrspace(1) %a,
1394 ptr addrspace(1) %b) {
1396 %a.val = load volatile half, ptr addrspace(1) %a
1397 %b.val = load volatile half, ptr addrspace(1) %b
1398 %r.val = fcmp ult half %a.val, %b.val
1399 %r.val.sext = sext i1 %r.val to i32
1400 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_nlg: `fcmp ueq half` on two volatile addrspace(1) loads (%a, %b); the
; i1 result is sign-extended to i32 and stored through %r.
; Expected selection: the SI checks convert both operands with v_cvt_f32_f16 and
; compare with v_cmp_nlg_f32, while the VI, GFX11 and GFX12 checks compare with
; v_cmp_nlg_f16 (the GFX11 true16 checks first copy the second operand into v0.h).
; In every variant v_cndmask_b32 selects between 0 and -1 before the dword store.
1404 define amdgpu_kernel void @fcmp_f16_nlg(
1405 ; SI-LABEL: fcmp_f16_nlg:
1406 ; SI: ; %bb.0: ; %entry
1407 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1408 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1409 ; SI-NEXT: s_mov_b32 s11, 0xf000
1410 ; SI-NEXT: s_mov_b32 s10, -1
1411 ; SI-NEXT: s_mov_b32 s14, s10
1412 ; SI-NEXT: s_mov_b32 s15, s11
1413 ; SI-NEXT: s_mov_b32 s6, s10
1414 ; SI-NEXT: s_mov_b32 s7, s11
1415 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1416 ; SI-NEXT: s_mov_b32 s12, s2
1417 ; SI-NEXT: s_mov_b32 s13, s3
1418 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1419 ; SI-NEXT: s_waitcnt vmcnt(0)
1420 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
1421 ; SI-NEXT: s_waitcnt vmcnt(0)
1422 ; SI-NEXT: s_mov_b32 s8, s0
1423 ; SI-NEXT: s_mov_b32 s9, s1
1424 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1425 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1426 ; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
1427 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1428 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1431 ; VI-LABEL: fcmp_f16_nlg:
1432 ; VI: ; %bb.0: ; %entry
1433 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1434 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
1435 ; VI-NEXT: s_mov_b32 s7, 0xf000
1436 ; VI-NEXT: s_mov_b32 s6, -1
1437 ; VI-NEXT: s_mov_b32 s14, s6
1438 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1439 ; VI-NEXT: s_mov_b32 s12, s2
1440 ; VI-NEXT: s_mov_b32 s13, s3
1441 ; VI-NEXT: s_mov_b32 s15, s7
1442 ; VI-NEXT: s_mov_b32 s10, s6
1443 ; VI-NEXT: s_mov_b32 s11, s7
1444 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1445 ; VI-NEXT: s_waitcnt vmcnt(0)
1446 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
1447 ; VI-NEXT: s_waitcnt vmcnt(0)
1448 ; VI-NEXT: s_mov_b32 s4, s0
1449 ; VI-NEXT: s_mov_b32 s5, s1
1450 ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1
1451 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1452 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1455 ; GFX11-TRUE16-LABEL: fcmp_f16_nlg:
1456 ; GFX11-TRUE16: ; %bb.0: ; %entry
1457 ; GFX11-TRUE16-NEXT: s_clause 0x1
1458 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1459 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1460 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
1461 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
1462 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
1463 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
1464 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
1465 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
1466 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1467 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
1468 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
1469 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1470 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1471 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1472 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1473 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
1474 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
1475 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
1476 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1477 ; GFX11-TRUE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0.l, v0.h
1478 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1479 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1480 ; GFX11-TRUE16-NEXT: s_endpgm
1482 ; GFX11-FAKE16-LABEL: fcmp_f16_nlg:
1483 ; GFX11-FAKE16: ; %bb.0: ; %entry
1484 ; GFX11-FAKE16-NEXT: s_clause 0x1
1485 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1486 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1487 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
1488 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
1489 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
1490 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
1491 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
1492 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
1493 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1494 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
1495 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
1496 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1497 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1498 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1499 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1500 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
1501 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
1502 ; GFX11-FAKE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
1503 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1504 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1505 ; GFX11-FAKE16-NEXT: s_endpgm
1507 ; GFX12-LABEL: fcmp_f16_nlg:
1508 ; GFX12: ; %bb.0: ; %entry
1509 ; GFX12-NEXT: s_clause 0x1
1510 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1511 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1512 ; GFX12-NEXT: s_mov_b32 s10, -1
1513 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
1514 ; GFX12-NEXT: s_mov_b32 s14, s10
1515 ; GFX12-NEXT: s_mov_b32 s15, s11
1516 ; GFX12-NEXT: s_mov_b32 s6, s10
1517 ; GFX12-NEXT: s_mov_b32 s7, s11
1518 ; GFX12-NEXT: s_wait_kmcnt 0x0
1519 ; GFX12-NEXT: s_mov_b32 s12, s2
1520 ; GFX12-NEXT: s_mov_b32 s13, s3
1521 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1522 ; GFX12-NEXT: s_wait_loadcnt 0x0
1523 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1524 ; GFX12-NEXT: s_wait_loadcnt 0x0
1525 ; GFX12-NEXT: s_mov_b32 s8, s0
1526 ; GFX12-NEXT: s_mov_b32 s9, s1
1527 ; GFX12-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
1528 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1529 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
1530 ; GFX12-NEXT: s_endpgm
1531 ptr addrspace(1) %r,
1532 ptr addrspace(1) %a,
1533 ptr addrspace(1) %b) {
1535 %a.val = load volatile half, ptr addrspace(1) %a
1536 %b.val = load volatile half, ptr addrspace(1) %b
1537 %r.val = fcmp ueq half %a.val, %b.val
1538 %r.val.sext = sext i1 %r.val to i32
1539 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_ngt: loads two volatile half values from %a and %b, compares them
; with 'fcmp ule', sign-extends the i1 result to i32 and stores it to %r.
; The assertions below expect the compare to be selected as a v_cmp_ngt
; instruction: on f32 after two v_cvt_f32_f16 conversions in the SI run, and
; directly on f16 in the VI, GFX11 and GFX12 runs. The sign-extension is
; expected as a v_cndmask_b32_e64 that picks between 0 and -1.
1543 define amdgpu_kernel void @fcmp_f16_ngt(
1544 ; SI-LABEL: fcmp_f16_ngt:
1545 ; SI: ; %bb.0: ; %entry
1546 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1547 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1548 ; SI-NEXT: s_mov_b32 s11, 0xf000
1549 ; SI-NEXT: s_mov_b32 s10, -1
1550 ; SI-NEXT: s_mov_b32 s14, s10
1551 ; SI-NEXT: s_mov_b32 s15, s11
1552 ; SI-NEXT: s_mov_b32 s6, s10
1553 ; SI-NEXT: s_mov_b32 s7, s11
1554 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1555 ; SI-NEXT: s_mov_b32 s12, s2
1556 ; SI-NEXT: s_mov_b32 s13, s3
1557 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1558 ; SI-NEXT: s_waitcnt vmcnt(0)
1559 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
1560 ; SI-NEXT: s_waitcnt vmcnt(0)
1561 ; SI-NEXT: s_mov_b32 s8, s0
1562 ; SI-NEXT: s_mov_b32 s9, s1
1563 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1564 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1565 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
1566 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1567 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1570 ; VI-LABEL: fcmp_f16_ngt:
1571 ; VI: ; %bb.0: ; %entry
1572 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1573 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
1574 ; VI-NEXT: s_mov_b32 s7, 0xf000
1575 ; VI-NEXT: s_mov_b32 s6, -1
1576 ; VI-NEXT: s_mov_b32 s14, s6
1577 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1578 ; VI-NEXT: s_mov_b32 s12, s2
1579 ; VI-NEXT: s_mov_b32 s13, s3
1580 ; VI-NEXT: s_mov_b32 s15, s7
1581 ; VI-NEXT: s_mov_b32 s10, s6
1582 ; VI-NEXT: s_mov_b32 s11, s7
1583 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1584 ; VI-NEXT: s_waitcnt vmcnt(0)
1585 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
1586 ; VI-NEXT: s_waitcnt vmcnt(0)
1587 ; VI-NEXT: s_mov_b32 s4, s0
1588 ; VI-NEXT: s_mov_b32 s5, s1
1589 ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1
1590 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1591 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; In the real-true16 run the second operand is first copied into the high
; half of v0 (v0.h), so the compare reads v0.l and v0.h.
1594 ; GFX11-TRUE16-LABEL: fcmp_f16_ngt:
1595 ; GFX11-TRUE16: ; %bb.0: ; %entry
1596 ; GFX11-TRUE16-NEXT: s_clause 0x1
1597 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1598 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1599 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
1600 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
1601 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
1602 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
1603 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
1604 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
1605 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1606 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
1607 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
1608 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1609 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1610 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1611 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1612 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
1613 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
1614 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
1615 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1616 ; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v0.h
1617 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1618 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1619 ; GFX11-TRUE16-NEXT: s_endpgm
1621 ; GFX11-FAKE16-LABEL: fcmp_f16_ngt:
1622 ; GFX11-FAKE16: ; %bb.0: ; %entry
1623 ; GFX11-FAKE16-NEXT: s_clause 0x1
1624 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1625 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1626 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
1627 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
1628 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
1629 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
1630 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
1631 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
1632 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1633 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
1634 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
1635 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1636 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1637 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1638 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1639 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
1640 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
1641 ; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
1642 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1643 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1644 ; GFX11-FAKE16-NEXT: s_endpgm
1646 ; GFX12-LABEL: fcmp_f16_ngt:
1647 ; GFX12: ; %bb.0: ; %entry
1648 ; GFX12-NEXT: s_clause 0x1
1649 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1650 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1651 ; GFX12-NEXT: s_mov_b32 s10, -1
1652 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
1653 ; GFX12-NEXT: s_mov_b32 s14, s10
1654 ; GFX12-NEXT: s_mov_b32 s15, s11
1655 ; GFX12-NEXT: s_mov_b32 s6, s10
1656 ; GFX12-NEXT: s_mov_b32 s7, s11
1657 ; GFX12-NEXT: s_wait_kmcnt 0x0
1658 ; GFX12-NEXT: s_mov_b32 s12, s2
1659 ; GFX12-NEXT: s_mov_b32 s13, s3
1660 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1661 ; GFX12-NEXT: s_wait_loadcnt 0x0
1662 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1663 ; GFX12-NEXT: s_wait_loadcnt 0x0
1664 ; GFX12-NEXT: s_mov_b32 s8, s0
1665 ; GFX12-NEXT: s_mov_b32 s9, s1
1666 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
1667 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1668 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
1669 ; GFX12-NEXT: s_endpgm
; Kernel arguments: %r is the i32 result pointer, %a and %b are the operands.
1670 ptr addrspace(1) %r,
1671 ptr addrspace(1) %a,
1672 ptr addrspace(1) %b) {
; Both operand loads are volatile; each expected sequence above waits for the
; first load to complete before issuing the second.
1674 %a.val = load volatile half, ptr addrspace(1) %a
1675 %b.val = load volatile half, ptr addrspace(1) %b
; ule: true when the operands are unordered or %a.val <= %b.val.
1676 %r.val = fcmp ule half %a.val, %b.val
; sext of i1 yields 0 or -1, matching the expected v_cndmask operands.
1677 %r.val.sext = sext i1 %r.val to i32
1678 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_nle: loads two volatile half values from %a and %b, compares them
; with 'fcmp ugt', sign-extends the i1 result to i32 and stores it to %r.
; The assertions below expect the compare to be selected as a v_cmp_nle
; instruction: on f32 after two v_cvt_f32_f16 conversions in the SI run, and
; directly on f16 in the VI, GFX11 and GFX12 runs. The sign-extension is
; expected as a v_cndmask_b32_e64 that picks between 0 and -1.
1682 define amdgpu_kernel void @fcmp_f16_nle(
1683 ; SI-LABEL: fcmp_f16_nle:
1684 ; SI: ; %bb.0: ; %entry
1685 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1686 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1687 ; SI-NEXT: s_mov_b32 s11, 0xf000
1688 ; SI-NEXT: s_mov_b32 s10, -1
1689 ; SI-NEXT: s_mov_b32 s14, s10
1690 ; SI-NEXT: s_mov_b32 s15, s11
1691 ; SI-NEXT: s_mov_b32 s6, s10
1692 ; SI-NEXT: s_mov_b32 s7, s11
1693 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1694 ; SI-NEXT: s_mov_b32 s12, s2
1695 ; SI-NEXT: s_mov_b32 s13, s3
1696 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1697 ; SI-NEXT: s_waitcnt vmcnt(0)
1698 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
1699 ; SI-NEXT: s_waitcnt vmcnt(0)
1700 ; SI-NEXT: s_mov_b32 s8, s0
1701 ; SI-NEXT: s_mov_b32 s9, s1
1702 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1703 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1704 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
1705 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1706 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1709 ; VI-LABEL: fcmp_f16_nle:
1710 ; VI: ; %bb.0: ; %entry
1711 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1712 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
1713 ; VI-NEXT: s_mov_b32 s7, 0xf000
1714 ; VI-NEXT: s_mov_b32 s6, -1
1715 ; VI-NEXT: s_mov_b32 s14, s6
1716 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1717 ; VI-NEXT: s_mov_b32 s12, s2
1718 ; VI-NEXT: s_mov_b32 s13, s3
1719 ; VI-NEXT: s_mov_b32 s15, s7
1720 ; VI-NEXT: s_mov_b32 s10, s6
1721 ; VI-NEXT: s_mov_b32 s11, s7
1722 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1723 ; VI-NEXT: s_waitcnt vmcnt(0)
1724 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
1725 ; VI-NEXT: s_waitcnt vmcnt(0)
1726 ; VI-NEXT: s_mov_b32 s4, s0
1727 ; VI-NEXT: s_mov_b32 s5, s1
1728 ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1
1729 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1730 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; In the real-true16 run the second operand is first copied into the high
; half of v0 (v0.h), so the compare reads v0.l and v0.h.
1733 ; GFX11-TRUE16-LABEL: fcmp_f16_nle:
1734 ; GFX11-TRUE16: ; %bb.0: ; %entry
1735 ; GFX11-TRUE16-NEXT: s_clause 0x1
1736 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1737 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1738 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
1739 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
1740 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
1741 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
1742 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
1743 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
1744 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1745 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
1746 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
1747 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1748 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1749 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1750 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1751 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
1752 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
1753 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
1754 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1755 ; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v0.h
1756 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1757 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1758 ; GFX11-TRUE16-NEXT: s_endpgm
1760 ; GFX11-FAKE16-LABEL: fcmp_f16_nle:
1761 ; GFX11-FAKE16: ; %bb.0: ; %entry
1762 ; GFX11-FAKE16-NEXT: s_clause 0x1
1763 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1764 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1765 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
1766 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
1767 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
1768 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
1769 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
1770 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
1771 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1772 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
1773 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
1774 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1775 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1776 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1777 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1778 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
1779 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
1780 ; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
1781 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1782 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1783 ; GFX11-FAKE16-NEXT: s_endpgm
1785 ; GFX12-LABEL: fcmp_f16_nle:
1786 ; GFX12: ; %bb.0: ; %entry
1787 ; GFX12-NEXT: s_clause 0x1
1788 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1789 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1790 ; GFX12-NEXT: s_mov_b32 s10, -1
1791 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
1792 ; GFX12-NEXT: s_mov_b32 s14, s10
1793 ; GFX12-NEXT: s_mov_b32 s15, s11
1794 ; GFX12-NEXT: s_mov_b32 s6, s10
1795 ; GFX12-NEXT: s_mov_b32 s7, s11
1796 ; GFX12-NEXT: s_wait_kmcnt 0x0
1797 ; GFX12-NEXT: s_mov_b32 s12, s2
1798 ; GFX12-NEXT: s_mov_b32 s13, s3
1799 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1800 ; GFX12-NEXT: s_wait_loadcnt 0x0
1801 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1802 ; GFX12-NEXT: s_wait_loadcnt 0x0
1803 ; GFX12-NEXT: s_mov_b32 s8, s0
1804 ; GFX12-NEXT: s_mov_b32 s9, s1
1805 ; GFX12-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
1806 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1807 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
1808 ; GFX12-NEXT: s_endpgm
; Kernel arguments: %r is the i32 result pointer, %a and %b are the operands.
1809 ptr addrspace(1) %r,
1810 ptr addrspace(1) %a,
1811 ptr addrspace(1) %b) {
; Both operand loads are volatile; each expected sequence above waits for the
; first load to complete before issuing the second.
1813 %a.val = load volatile half, ptr addrspace(1) %a
1814 %b.val = load volatile half, ptr addrspace(1) %b
; ugt: true when the operands are unordered or %a.val > %b.val.
1815 %r.val = fcmp ugt half %a.val, %b.val
; sext of i1 yields 0 or -1, matching the expected v_cndmask operands.
1816 %r.val.sext = sext i1 %r.val to i32
1817 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_neq: loads two volatile half values from %a and %b, compares them
; with 'fcmp une', sign-extends the i1 result to i32 and stores it to %r.
; The assertions below expect the compare to be selected as a v_cmp_neq
; instruction: on f32 after two v_cvt_f32_f16 conversions in the SI run, and
; directly on f16 in the VI, GFX11 and GFX12 runs. The sign-extension is
; expected as a v_cndmask_b32_e64 that picks between 0 and -1.
1821 define amdgpu_kernel void @fcmp_f16_neq(
1822 ; SI-LABEL: fcmp_f16_neq:
1823 ; SI: ; %bb.0: ; %entry
1824 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1825 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1826 ; SI-NEXT: s_mov_b32 s11, 0xf000
1827 ; SI-NEXT: s_mov_b32 s10, -1
1828 ; SI-NEXT: s_mov_b32 s14, s10
1829 ; SI-NEXT: s_mov_b32 s15, s11
1830 ; SI-NEXT: s_mov_b32 s6, s10
1831 ; SI-NEXT: s_mov_b32 s7, s11
1832 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1833 ; SI-NEXT: s_mov_b32 s12, s2
1834 ; SI-NEXT: s_mov_b32 s13, s3
1835 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1836 ; SI-NEXT: s_waitcnt vmcnt(0)
1837 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
1838 ; SI-NEXT: s_waitcnt vmcnt(0)
1839 ; SI-NEXT: s_mov_b32 s8, s0
1840 ; SI-NEXT: s_mov_b32 s9, s1
1841 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1842 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1843 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
1844 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1845 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1848 ; VI-LABEL: fcmp_f16_neq:
1849 ; VI: ; %bb.0: ; %entry
1850 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1851 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
1852 ; VI-NEXT: s_mov_b32 s7, 0xf000
1853 ; VI-NEXT: s_mov_b32 s6, -1
1854 ; VI-NEXT: s_mov_b32 s14, s6
1855 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1856 ; VI-NEXT: s_mov_b32 s12, s2
1857 ; VI-NEXT: s_mov_b32 s13, s3
1858 ; VI-NEXT: s_mov_b32 s15, s7
1859 ; VI-NEXT: s_mov_b32 s10, s6
1860 ; VI-NEXT: s_mov_b32 s11, s7
1861 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1862 ; VI-NEXT: s_waitcnt vmcnt(0)
1863 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
1864 ; VI-NEXT: s_waitcnt vmcnt(0)
1865 ; VI-NEXT: s_mov_b32 s4, s0
1866 ; VI-NEXT: s_mov_b32 s5, s1
1867 ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1
1868 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1869 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; In the real-true16 run the second operand is first copied into the high
; half of v0 (v0.h), so the compare reads v0.l and v0.h.
1872 ; GFX11-TRUE16-LABEL: fcmp_f16_neq:
1873 ; GFX11-TRUE16: ; %bb.0: ; %entry
1874 ; GFX11-TRUE16-NEXT: s_clause 0x1
1875 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1876 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1877 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
1878 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
1879 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
1880 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
1881 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
1882 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
1883 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1884 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
1885 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
1886 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1887 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1888 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1889 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1890 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
1891 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
1892 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
1893 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1894 ; GFX11-TRUE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0.l, v0.h
1895 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1896 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1897 ; GFX11-TRUE16-NEXT: s_endpgm
1899 ; GFX11-FAKE16-LABEL: fcmp_f16_neq:
1900 ; GFX11-FAKE16: ; %bb.0: ; %entry
1901 ; GFX11-FAKE16-NEXT: s_clause 0x1
1902 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1903 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1904 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
1905 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
1906 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
1907 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
1908 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
1909 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
1910 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1911 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
1912 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
1913 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1914 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1915 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1916 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
1917 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
1918 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
1919 ; GFX11-FAKE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
1920 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1921 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1922 ; GFX11-FAKE16-NEXT: s_endpgm
1924 ; GFX12-LABEL: fcmp_f16_neq:
1925 ; GFX12: ; %bb.0: ; %entry
1926 ; GFX12-NEXT: s_clause 0x1
1927 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1928 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1929 ; GFX12-NEXT: s_mov_b32 s10, -1
1930 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
1931 ; GFX12-NEXT: s_mov_b32 s14, s10
1932 ; GFX12-NEXT: s_mov_b32 s15, s11
1933 ; GFX12-NEXT: s_mov_b32 s6, s10
1934 ; GFX12-NEXT: s_mov_b32 s7, s11
1935 ; GFX12-NEXT: s_wait_kmcnt 0x0
1936 ; GFX12-NEXT: s_mov_b32 s12, s2
1937 ; GFX12-NEXT: s_mov_b32 s13, s3
1938 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1939 ; GFX12-NEXT: s_wait_loadcnt 0x0
1940 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1941 ; GFX12-NEXT: s_wait_loadcnt 0x0
1942 ; GFX12-NEXT: s_mov_b32 s8, s0
1943 ; GFX12-NEXT: s_mov_b32 s9, s1
1944 ; GFX12-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
1945 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1946 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
1947 ; GFX12-NEXT: s_endpgm
; Kernel arguments: %r is the i32 result pointer, %a and %b are the operands.
1948 ptr addrspace(1) %r,
1949 ptr addrspace(1) %a,
1950 ptr addrspace(1) %b) {
; Both operand loads are volatile; each expected sequence above waits for the
; first load to complete before issuing the second.
1952 %a.val = load volatile half, ptr addrspace(1) %a
1953 %b.val = load volatile half, ptr addrspace(1) %b
; une: true when the operands are unordered or %a.val != %b.val.
1954 %r.val = fcmp une half %a.val, %b.val
; sext of i1 yields 0 or -1, matching the expected v_cndmask operands.
1955 %r.val.sext = sext i1 %r.val to i32
1956 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_f16_nlt: loads two volatile half values from %a and %b, compares them
; with 'fcmp uge', sign-extends the i1 result to i32 and stores it to %r.
; The assertions below expect the compare to be selected as a v_cmp_nlt
; instruction: on f32 after two v_cvt_f32_f16 conversions in the SI run, and
; directly on f16 in the VI, GFX11 and GFX12 runs. The sign-extension is
; expected as a v_cndmask_b32_e64 that picks between 0 and -1.
1960 define amdgpu_kernel void @fcmp_f16_nlt(
1961 ; SI-LABEL: fcmp_f16_nlt:
1962 ; SI: ; %bb.0: ; %entry
1963 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1964 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1965 ; SI-NEXT: s_mov_b32 s11, 0xf000
1966 ; SI-NEXT: s_mov_b32 s10, -1
1967 ; SI-NEXT: s_mov_b32 s14, s10
1968 ; SI-NEXT: s_mov_b32 s15, s11
1969 ; SI-NEXT: s_mov_b32 s6, s10
1970 ; SI-NEXT: s_mov_b32 s7, s11
1971 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1972 ; SI-NEXT: s_mov_b32 s12, s2
1973 ; SI-NEXT: s_mov_b32 s13, s3
1974 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
1975 ; SI-NEXT: s_waitcnt vmcnt(0)
1976 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
1977 ; SI-NEXT: s_waitcnt vmcnt(0)
1978 ; SI-NEXT: s_mov_b32 s8, s0
1979 ; SI-NEXT: s_mov_b32 s9, s1
1980 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1981 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1982 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
1983 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
1984 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1987 ; VI-LABEL: fcmp_f16_nlt:
1988 ; VI: ; %bb.0: ; %entry
1989 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1990 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
1991 ; VI-NEXT: s_mov_b32 s7, 0xf000
1992 ; VI-NEXT: s_mov_b32 s6, -1
1993 ; VI-NEXT: s_mov_b32 s14, s6
1994 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1995 ; VI-NEXT: s_mov_b32 s12, s2
1996 ; VI-NEXT: s_mov_b32 s13, s3
1997 ; VI-NEXT: s_mov_b32 s15, s7
1998 ; VI-NEXT: s_mov_b32 s10, s6
1999 ; VI-NEXT: s_mov_b32 s11, s7
2000 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
2001 ; VI-NEXT: s_waitcnt vmcnt(0)
2002 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
2003 ; VI-NEXT: s_waitcnt vmcnt(0)
2004 ; VI-NEXT: s_mov_b32 s4, s0
2005 ; VI-NEXT: s_mov_b32 s5, s1
2006 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
2007 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2008 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; In the real-true16 run the second operand is first copied into the high
; half of v0 (v0.h), so the compare reads v0.l and v0.h.
2011 ; GFX11-TRUE16-LABEL: fcmp_f16_nlt:
2012 ; GFX11-TRUE16: ; %bb.0: ; %entry
2013 ; GFX11-TRUE16-NEXT: s_clause 0x1
2014 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2015 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2016 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
2017 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
2018 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
2019 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
2020 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
2021 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
2022 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
2023 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
2024 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
2025 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
2026 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2027 ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
2028 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2029 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
2030 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
2031 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
2032 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
2033 ; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.l, v0.h
2034 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2035 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
2036 ; GFX11-TRUE16-NEXT: s_endpgm
2038 ; GFX11-FAKE16-LABEL: fcmp_f16_nlt:
2039 ; GFX11-FAKE16: ; %bb.0: ; %entry
2040 ; GFX11-FAKE16-NEXT: s_clause 0x1
2041 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2042 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2043 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
2044 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
2045 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
2046 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
2047 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
2048 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
2049 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
2050 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
2051 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
2052 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
2053 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
2054 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
2055 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
2056 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
2057 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
2058 ; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
2059 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2060 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
2061 ; GFX11-FAKE16-NEXT: s_endpgm
2063 ; GFX12-LABEL: fcmp_f16_nlt:
2064 ; GFX12: ; %bb.0: ; %entry
2065 ; GFX12-NEXT: s_clause 0x1
2066 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2067 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2068 ; GFX12-NEXT: s_mov_b32 s10, -1
2069 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
2070 ; GFX12-NEXT: s_mov_b32 s14, s10
2071 ; GFX12-NEXT: s_mov_b32 s15, s11
2072 ; GFX12-NEXT: s_mov_b32 s6, s10
2073 ; GFX12-NEXT: s_mov_b32 s7, s11
2074 ; GFX12-NEXT: s_wait_kmcnt 0x0
2075 ; GFX12-NEXT: s_mov_b32 s12, s2
2076 ; GFX12-NEXT: s_mov_b32 s13, s3
2077 ; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
2078 ; GFX12-NEXT: s_wait_loadcnt 0x0
2079 ; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
2080 ; GFX12-NEXT: s_wait_loadcnt 0x0
2081 ; GFX12-NEXT: s_mov_b32 s8, s0
2082 ; GFX12-NEXT: s_mov_b32 s9, s1
2083 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
2084 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2085 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
2086 ; GFX12-NEXT: s_endpgm
; Kernel arguments: %r is the i32 result pointer, %a and %b are the operands.
2087 ptr addrspace(1) %r,
2088 ptr addrspace(1) %a,
2089 ptr addrspace(1) %b) {
; Both operand loads are volatile; each expected sequence above waits for the
; first load to complete before issuing the second.
2091 %a.val = load volatile half, ptr addrspace(1) %a
2092 %b.val = load volatile half, ptr addrspace(1) %b
; uge: true when the operands are unordered or %a.val >= %b.val.
2093 %r.val = fcmp uge half %a.val, %b.val
; sext of i1 yields 0 or -1, matching the expected v_cndmask operands.
2094 %r.val.sext = sext i1 %r.val to i32
2095 store i32 %r.val.sext, ptr addrspace(1) %r
; fcmp_v2f16_lt: vector variant. Loads two <2 x half> values from %a and %b
; (non-volatile), compares them lane-wise with 'fcmp olt', sign-extends the
; <2 x i1> result to <2 x i32> and stores it to %r.
; The assertions below expect one 32-bit load per operand and two scalar
; compares: the low halves are compared as loaded, the high halves after a
; 16-bit right shift (v_lshrrev_b32 by 16). In the SI run all four halves are
; converted with v_cvt_f32_f16 and compared with v_cmp_lt_f32; the other runs
; expect v_cmp_lt_f16. Each lane result becomes 0 or -1 through v_cndmask and
; both lanes are written with a single 64-bit store.
2099 define amdgpu_kernel void @fcmp_v2f16_lt(
2100 ; SI-LABEL: fcmp_v2f16_lt:
2101 ; SI: ; %bb.0: ; %entry
2102 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2103 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2104 ; SI-NEXT: s_mov_b32 s11, 0xf000
2105 ; SI-NEXT: s_mov_b32 s10, -1
2106 ; SI-NEXT: s_mov_b32 s14, s10
2107 ; SI-NEXT: s_mov_b32 s15, s11
2108 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2109 ; SI-NEXT: s_mov_b32 s12, s2
2110 ; SI-NEXT: s_mov_b32 s13, s3
2111 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2112 ; SI-NEXT: s_mov_b32 s6, s10
2113 ; SI-NEXT: s_mov_b32 s7, s11
2114 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
2115 ; SI-NEXT: s_mov_b32 s8, s0
2116 ; SI-NEXT: s_mov_b32 s9, s1
2117 ; SI-NEXT: s_waitcnt vmcnt(1)
2118 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2119 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2120 ; SI-NEXT: s_waitcnt vmcnt(0)
2121 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2122 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2123 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2124 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2125 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3
2126 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2127 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v1
2128 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2129 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; From here on the expected code issues the load through the descriptor built
; from the 0x34 kernarg load first (into v0) and the other operand second
; (into v1); the compares then read v1 before v0.
2132 ; VI-LABEL: fcmp_v2f16_lt:
2133 ; VI: ; %bb.0: ; %entry
2134 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2135 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
2136 ; VI-NEXT: s_mov_b32 s7, 0xf000
2137 ; VI-NEXT: s_mov_b32 s6, -1
2138 ; VI-NEXT: s_mov_b32 s10, s6
2139 ; VI-NEXT: s_mov_b32 s11, s7
2140 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2141 ; VI-NEXT: s_mov_b32 s12, s2
2142 ; VI-NEXT: s_mov_b32 s13, s3
2143 ; VI-NEXT: s_mov_b32 s14, s6
2144 ; VI-NEXT: s_mov_b32 s15, s7
2145 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2146 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2147 ; VI-NEXT: s_mov_b32 s4, s0
2148 ; VI-NEXT: s_mov_b32 s5, s1
2149 ; VI-NEXT: s_waitcnt vmcnt(1)
2150 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2151 ; VI-NEXT: s_waitcnt vmcnt(0)
2152 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2153 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0
2154 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2155 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2
2156 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2157 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; In the real-true16 run the compares name the low 16-bit halves explicitly
; (v1.l, v0.l and v3.l, v2.l).
2160 ; GFX11-TRUE16-LABEL: fcmp_v2f16_lt:
2161 ; GFX11-TRUE16: ; %bb.0: ; %entry
2162 ; GFX11-TRUE16-NEXT: s_clause 0x1
2163 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2164 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2165 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
2166 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
2167 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
2168 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
2169 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
2170 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
2171 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
2172 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
2173 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
2174 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2175 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2176 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
2177 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
2178 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
2179 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2180 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2181 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2182 ; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
2183 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2184 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
2185 ; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3.l, v2.l
2186 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2187 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2188 ; GFX11-TRUE16-NEXT: s_endpgm
2190 ; GFX11-FAKE16-LABEL: fcmp_v2f16_lt:
2191 ; GFX11-FAKE16: ; %bb.0: ; %entry
2192 ; GFX11-FAKE16-NEXT: s_clause 0x1
2193 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2194 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2195 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
2196 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
2197 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
2198 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
2199 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
2200 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
2201 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
2202 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
2203 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
2204 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2205 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2206 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
2207 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
2208 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
2209 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2210 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
2211 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2212 ; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0
2213 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2214 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
2215 ; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2
2216 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2217 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2218 ; GFX11-FAKE16-NEXT: s_endpgm
; The GFX12 run additionally expects an s_wait_alu between the second compare
; and its v_cndmask.
2220 ; GFX12-LABEL: fcmp_v2f16_lt:
2221 ; GFX12: ; %bb.0: ; %entry
2222 ; GFX12-NEXT: s_clause 0x1
2223 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2224 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2225 ; GFX12-NEXT: s_mov_b32 s10, -1
2226 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
2227 ; GFX12-NEXT: s_mov_b32 s6, s10
2228 ; GFX12-NEXT: s_mov_b32 s7, s11
2229 ; GFX12-NEXT: s_mov_b32 s14, s10
2230 ; GFX12-NEXT: s_mov_b32 s15, s11
2231 ; GFX12-NEXT: s_wait_kmcnt 0x0
2232 ; GFX12-NEXT: s_mov_b32 s12, s2
2233 ; GFX12-NEXT: s_mov_b32 s13, s3
2234 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
2235 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
2236 ; GFX12-NEXT: s_mov_b32 s8, s0
2237 ; GFX12-NEXT: s_mov_b32 s9, s1
2238 ; GFX12-NEXT: s_wait_loadcnt 0x1
2239 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2240 ; GFX12-NEXT: s_wait_loadcnt 0x0
2241 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2242 ; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0
2243 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2244 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2245 ; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2
2246 ; GFX12-NEXT: s_wait_alu 0xfffd
2247 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2248 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2249 ; GFX12-NEXT: s_endpgm
; Kernel arguments: %r is the <2 x i32> result pointer, %a and %b are the
; operands.
2250 ptr addrspace(1) %r,
2251 ptr addrspace(1) %a,
2252 ptr addrspace(1) %b) {
; Unlike the scalar tests these loads are not volatile, so the expected code
; issues both loads back to back before waiting on either.
2254 %a.val = load <2 x half>, ptr addrspace(1) %a
2255 %b.val = load <2 x half>, ptr addrspace(1) %b
; olt: per lane, true only when the operands are ordered and %a.val < %b.val.
2256 %r.val = fcmp olt <2 x half> %a.val, %b.val
; sext of each i1 lane yields 0 or -1, matching the expected v_cndmask operands.
2257 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2258 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
2263 define amdgpu_kernel void @fcmp_v2f16_eq(
2264 ; SI-LABEL: fcmp_v2f16_eq:
2265 ; SI: ; %bb.0: ; %entry
2266 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2267 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2268 ; SI-NEXT: s_mov_b32 s11, 0xf000
2269 ; SI-NEXT: s_mov_b32 s10, -1
2270 ; SI-NEXT: s_mov_b32 s14, s10
2271 ; SI-NEXT: s_mov_b32 s15, s11
2272 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2273 ; SI-NEXT: s_mov_b32 s12, s2
2274 ; SI-NEXT: s_mov_b32 s13, s3
2275 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2276 ; SI-NEXT: s_mov_b32 s6, s10
2277 ; SI-NEXT: s_mov_b32 s7, s11
2278 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
2279 ; SI-NEXT: s_mov_b32 s8, s0
2280 ; SI-NEXT: s_mov_b32 s9, s1
2281 ; SI-NEXT: s_waitcnt vmcnt(1)
2282 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2283 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2284 ; SI-NEXT: s_waitcnt vmcnt(0)
2285 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2286 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2287 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2288 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2289 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v3
2290 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2291 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v1
2292 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2293 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2296 ; VI-LABEL: fcmp_v2f16_eq:
2297 ; VI: ; %bb.0: ; %entry
2298 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2299 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
2300 ; VI-NEXT: s_mov_b32 s7, 0xf000
2301 ; VI-NEXT: s_mov_b32 s6, -1
2302 ; VI-NEXT: s_mov_b32 s10, s6
2303 ; VI-NEXT: s_mov_b32 s11, s7
2304 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2305 ; VI-NEXT: s_mov_b32 s12, s2
2306 ; VI-NEXT: s_mov_b32 s13, s3
2307 ; VI-NEXT: s_mov_b32 s14, s6
2308 ; VI-NEXT: s_mov_b32 s15, s7
2309 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2310 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2311 ; VI-NEXT: s_mov_b32 s4, s0
2312 ; VI-NEXT: s_mov_b32 s5, s1
2313 ; VI-NEXT: s_waitcnt vmcnt(1)
2314 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2315 ; VI-NEXT: s_waitcnt vmcnt(0)
2316 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2317 ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v1, v0
2318 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2319 ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v3, v2
2320 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2321 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2324 ; GFX11-TRUE16-LABEL: fcmp_v2f16_eq:
2325 ; GFX11-TRUE16: ; %bb.0: ; %entry
2326 ; GFX11-TRUE16-NEXT: s_clause 0x1
2327 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2328 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2329 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
2330 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
2331 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
2332 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
2333 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
2334 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
2335 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
2336 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
2337 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
2338 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2339 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2340 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
2341 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
2342 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
2343 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2344 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2345 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2346 ; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v1.l, v0.l
2347 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2348 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
2349 ; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3.l, v2.l
2350 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2351 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2352 ; GFX11-TRUE16-NEXT: s_endpgm
2354 ; GFX11-FAKE16-LABEL: fcmp_v2f16_eq:
2355 ; GFX11-FAKE16: ; %bb.0: ; %entry
2356 ; GFX11-FAKE16-NEXT: s_clause 0x1
2357 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2358 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2359 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
2360 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
2361 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
2362 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
2363 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
2364 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
2365 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
2366 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
2367 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
2368 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2369 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2370 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
2371 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
2372 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
2373 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2374 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
2375 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2376 ; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v1, v0
2377 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2378 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
2379 ; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2
2380 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2381 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2382 ; GFX11-FAKE16-NEXT: s_endpgm
2384 ; GFX12-LABEL: fcmp_v2f16_eq:
2385 ; GFX12: ; %bb.0: ; %entry
2386 ; GFX12-NEXT: s_clause 0x1
2387 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2388 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2389 ; GFX12-NEXT: s_mov_b32 s10, -1
2390 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
2391 ; GFX12-NEXT: s_mov_b32 s6, s10
2392 ; GFX12-NEXT: s_mov_b32 s7, s11
2393 ; GFX12-NEXT: s_mov_b32 s14, s10
2394 ; GFX12-NEXT: s_mov_b32 s15, s11
2395 ; GFX12-NEXT: s_wait_kmcnt 0x0
2396 ; GFX12-NEXT: s_mov_b32 s12, s2
2397 ; GFX12-NEXT: s_mov_b32 s13, s3
2398 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
2399 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
2400 ; GFX12-NEXT: s_mov_b32 s8, s0
2401 ; GFX12-NEXT: s_mov_b32 s9, s1
2402 ; GFX12-NEXT: s_wait_loadcnt 0x1
2403 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2404 ; GFX12-NEXT: s_wait_loadcnt 0x0
2405 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2406 ; GFX12-NEXT: v_cmp_eq_f16_e32 vcc_lo, v1, v0
2407 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2408 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2409 ; GFX12-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2
2410 ; GFX12-NEXT: s_wait_alu 0xfffd
2411 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2412 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2413 ; GFX12-NEXT: s_endpgm
2414 ptr addrspace(1) %r,
2415 ptr addrspace(1) %a,
2416 ptr addrspace(1) %b) {
2418 %a.val = load <2 x half>, ptr addrspace(1) %a
2419 %b.val = load <2 x half>, ptr addrspace(1) %b
2420 %r.val = fcmp oeq <2 x half> %a.val, %b.val
2421 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2422 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel under test: loads two <2 x half> vectors from %a and %b, compares them
; with an ordered less-than-or-equal (fcmp ole), sign-extends the <2 x i1>
; result to <2 x i32>, and stores it to %r.
; Per the expected output below, the SI run converts each half to f32 and uses
; v_cmp_le_f32, while the VI, GFX11 and GFX12 runs compare with v_cmp_le_f16.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2426 define amdgpu_kernel void @fcmp_v2f16_le(
2427 ; SI-LABEL: fcmp_v2f16_le:
2428 ; SI: ; %bb.0: ; %entry
2429 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2430 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2431 ; SI-NEXT: s_mov_b32 s11, 0xf000
2432 ; SI-NEXT: s_mov_b32 s10, -1
2433 ; SI-NEXT: s_mov_b32 s14, s10
2434 ; SI-NEXT: s_mov_b32 s15, s11
2435 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2436 ; SI-NEXT: s_mov_b32 s12, s2
2437 ; SI-NEXT: s_mov_b32 s13, s3
2438 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2439 ; SI-NEXT: s_mov_b32 s6, s10
2440 ; SI-NEXT: s_mov_b32 s7, s11
2441 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
2442 ; SI-NEXT: s_mov_b32 s8, s0
2443 ; SI-NEXT: s_mov_b32 s9, s1
2444 ; SI-NEXT: s_waitcnt vmcnt(1)
2445 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2446 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2447 ; SI-NEXT: s_waitcnt vmcnt(0)
2448 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2449 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2450 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2451 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2452 ; SI-NEXT: v_cmp_le_f32_e32 vcc, v2, v3
2453 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2454 ; SI-NEXT: v_cmp_le_f32_e32 vcc, v4, v1
2455 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2456 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2459 ; VI-LABEL: fcmp_v2f16_le:
2460 ; VI: ; %bb.0: ; %entry
2461 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2462 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
2463 ; VI-NEXT: s_mov_b32 s7, 0xf000
2464 ; VI-NEXT: s_mov_b32 s6, -1
2465 ; VI-NEXT: s_mov_b32 s10, s6
2466 ; VI-NEXT: s_mov_b32 s11, s7
2467 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2468 ; VI-NEXT: s_mov_b32 s12, s2
2469 ; VI-NEXT: s_mov_b32 s13, s3
2470 ; VI-NEXT: s_mov_b32 s14, s6
2471 ; VI-NEXT: s_mov_b32 s15, s7
2472 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2473 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2474 ; VI-NEXT: s_mov_b32 s4, s0
2475 ; VI-NEXT: s_mov_b32 s5, s1
2476 ; VI-NEXT: s_waitcnt vmcnt(1)
2477 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2478 ; VI-NEXT: s_waitcnt vmcnt(0)
2479 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2480 ; VI-NEXT: v_cmp_le_f16_e32 vcc, v1, v0
2481 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2482 ; VI-NEXT: v_cmp_le_f16_e32 vcc, v3, v2
2483 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2484 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2487 ; GFX11-TRUE16-LABEL: fcmp_v2f16_le:
2488 ; GFX11-TRUE16: ; %bb.0: ; %entry
2489 ; GFX11-TRUE16-NEXT: s_clause 0x1
2490 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2491 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2492 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
2493 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
2494 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
2495 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
2496 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
2497 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
2498 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
2499 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
2500 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
2501 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2502 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2503 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
2504 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
2505 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
2506 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2507 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2508 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2509 ; GFX11-TRUE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v1.l, v0.l
2510 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2511 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
2512 ; GFX11-TRUE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v3.l, v2.l
2513 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2514 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2515 ; GFX11-TRUE16-NEXT: s_endpgm
2517 ; GFX11-FAKE16-LABEL: fcmp_v2f16_le:
2518 ; GFX11-FAKE16: ; %bb.0: ; %entry
2519 ; GFX11-FAKE16-NEXT: s_clause 0x1
2520 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2521 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2522 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
2523 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
2524 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
2525 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
2526 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
2527 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
2528 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
2529 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
2530 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
2531 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2532 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2533 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
2534 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
2535 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
2536 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2537 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
2538 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2539 ; GFX11-FAKE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v1, v0
2540 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2541 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
2542 ; GFX11-FAKE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2
2543 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2544 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2545 ; GFX11-FAKE16-NEXT: s_endpgm
2547 ; GFX12-LABEL: fcmp_v2f16_le:
2548 ; GFX12: ; %bb.0: ; %entry
2549 ; GFX12-NEXT: s_clause 0x1
2550 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2551 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2552 ; GFX12-NEXT: s_mov_b32 s10, -1
2553 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
2554 ; GFX12-NEXT: s_mov_b32 s6, s10
2555 ; GFX12-NEXT: s_mov_b32 s7, s11
2556 ; GFX12-NEXT: s_mov_b32 s14, s10
2557 ; GFX12-NEXT: s_mov_b32 s15, s11
2558 ; GFX12-NEXT: s_wait_kmcnt 0x0
2559 ; GFX12-NEXT: s_mov_b32 s12, s2
2560 ; GFX12-NEXT: s_mov_b32 s13, s3
2561 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
2562 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
2563 ; GFX12-NEXT: s_mov_b32 s8, s0
2564 ; GFX12-NEXT: s_mov_b32 s9, s1
2565 ; GFX12-NEXT: s_wait_loadcnt 0x1
2566 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2567 ; GFX12-NEXT: s_wait_loadcnt 0x0
2568 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2569 ; GFX12-NEXT: v_cmp_le_f16_e32 vcc_lo, v1, v0
2570 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2571 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2572 ; GFX12-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2
2573 ; GFX12-NEXT: s_wait_alu 0xfffd
2574 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2575 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2576 ; GFX12-NEXT: s_endpgm
2577 ptr addrspace(1) %r,
2578 ptr addrspace(1) %a,
2579 ptr addrspace(1) %b) {
2581 %a.val = load <2 x half>, ptr addrspace(1) %a
2582 %b.val = load <2 x half>, ptr addrspace(1) %b
2583 %r.val = fcmp ole <2 x half> %a.val, %b.val
2584 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2585 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel under test: loads two <2 x half> vectors from %a and %b, compares them
; with an ordered greater-than (fcmp ogt), sign-extends the <2 x i1> result to
; <2 x i32>, and stores it to %r.
; Per the expected output below, the SI run converts each half to f32 and uses
; v_cmp_gt_f32, while the VI, GFX11 and GFX12 runs compare with v_cmp_gt_f16.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2589 define amdgpu_kernel void @fcmp_v2f16_gt(
2590 ; SI-LABEL: fcmp_v2f16_gt:
2591 ; SI: ; %bb.0: ; %entry
2592 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2593 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2594 ; SI-NEXT: s_mov_b32 s11, 0xf000
2595 ; SI-NEXT: s_mov_b32 s10, -1
2596 ; SI-NEXT: s_mov_b32 s14, s10
2597 ; SI-NEXT: s_mov_b32 s15, s11
2598 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2599 ; SI-NEXT: s_mov_b32 s12, s2
2600 ; SI-NEXT: s_mov_b32 s13, s3
2601 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2602 ; SI-NEXT: s_mov_b32 s6, s10
2603 ; SI-NEXT: s_mov_b32 s7, s11
2604 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
2605 ; SI-NEXT: s_mov_b32 s8, s0
2606 ; SI-NEXT: s_mov_b32 s9, s1
2607 ; SI-NEXT: s_waitcnt vmcnt(1)
2608 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2609 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2610 ; SI-NEXT: s_waitcnt vmcnt(0)
2611 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2612 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2613 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2614 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2615 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3
2616 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2617 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, v4, v1
2618 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2619 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2622 ; VI-LABEL: fcmp_v2f16_gt:
2623 ; VI: ; %bb.0: ; %entry
2624 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2625 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
2626 ; VI-NEXT: s_mov_b32 s7, 0xf000
2627 ; VI-NEXT: s_mov_b32 s6, -1
2628 ; VI-NEXT: s_mov_b32 s10, s6
2629 ; VI-NEXT: s_mov_b32 s11, s7
2630 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2631 ; VI-NEXT: s_mov_b32 s12, s2
2632 ; VI-NEXT: s_mov_b32 s13, s3
2633 ; VI-NEXT: s_mov_b32 s14, s6
2634 ; VI-NEXT: s_mov_b32 s15, s7
2635 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2636 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2637 ; VI-NEXT: s_mov_b32 s4, s0
2638 ; VI-NEXT: s_mov_b32 s5, s1
2639 ; VI-NEXT: s_waitcnt vmcnt(1)
2640 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2641 ; VI-NEXT: s_waitcnt vmcnt(0)
2642 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2643 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v1, v0
2644 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2645 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2
2646 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2647 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2650 ; GFX11-TRUE16-LABEL: fcmp_v2f16_gt:
2651 ; GFX11-TRUE16: ; %bb.0: ; %entry
2652 ; GFX11-TRUE16-NEXT: s_clause 0x1
2653 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2654 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2655 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
2656 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
2657 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
2658 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
2659 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
2660 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
2661 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
2662 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
2663 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
2664 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2665 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2666 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
2667 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
2668 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
2669 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2670 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2671 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2672 ; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v1.l, v0.l
2673 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2674 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
2675 ; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3.l, v2.l
2676 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2677 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2678 ; GFX11-TRUE16-NEXT: s_endpgm
2680 ; GFX11-FAKE16-LABEL: fcmp_v2f16_gt:
2681 ; GFX11-FAKE16: ; %bb.0: ; %entry
2682 ; GFX11-FAKE16-NEXT: s_clause 0x1
2683 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2684 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2685 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
2686 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
2687 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
2688 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
2689 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
2690 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
2691 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
2692 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
2693 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
2694 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2695 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2696 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
2697 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
2698 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
2699 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2700 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
2701 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2702 ; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v1, v0
2703 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2704 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
2705 ; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2
2706 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2707 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2708 ; GFX11-FAKE16-NEXT: s_endpgm
2710 ; GFX12-LABEL: fcmp_v2f16_gt:
2711 ; GFX12: ; %bb.0: ; %entry
2712 ; GFX12-NEXT: s_clause 0x1
2713 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2714 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2715 ; GFX12-NEXT: s_mov_b32 s10, -1
2716 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
2717 ; GFX12-NEXT: s_mov_b32 s6, s10
2718 ; GFX12-NEXT: s_mov_b32 s7, s11
2719 ; GFX12-NEXT: s_mov_b32 s14, s10
2720 ; GFX12-NEXT: s_mov_b32 s15, s11
2721 ; GFX12-NEXT: s_wait_kmcnt 0x0
2722 ; GFX12-NEXT: s_mov_b32 s12, s2
2723 ; GFX12-NEXT: s_mov_b32 s13, s3
2724 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
2725 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
2726 ; GFX12-NEXT: s_mov_b32 s8, s0
2727 ; GFX12-NEXT: s_mov_b32 s9, s1
2728 ; GFX12-NEXT: s_wait_loadcnt 0x1
2729 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2730 ; GFX12-NEXT: s_wait_loadcnt 0x0
2731 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2732 ; GFX12-NEXT: v_cmp_gt_f16_e32 vcc_lo, v1, v0
2733 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2734 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2735 ; GFX12-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2
2736 ; GFX12-NEXT: s_wait_alu 0xfffd
2737 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2738 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2739 ; GFX12-NEXT: s_endpgm
2740 ptr addrspace(1) %r,
2741 ptr addrspace(1) %a,
2742 ptr addrspace(1) %b) {
2744 %a.val = load <2 x half>, ptr addrspace(1) %a
2745 %b.val = load <2 x half>, ptr addrspace(1) %b
2746 %r.val = fcmp ogt <2 x half> %a.val, %b.val
2747 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2748 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel under test: loads two <2 x half> vectors from %a and %b, compares them
; with an ordered not-equal (fcmp one), sign-extends the <2 x i1> result to
; <2 x i32>, and stores it to %r.
; Per the expected output below, the SI run converts each half to f32 and uses
; v_cmp_lg_f32, while the VI, GFX11 and GFX12 runs compare with v_cmp_lg_f16.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2753 define amdgpu_kernel void @fcmp_v2f16_lg(
2754 ; SI-LABEL: fcmp_v2f16_lg:
2755 ; SI: ; %bb.0: ; %entry
2756 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2757 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2758 ; SI-NEXT: s_mov_b32 s11, 0xf000
2759 ; SI-NEXT: s_mov_b32 s10, -1
2760 ; SI-NEXT: s_mov_b32 s14, s10
2761 ; SI-NEXT: s_mov_b32 s15, s11
2762 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2763 ; SI-NEXT: s_mov_b32 s12, s2
2764 ; SI-NEXT: s_mov_b32 s13, s3
2765 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2766 ; SI-NEXT: s_mov_b32 s6, s10
2767 ; SI-NEXT: s_mov_b32 s7, s11
2768 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
2769 ; SI-NEXT: s_mov_b32 s8, s0
2770 ; SI-NEXT: s_mov_b32 s9, s1
2771 ; SI-NEXT: s_waitcnt vmcnt(1)
2772 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2773 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2774 ; SI-NEXT: s_waitcnt vmcnt(0)
2775 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2776 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2777 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2778 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2779 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, v2, v3
2780 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2781 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, v4, v1
2782 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2783 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2786 ; VI-LABEL: fcmp_v2f16_lg:
2787 ; VI: ; %bb.0: ; %entry
2788 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2789 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
2790 ; VI-NEXT: s_mov_b32 s7, 0xf000
2791 ; VI-NEXT: s_mov_b32 s6, -1
2792 ; VI-NEXT: s_mov_b32 s10, s6
2793 ; VI-NEXT: s_mov_b32 s11, s7
2794 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2795 ; VI-NEXT: s_mov_b32 s12, s2
2796 ; VI-NEXT: s_mov_b32 s13, s3
2797 ; VI-NEXT: s_mov_b32 s14, s6
2798 ; VI-NEXT: s_mov_b32 s15, s7
2799 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2800 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2801 ; VI-NEXT: s_mov_b32 s4, s0
2802 ; VI-NEXT: s_mov_b32 s5, s1
2803 ; VI-NEXT: s_waitcnt vmcnt(1)
2804 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2805 ; VI-NEXT: s_waitcnt vmcnt(0)
2806 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2807 ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v1, v0
2808 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2809 ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v3, v2
2810 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2811 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2814 ; GFX11-TRUE16-LABEL: fcmp_v2f16_lg:
2815 ; GFX11-TRUE16: ; %bb.0: ; %entry
2816 ; GFX11-TRUE16-NEXT: s_clause 0x1
2817 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2818 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2819 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
2820 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
2821 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
2822 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
2823 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
2824 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
2825 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
2826 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
2827 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
2828 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2829 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2830 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
2831 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
2832 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
2833 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2834 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2835 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2836 ; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v1.l, v0.l
2837 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2838 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
2839 ; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3.l, v2.l
2840 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2841 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2842 ; GFX11-TRUE16-NEXT: s_endpgm
2844 ; GFX11-FAKE16-LABEL: fcmp_v2f16_lg:
2845 ; GFX11-FAKE16: ; %bb.0: ; %entry
2846 ; GFX11-FAKE16-NEXT: s_clause 0x1
2847 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2848 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2849 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
2850 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
2851 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
2852 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
2853 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
2854 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
2855 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
2856 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
2857 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
2858 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2859 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2860 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
2861 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
2862 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
2863 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2864 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
2865 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2866 ; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v1, v0
2867 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2868 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
2869 ; GFX11-FAKE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2
2870 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2871 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2872 ; GFX11-FAKE16-NEXT: s_endpgm
2874 ; GFX12-LABEL: fcmp_v2f16_lg:
2875 ; GFX12: ; %bb.0: ; %entry
2876 ; GFX12-NEXT: s_clause 0x1
2877 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2878 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2879 ; GFX12-NEXT: s_mov_b32 s10, -1
2880 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
2881 ; GFX12-NEXT: s_mov_b32 s6, s10
2882 ; GFX12-NEXT: s_mov_b32 s7, s11
2883 ; GFX12-NEXT: s_mov_b32 s14, s10
2884 ; GFX12-NEXT: s_mov_b32 s15, s11
2885 ; GFX12-NEXT: s_wait_kmcnt 0x0
2886 ; GFX12-NEXT: s_mov_b32 s12, s2
2887 ; GFX12-NEXT: s_mov_b32 s13, s3
2888 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
2889 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
2890 ; GFX12-NEXT: s_mov_b32 s8, s0
2891 ; GFX12-NEXT: s_mov_b32 s9, s1
2892 ; GFX12-NEXT: s_wait_loadcnt 0x1
2893 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2894 ; GFX12-NEXT: s_wait_loadcnt 0x0
2895 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2896 ; GFX12-NEXT: v_cmp_lg_f16_e32 vcc_lo, v1, v0
2897 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2898 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2899 ; GFX12-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2
2900 ; GFX12-NEXT: s_wait_alu 0xfffd
2901 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2902 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2903 ; GFX12-NEXT: s_endpgm
2904 ptr addrspace(1) %r,
2905 ptr addrspace(1) %a,
2906 ptr addrspace(1) %b) {
2908 %a.val = load <2 x half>, ptr addrspace(1) %a
2909 %b.val = load <2 x half>, ptr addrspace(1) %b
2910 %r.val = fcmp one <2 x half> %a.val, %b.val
2911 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2912 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Kernel under test: loads two <2 x half> vectors from %a and %b, compares them
; with an ordered greater-than-or-equal (fcmp oge), sign-extends the <2 x i1>
; result to <2 x i32>, and stores it to %r.
; Per the expected output below, the SI run converts each half to f32 and uses
; v_cmp_ge_f32, while the VI, GFX11 and GFX12 runs compare with v_cmp_ge_f16.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2917 define amdgpu_kernel void @fcmp_v2f16_ge(
2918 ; SI-LABEL: fcmp_v2f16_ge:
2919 ; SI: ; %bb.0: ; %entry
2920 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
2921 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
2922 ; SI-NEXT: s_mov_b32 s11, 0xf000
2923 ; SI-NEXT: s_mov_b32 s10, -1
2924 ; SI-NEXT: s_mov_b32 s14, s10
2925 ; SI-NEXT: s_mov_b32 s15, s11
2926 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2927 ; SI-NEXT: s_mov_b32 s12, s2
2928 ; SI-NEXT: s_mov_b32 s13, s3
2929 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
2930 ; SI-NEXT: s_mov_b32 s6, s10
2931 ; SI-NEXT: s_mov_b32 s7, s11
2932 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
2933 ; SI-NEXT: s_mov_b32 s8, s0
2934 ; SI-NEXT: s_mov_b32 s9, s1
2935 ; SI-NEXT: s_waitcnt vmcnt(1)
2936 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2937 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2938 ; SI-NEXT: s_waitcnt vmcnt(0)
2939 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
2940 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2941 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
2942 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2943 ; SI-NEXT: v_cmp_ge_f32_e32 vcc, v2, v3
2944 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2945 ; SI-NEXT: v_cmp_ge_f32_e32 vcc, v4, v1
2946 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2947 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2950 ; VI-LABEL: fcmp_v2f16_ge:
2951 ; VI: ; %bb.0: ; %entry
2952 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
2953 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
2954 ; VI-NEXT: s_mov_b32 s7, 0xf000
2955 ; VI-NEXT: s_mov_b32 s6, -1
2956 ; VI-NEXT: s_mov_b32 s10, s6
2957 ; VI-NEXT: s_mov_b32 s11, s7
2958 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2959 ; VI-NEXT: s_mov_b32 s12, s2
2960 ; VI-NEXT: s_mov_b32 s13, s3
2961 ; VI-NEXT: s_mov_b32 s14, s6
2962 ; VI-NEXT: s_mov_b32 s15, s7
2963 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
2964 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
2965 ; VI-NEXT: s_mov_b32 s4, s0
2966 ; VI-NEXT: s_mov_b32 s5, s1
2967 ; VI-NEXT: s_waitcnt vmcnt(1)
2968 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2969 ; VI-NEXT: s_waitcnt vmcnt(0)
2970 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2971 ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v1, v0
2972 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
2973 ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v3, v2
2974 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
2975 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2978 ; GFX11-TRUE16-LABEL: fcmp_v2f16_ge:
2979 ; GFX11-TRUE16: ; %bb.0: ; %entry
2980 ; GFX11-TRUE16-NEXT: s_clause 0x1
2981 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2982 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2983 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
2984 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
2985 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
2986 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
2987 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
2988 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
2989 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
2990 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
2991 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
2992 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2993 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
2994 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
2995 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
2996 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
2997 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2998 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2999 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3000 ; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v1.l, v0.l
3001 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3002 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3003 ; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3.l, v2.l
3004 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3005 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3006 ; GFX11-TRUE16-NEXT: s_endpgm
3008 ; GFX11-FAKE16-LABEL: fcmp_v2f16_ge:
3009 ; GFX11-FAKE16: ; %bb.0: ; %entry
3010 ; GFX11-FAKE16-NEXT: s_clause 0x1
3011 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3012 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3013 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
3014 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
3015 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
3016 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
3017 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
3018 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
3019 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
3020 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
3021 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
3022 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3023 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3024 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
3025 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
3026 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
3027 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3028 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
3029 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3030 ; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v1, v0
3031 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3032 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3033 ; GFX11-FAKE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2
3034 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3035 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3036 ; GFX11-FAKE16-NEXT: s_endpgm
3038 ; GFX12-LABEL: fcmp_v2f16_ge:
3039 ; GFX12: ; %bb.0: ; %entry
3040 ; GFX12-NEXT: s_clause 0x1
3041 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3042 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3043 ; GFX12-NEXT: s_mov_b32 s10, -1
3044 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
3045 ; GFX12-NEXT: s_mov_b32 s6, s10
3046 ; GFX12-NEXT: s_mov_b32 s7, s11
3047 ; GFX12-NEXT: s_mov_b32 s14, s10
3048 ; GFX12-NEXT: s_mov_b32 s15, s11
3049 ; GFX12-NEXT: s_wait_kmcnt 0x0
3050 ; GFX12-NEXT: s_mov_b32 s12, s2
3051 ; GFX12-NEXT: s_mov_b32 s13, s3
3052 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
3053 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
3054 ; GFX12-NEXT: s_mov_b32 s8, s0
3055 ; GFX12-NEXT: s_mov_b32 s9, s1
3056 ; GFX12-NEXT: s_wait_loadcnt 0x1
3057 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3058 ; GFX12-NEXT: s_wait_loadcnt 0x0
3059 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3060 ; GFX12-NEXT: v_cmp_ge_f16_e32 vcc_lo, v1, v0
3061 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3062 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
3063 ; GFX12-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2
3064 ; GFX12-NEXT: s_wait_alu 0xfffd
3065 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3066 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
3067 ; GFX12-NEXT: s_endpgm
3068 ptr addrspace(1) %r,
3069 ptr addrspace(1) %a,
3070 ptr addrspace(1) %b) {
3072 %a.val = load <2 x half>, ptr addrspace(1) %a
3073 %b.val = load <2 x half>, ptr addrspace(1) %b
3074 %r.val = fcmp oge <2 x half> %a.val, %b.val
3075 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
3076 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Ordered compare on <2 x half>.
; The kernel loads one <2 x half> from %a and one from %b, evaluates
; `fcmp ord` on each lane, sign-extends every i1 lane to i32 (0 or -1) and
; stores the <2 x i32> result to %r.
; Expected code: the SI checks widen each half with v_cvt_f32_f16 and compare
; with v_cmp_o_f32; all other prefixes split off the high half with a 16-bit
; right shift and compare with v_cmp_o_f16. Each result lane is produced by
; v_cndmask_b32 selecting between 0 and -1.
3081 define amdgpu_kernel void @fcmp_v2f16_o(
3082 ; SI-LABEL: fcmp_v2f16_o:
3083 ; SI: ; %bb.0: ; %entry
3084 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
3085 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
3086 ; SI-NEXT: s_mov_b32 s11, 0xf000
3087 ; SI-NEXT: s_mov_b32 s10, -1
3088 ; SI-NEXT: s_mov_b32 s14, s10
3089 ; SI-NEXT: s_mov_b32 s15, s11
3090 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3091 ; SI-NEXT: s_mov_b32 s12, s2
3092 ; SI-NEXT: s_mov_b32 s13, s3
3093 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
3094 ; SI-NEXT: s_mov_b32 s6, s10
3095 ; SI-NEXT: s_mov_b32 s7, s11
3096 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
3097 ; SI-NEXT: s_mov_b32 s8, s0
3098 ; SI-NEXT: s_mov_b32 s9, s1
3099 ; SI-NEXT: s_waitcnt vmcnt(1)
3100 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
3101 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3102 ; SI-NEXT: s_waitcnt vmcnt(0)
3103 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
3104 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3105 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
3106 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
3107 ; SI-NEXT: v_cmp_o_f32_e32 vcc, v2, v3
3108 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3109 ; SI-NEXT: v_cmp_o_f32_e32 vcc, v4, v1
3110 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3111 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3114 ; VI-LABEL: fcmp_v2f16_o:
3115 ; VI: ; %bb.0: ; %entry
3116 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
3117 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
3118 ; VI-NEXT: s_mov_b32 s7, 0xf000
3119 ; VI-NEXT: s_mov_b32 s6, -1
3120 ; VI-NEXT: s_mov_b32 s10, s6
3121 ; VI-NEXT: s_mov_b32 s11, s7
3122 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3123 ; VI-NEXT: s_mov_b32 s12, s2
3124 ; VI-NEXT: s_mov_b32 s13, s3
3125 ; VI-NEXT: s_mov_b32 s14, s6
3126 ; VI-NEXT: s_mov_b32 s15, s7
3127 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
3128 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
3129 ; VI-NEXT: s_mov_b32 s4, s0
3130 ; VI-NEXT: s_mov_b32 s5, s1
3131 ; VI-NEXT: s_waitcnt vmcnt(1)
3132 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3133 ; VI-NEXT: s_waitcnt vmcnt(0)
3134 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3135 ; VI-NEXT: v_cmp_o_f16_e32 vcc, v1, v0
3136 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3137 ; VI-NEXT: v_cmp_o_f16_e32 vcc, v3, v2
3138 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3139 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3142 ; GFX11-TRUE16-LABEL: fcmp_v2f16_o:
3143 ; GFX11-TRUE16: ; %bb.0: ; %entry
3144 ; GFX11-TRUE16-NEXT: s_clause 0x1
3145 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3146 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3147 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
3148 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
3149 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
3150 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
3151 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
3152 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
3153 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
3154 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
3155 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
3156 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3157 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3158 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
3159 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
3160 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
3161 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3162 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
3163 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3164 ; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v0.l
3165 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3166 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3167 ; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3.l, v2.l
3168 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3169 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3170 ; GFX11-TRUE16-NEXT: s_endpgm
3172 ; GFX11-FAKE16-LABEL: fcmp_v2f16_o:
3173 ; GFX11-FAKE16: ; %bb.0: ; %entry
3174 ; GFX11-FAKE16-NEXT: s_clause 0x1
3175 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3176 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3177 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
3178 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
3179 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
3180 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
3181 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
3182 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
3183 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
3184 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
3185 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
3186 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3187 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3188 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
3189 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
3190 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
3191 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3192 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
3193 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3194 ; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v0
3195 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3196 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3197 ; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2
3198 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3199 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3200 ; GFX11-FAKE16-NEXT: s_endpgm
3202 ; GFX12-LABEL: fcmp_v2f16_o:
3203 ; GFX12: ; %bb.0: ; %entry
3204 ; GFX12-NEXT: s_clause 0x1
3205 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3206 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3207 ; GFX12-NEXT: s_mov_b32 s10, -1
3208 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
3209 ; GFX12-NEXT: s_mov_b32 s6, s10
3210 ; GFX12-NEXT: s_mov_b32 s7, s11
3211 ; GFX12-NEXT: s_mov_b32 s14, s10
3212 ; GFX12-NEXT: s_mov_b32 s15, s11
3213 ; GFX12-NEXT: s_wait_kmcnt 0x0
3214 ; GFX12-NEXT: s_mov_b32 s12, s2
3215 ; GFX12-NEXT: s_mov_b32 s13, s3
3216 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
3217 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
3218 ; GFX12-NEXT: s_mov_b32 s8, s0
3219 ; GFX12-NEXT: s_mov_b32 s9, s1
3220 ; GFX12-NEXT: s_wait_loadcnt 0x1
3221 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3222 ; GFX12-NEXT: s_wait_loadcnt 0x0
3223 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3224 ; GFX12-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v0
3225 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3226 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
3227 ; GFX12-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2
3228 ; GFX12-NEXT: s_wait_alu 0xfffd
3229 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3230 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
3231 ; GFX12-NEXT: s_endpgm
3232 ptr addrspace(1) %r,
3233 ptr addrspace(1) %a,
3234 ptr addrspace(1) %b) {
3236 %a.val = load <2 x half>, ptr addrspace(1) %a
3237 %b.val = load <2 x half>, ptr addrspace(1) %b
; Predicate under test: `ord`; the checks above expect the v_cmp_o_* form.
3238 %r.val = fcmp ord <2 x half> %a.val, %b.val
3239 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
3240 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Unordered compare on <2 x half>.
; The kernel loads one <2 x half> from %a and one from %b, evaluates
; `fcmp uno` on each lane, sign-extends every i1 lane to i32 (0 or -1) and
; stores the <2 x i32> result to %r.
; Expected code: the SI checks widen each half with v_cvt_f32_f16 and compare
; with v_cmp_u_f32; all other prefixes split off the high half with a 16-bit
; right shift and compare with v_cmp_u_f16. Each result lane is produced by
; v_cndmask_b32 selecting between 0 and -1.
3245 define amdgpu_kernel void @fcmp_v2f16_u(
3246 ; SI-LABEL: fcmp_v2f16_u:
3247 ; SI: ; %bb.0: ; %entry
3248 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
3249 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
3250 ; SI-NEXT: s_mov_b32 s11, 0xf000
3251 ; SI-NEXT: s_mov_b32 s10, -1
3252 ; SI-NEXT: s_mov_b32 s14, s10
3253 ; SI-NEXT: s_mov_b32 s15, s11
3254 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3255 ; SI-NEXT: s_mov_b32 s12, s2
3256 ; SI-NEXT: s_mov_b32 s13, s3
3257 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
3258 ; SI-NEXT: s_mov_b32 s6, s10
3259 ; SI-NEXT: s_mov_b32 s7, s11
3260 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
3261 ; SI-NEXT: s_mov_b32 s8, s0
3262 ; SI-NEXT: s_mov_b32 s9, s1
3263 ; SI-NEXT: s_waitcnt vmcnt(1)
3264 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
3265 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3266 ; SI-NEXT: s_waitcnt vmcnt(0)
3267 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
3268 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3269 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
3270 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
3271 ; SI-NEXT: v_cmp_u_f32_e32 vcc, v2, v3
3272 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3273 ; SI-NEXT: v_cmp_u_f32_e32 vcc, v4, v1
3274 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3275 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3278 ; VI-LABEL: fcmp_v2f16_u:
3279 ; VI: ; %bb.0: ; %entry
3280 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
3281 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
3282 ; VI-NEXT: s_mov_b32 s7, 0xf000
3283 ; VI-NEXT: s_mov_b32 s6, -1
3284 ; VI-NEXT: s_mov_b32 s10, s6
3285 ; VI-NEXT: s_mov_b32 s11, s7
3286 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3287 ; VI-NEXT: s_mov_b32 s12, s2
3288 ; VI-NEXT: s_mov_b32 s13, s3
3289 ; VI-NEXT: s_mov_b32 s14, s6
3290 ; VI-NEXT: s_mov_b32 s15, s7
3291 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
3292 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
3293 ; VI-NEXT: s_mov_b32 s4, s0
3294 ; VI-NEXT: s_mov_b32 s5, s1
3295 ; VI-NEXT: s_waitcnt vmcnt(1)
3296 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3297 ; VI-NEXT: s_waitcnt vmcnt(0)
3298 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3299 ; VI-NEXT: v_cmp_u_f16_e32 vcc, v1, v0
3300 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3301 ; VI-NEXT: v_cmp_u_f16_e32 vcc, v3, v2
3302 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3303 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3306 ; GFX11-TRUE16-LABEL: fcmp_v2f16_u:
3307 ; GFX11-TRUE16: ; %bb.0: ; %entry
3308 ; GFX11-TRUE16-NEXT: s_clause 0x1
3309 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3310 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3311 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
3312 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
3313 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
3314 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
3315 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
3316 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
3317 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
3318 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
3319 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
3320 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3321 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3322 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
3323 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
3324 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
3325 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3326 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
3327 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3328 ; GFX11-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v1.l, v0.l
3329 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3330 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3331 ; GFX11-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v3.l, v2.l
3332 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3333 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3334 ; GFX11-TRUE16-NEXT: s_endpgm
3336 ; GFX11-FAKE16-LABEL: fcmp_v2f16_u:
3337 ; GFX11-FAKE16: ; %bb.0: ; %entry
3338 ; GFX11-FAKE16-NEXT: s_clause 0x1
3339 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3340 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3341 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
3342 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
3343 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
3344 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
3345 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
3346 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
3347 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
3348 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
3349 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
3350 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3351 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3352 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
3353 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
3354 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
3355 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3356 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
3357 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3358 ; GFX11-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v0
3359 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3360 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3361 ; GFX11-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2
3362 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3363 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3364 ; GFX11-FAKE16-NEXT: s_endpgm
3366 ; GFX12-LABEL: fcmp_v2f16_u:
3367 ; GFX12: ; %bb.0: ; %entry
3368 ; GFX12-NEXT: s_clause 0x1
3369 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3370 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3371 ; GFX12-NEXT: s_mov_b32 s10, -1
3372 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
3373 ; GFX12-NEXT: s_mov_b32 s6, s10
3374 ; GFX12-NEXT: s_mov_b32 s7, s11
3375 ; GFX12-NEXT: s_mov_b32 s14, s10
3376 ; GFX12-NEXT: s_mov_b32 s15, s11
3377 ; GFX12-NEXT: s_wait_kmcnt 0x0
3378 ; GFX12-NEXT: s_mov_b32 s12, s2
3379 ; GFX12-NEXT: s_mov_b32 s13, s3
3380 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
3381 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
3382 ; GFX12-NEXT: s_mov_b32 s8, s0
3383 ; GFX12-NEXT: s_mov_b32 s9, s1
3384 ; GFX12-NEXT: s_wait_loadcnt 0x1
3385 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3386 ; GFX12-NEXT: s_wait_loadcnt 0x0
3387 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3388 ; GFX12-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v0
3389 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3390 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
3391 ; GFX12-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2
3392 ; GFX12-NEXT: s_wait_alu 0xfffd
3393 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3394 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
3395 ; GFX12-NEXT: s_endpgm
3396 ptr addrspace(1) %r,
3397 ptr addrspace(1) %a,
3398 ptr addrspace(1) %b) {
3400 %a.val = load <2 x half>, ptr addrspace(1) %a
3401 %b.val = load <2 x half>, ptr addrspace(1) %b
; Predicate under test: `uno`; the checks above expect the v_cmp_u_* form.
3402 %r.val = fcmp uno <2 x half> %a.val, %b.val
3403 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
3404 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; "Not greater-or-equal" compare on <2 x half>.
; The kernel loads one <2 x half> from %a and one from %b, evaluates
; `fcmp ult` on each lane, sign-extends every i1 lane to i32 (0 or -1) and
; stores the <2 x i32> result to %r.
; The test is named after the instruction the checks expect (v_cmp_nge_*),
; not after the IR predicate spelling.
; Expected code: the SI checks widen each half with v_cvt_f32_f16 and compare
; with v_cmp_nge_f32; all other prefixes split off the high half with a 16-bit
; right shift and compare with v_cmp_nge_f16. Each result lane is produced by
; v_cndmask_b32 selecting between 0 and -1.
3408 define amdgpu_kernel void @fcmp_v2f16_nge(
3409 ; SI-LABEL: fcmp_v2f16_nge:
3410 ; SI: ; %bb.0: ; %entry
3411 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
3412 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
3413 ; SI-NEXT: s_mov_b32 s11, 0xf000
3414 ; SI-NEXT: s_mov_b32 s10, -1
3415 ; SI-NEXT: s_mov_b32 s14, s10
3416 ; SI-NEXT: s_mov_b32 s15, s11
3417 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3418 ; SI-NEXT: s_mov_b32 s12, s2
3419 ; SI-NEXT: s_mov_b32 s13, s3
3420 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
3421 ; SI-NEXT: s_mov_b32 s6, s10
3422 ; SI-NEXT: s_mov_b32 s7, s11
3423 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
3424 ; SI-NEXT: s_mov_b32 s8, s0
3425 ; SI-NEXT: s_mov_b32 s9, s1
3426 ; SI-NEXT: s_waitcnt vmcnt(1)
3427 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
3428 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3429 ; SI-NEXT: s_waitcnt vmcnt(0)
3430 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
3431 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3432 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
3433 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
3434 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, v2, v3
3435 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3436 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, v4, v1
3437 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3438 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3441 ; VI-LABEL: fcmp_v2f16_nge:
3442 ; VI: ; %bb.0: ; %entry
3443 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
3444 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
3445 ; VI-NEXT: s_mov_b32 s7, 0xf000
3446 ; VI-NEXT: s_mov_b32 s6, -1
3447 ; VI-NEXT: s_mov_b32 s10, s6
3448 ; VI-NEXT: s_mov_b32 s11, s7
3449 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3450 ; VI-NEXT: s_mov_b32 s12, s2
3451 ; VI-NEXT: s_mov_b32 s13, s3
3452 ; VI-NEXT: s_mov_b32 s14, s6
3453 ; VI-NEXT: s_mov_b32 s15, s7
3454 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
3455 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
3456 ; VI-NEXT: s_mov_b32 s4, s0
3457 ; VI-NEXT: s_mov_b32 s5, s1
3458 ; VI-NEXT: s_waitcnt vmcnt(1)
3459 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3460 ; VI-NEXT: s_waitcnt vmcnt(0)
3461 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3462 ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v1, v0
3463 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3464 ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v3, v2
3465 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3466 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3469 ; GFX11-TRUE16-LABEL: fcmp_v2f16_nge:
3470 ; GFX11-TRUE16: ; %bb.0: ; %entry
3471 ; GFX11-TRUE16-NEXT: s_clause 0x1
3472 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3473 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3474 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
3475 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
3476 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
3477 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
3478 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
3479 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
3480 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
3481 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
3482 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
3483 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3484 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3485 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
3486 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
3487 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
3488 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3489 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
3490 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3491 ; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v1.l, v0.l
3492 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3493 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3494 ; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3.l, v2.l
3495 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3496 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3497 ; GFX11-TRUE16-NEXT: s_endpgm
3499 ; GFX11-FAKE16-LABEL: fcmp_v2f16_nge:
3500 ; GFX11-FAKE16: ; %bb.0: ; %entry
3501 ; GFX11-FAKE16-NEXT: s_clause 0x1
3502 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3503 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3504 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
3505 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
3506 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
3507 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
3508 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
3509 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
3510 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
3511 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
3512 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
3513 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3514 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3515 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
3516 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
3517 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
3518 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3519 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
3520 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3521 ; GFX11-FAKE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v1, v0
3522 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3523 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3524 ; GFX11-FAKE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2
3525 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3526 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3527 ; GFX11-FAKE16-NEXT: s_endpgm
3529 ; GFX12-LABEL: fcmp_v2f16_nge:
3530 ; GFX12: ; %bb.0: ; %entry
3531 ; GFX12-NEXT: s_clause 0x1
3532 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3533 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3534 ; GFX12-NEXT: s_mov_b32 s10, -1
3535 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
3536 ; GFX12-NEXT: s_mov_b32 s6, s10
3537 ; GFX12-NEXT: s_mov_b32 s7, s11
3538 ; GFX12-NEXT: s_mov_b32 s14, s10
3539 ; GFX12-NEXT: s_mov_b32 s15, s11
3540 ; GFX12-NEXT: s_wait_kmcnt 0x0
3541 ; GFX12-NEXT: s_mov_b32 s12, s2
3542 ; GFX12-NEXT: s_mov_b32 s13, s3
3543 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
3544 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
3545 ; GFX12-NEXT: s_mov_b32 s8, s0
3546 ; GFX12-NEXT: s_mov_b32 s9, s1
3547 ; GFX12-NEXT: s_wait_loadcnt 0x1
3548 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3549 ; GFX12-NEXT: s_wait_loadcnt 0x0
3550 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3551 ; GFX12-NEXT: v_cmp_nge_f16_e32 vcc_lo, v1, v0
3552 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3553 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
3554 ; GFX12-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2
3555 ; GFX12-NEXT: s_wait_alu 0xfffd
3556 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3557 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
3558 ; GFX12-NEXT: s_endpgm
3559 ptr addrspace(1) %r,
3560 ptr addrspace(1) %a,
3561 ptr addrspace(1) %b) {
3563 %a.val = load <2 x half>, ptr addrspace(1) %a
3564 %b.val = load <2 x half>, ptr addrspace(1) %b
; Predicate under test: `ult` (unordered or less than); the checks above
; expect it to be emitted as v_cmp_nge_*.
3565 %r.val = fcmp ult <2 x half> %a.val, %b.val
3566 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
3567 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; "Not less-or-greater" compare on <2 x half>.
; The kernel loads one <2 x half> from %a and one from %b, evaluates
; `fcmp ueq` on each lane, sign-extends every i1 lane to i32 (0 or -1) and
; stores the <2 x i32> result to %r.
; The test is named after the instruction the checks expect (v_cmp_nlg_*),
; not after the IR predicate spelling.
; Expected code: the SI checks widen each half with v_cvt_f32_f16 and compare
; with v_cmp_nlg_f32; all other prefixes split off the high half with a 16-bit
; right shift and compare with v_cmp_nlg_f16. Each result lane is produced by
; v_cndmask_b32 selecting between 0 and -1.
3571 define amdgpu_kernel void @fcmp_v2f16_nlg(
3572 ; SI-LABEL: fcmp_v2f16_nlg:
3573 ; SI: ; %bb.0: ; %entry
3574 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
3575 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
3576 ; SI-NEXT: s_mov_b32 s11, 0xf000
3577 ; SI-NEXT: s_mov_b32 s10, -1
3578 ; SI-NEXT: s_mov_b32 s14, s10
3579 ; SI-NEXT: s_mov_b32 s15, s11
3580 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3581 ; SI-NEXT: s_mov_b32 s12, s2
3582 ; SI-NEXT: s_mov_b32 s13, s3
3583 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
3584 ; SI-NEXT: s_mov_b32 s6, s10
3585 ; SI-NEXT: s_mov_b32 s7, s11
3586 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
3587 ; SI-NEXT: s_mov_b32 s8, s0
3588 ; SI-NEXT: s_mov_b32 s9, s1
3589 ; SI-NEXT: s_waitcnt vmcnt(1)
3590 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
3591 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3592 ; SI-NEXT: s_waitcnt vmcnt(0)
3593 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
3594 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3595 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
3596 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
3597 ; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3
3598 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3599 ; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v1
3600 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3601 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3604 ; VI-LABEL: fcmp_v2f16_nlg:
3605 ; VI: ; %bb.0: ; %entry
3606 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
3607 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
3608 ; VI-NEXT: s_mov_b32 s7, 0xf000
3609 ; VI-NEXT: s_mov_b32 s6, -1
3610 ; VI-NEXT: s_mov_b32 s10, s6
3611 ; VI-NEXT: s_mov_b32 s11, s7
3612 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3613 ; VI-NEXT: s_mov_b32 s12, s2
3614 ; VI-NEXT: s_mov_b32 s13, s3
3615 ; VI-NEXT: s_mov_b32 s14, s6
3616 ; VI-NEXT: s_mov_b32 s15, s7
3617 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
3618 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
3619 ; VI-NEXT: s_mov_b32 s4, s0
3620 ; VI-NEXT: s_mov_b32 s5, s1
3621 ; VI-NEXT: s_waitcnt vmcnt(1)
3622 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3623 ; VI-NEXT: s_waitcnt vmcnt(0)
3624 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3625 ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v1, v0
3626 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3627 ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v3, v2
3628 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3629 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3632 ; GFX11-TRUE16-LABEL: fcmp_v2f16_nlg:
3633 ; GFX11-TRUE16: ; %bb.0: ; %entry
3634 ; GFX11-TRUE16-NEXT: s_clause 0x1
3635 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3636 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3637 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
3638 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
3639 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
3640 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
3641 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
3642 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
3643 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
3644 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
3645 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
3646 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3647 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3648 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
3649 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
3650 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
3651 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3652 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
3653 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3654 ; GFX11-TRUE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v1.l, v0.l
3655 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3656 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3657 ; GFX11-TRUE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3.l, v2.l
3658 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3659 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3660 ; GFX11-TRUE16-NEXT: s_endpgm
3662 ; GFX11-FAKE16-LABEL: fcmp_v2f16_nlg:
3663 ; GFX11-FAKE16: ; %bb.0: ; %entry
3664 ; GFX11-FAKE16-NEXT: s_clause 0x1
3665 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3666 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3667 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
3668 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
3669 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
3670 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
3671 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
3672 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
3673 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
3674 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
3675 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
3676 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3677 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3678 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
3679 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
3680 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
3681 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3682 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
3683 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3684 ; GFX11-FAKE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v1, v0
3685 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3686 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3687 ; GFX11-FAKE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2
3688 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3689 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3690 ; GFX11-FAKE16-NEXT: s_endpgm
3692 ; GFX12-LABEL: fcmp_v2f16_nlg:
3693 ; GFX12: ; %bb.0: ; %entry
3694 ; GFX12-NEXT: s_clause 0x1
3695 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3696 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3697 ; GFX12-NEXT: s_mov_b32 s10, -1
3698 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
3699 ; GFX12-NEXT: s_mov_b32 s6, s10
3700 ; GFX12-NEXT: s_mov_b32 s7, s11
3701 ; GFX12-NEXT: s_mov_b32 s14, s10
3702 ; GFX12-NEXT: s_mov_b32 s15, s11
3703 ; GFX12-NEXT: s_wait_kmcnt 0x0
3704 ; GFX12-NEXT: s_mov_b32 s12, s2
3705 ; GFX12-NEXT: s_mov_b32 s13, s3
3706 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
3707 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
3708 ; GFX12-NEXT: s_mov_b32 s8, s0
3709 ; GFX12-NEXT: s_mov_b32 s9, s1
3710 ; GFX12-NEXT: s_wait_loadcnt 0x1
3711 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3712 ; GFX12-NEXT: s_wait_loadcnt 0x0
3713 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3714 ; GFX12-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v1, v0
3715 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3716 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
3717 ; GFX12-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2
3718 ; GFX12-NEXT: s_wait_alu 0xfffd
3719 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3720 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
3721 ; GFX12-NEXT: s_endpgm
3722 ptr addrspace(1) %r,
3723 ptr addrspace(1) %a,
3724 ptr addrspace(1) %b) {
3726 %a.val = load <2 x half>, ptr addrspace(1) %a
3727 %b.val = load <2 x half>, ptr addrspace(1) %b
; Predicate under test: `ueq` (unordered or equal); the checks above expect it
; to be emitted as v_cmp_nlg_*.
3728 %r.val = fcmp ueq <2 x half> %a.val, %b.val
3729 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
3730 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; "Not greater-than" compare on <2 x half>.
; The kernel loads one <2 x half> from %a and one from %b, evaluates
; `fcmp ule` on each lane, sign-extends every i1 lane to i32 (0 or -1) and
; stores the <2 x i32> result to %r.
; The test is named after the instruction the checks expect (v_cmp_ngt_*),
; not after the IR predicate spelling.
; Expected code: the SI checks widen each half with v_cvt_f32_f16 and compare
; with v_cmp_ngt_f32; all other prefixes split off the high half with a 16-bit
; right shift and compare with v_cmp_ngt_f16. Each result lane is produced by
; v_cndmask_b32 selecting between 0 and -1.
3735 define amdgpu_kernel void @fcmp_v2f16_ngt(
3736 ; SI-LABEL: fcmp_v2f16_ngt:
3737 ; SI: ; %bb.0: ; %entry
3738 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
3739 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
3740 ; SI-NEXT: s_mov_b32 s11, 0xf000
3741 ; SI-NEXT: s_mov_b32 s10, -1
3742 ; SI-NEXT: s_mov_b32 s14, s10
3743 ; SI-NEXT: s_mov_b32 s15, s11
3744 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3745 ; SI-NEXT: s_mov_b32 s12, s2
3746 ; SI-NEXT: s_mov_b32 s13, s3
3747 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
3748 ; SI-NEXT: s_mov_b32 s6, s10
3749 ; SI-NEXT: s_mov_b32 s7, s11
3750 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
3751 ; SI-NEXT: s_mov_b32 s8, s0
3752 ; SI-NEXT: s_mov_b32 s9, s1
3753 ; SI-NEXT: s_waitcnt vmcnt(1)
3754 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
3755 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3756 ; SI-NEXT: s_waitcnt vmcnt(0)
3757 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
3758 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3759 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
3760 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
3761 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v3
3762 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3763 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v1
3764 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3765 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3768 ; VI-LABEL: fcmp_v2f16_ngt:
3769 ; VI: ; %bb.0: ; %entry
3770 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
3771 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
3772 ; VI-NEXT: s_mov_b32 s7, 0xf000
3773 ; VI-NEXT: s_mov_b32 s6, -1
3774 ; VI-NEXT: s_mov_b32 s10, s6
3775 ; VI-NEXT: s_mov_b32 s11, s7
3776 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3777 ; VI-NEXT: s_mov_b32 s12, s2
3778 ; VI-NEXT: s_mov_b32 s13, s3
3779 ; VI-NEXT: s_mov_b32 s14, s6
3780 ; VI-NEXT: s_mov_b32 s15, s7
3781 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
3782 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
3783 ; VI-NEXT: s_mov_b32 s4, s0
3784 ; VI-NEXT: s_mov_b32 s5, s1
3785 ; VI-NEXT: s_waitcnt vmcnt(1)
3786 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3787 ; VI-NEXT: s_waitcnt vmcnt(0)
3788 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3789 ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v0
3790 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3791 ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2
3792 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3793 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3796 ; GFX11-TRUE16-LABEL: fcmp_v2f16_ngt:
3797 ; GFX11-TRUE16: ; %bb.0: ; %entry
3798 ; GFX11-TRUE16-NEXT: s_clause 0x1
3799 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3800 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3801 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
3802 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
3803 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
3804 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
3805 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
3806 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
3807 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
3808 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
3809 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
3810 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3811 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3812 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
3813 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
3814 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
3815 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3816 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
3817 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3818 ; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1.l, v0.l
3819 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3820 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3821 ; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3.l, v2.l
3822 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3823 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3824 ; GFX11-TRUE16-NEXT: s_endpgm
3826 ; GFX11-FAKE16-LABEL: fcmp_v2f16_ngt:
3827 ; GFX11-FAKE16: ; %bb.0: ; %entry
3828 ; GFX11-FAKE16-NEXT: s_clause 0x1
3829 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3830 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3831 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
3832 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
3833 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
3834 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
3835 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
3836 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
3837 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
3838 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
3839 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
3840 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3841 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3842 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
3843 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
3844 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
3845 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3846 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
3847 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3848 ; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v0
3849 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3850 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3851 ; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
3852 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3853 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3854 ; GFX11-FAKE16-NEXT: s_endpgm
3856 ; GFX12-LABEL: fcmp_v2f16_ngt:
3857 ; GFX12: ; %bb.0: ; %entry
3858 ; GFX12-NEXT: s_clause 0x1
3859 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3860 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3861 ; GFX12-NEXT: s_mov_b32 s10, -1
3862 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
3863 ; GFX12-NEXT: s_mov_b32 s6, s10
3864 ; GFX12-NEXT: s_mov_b32 s7, s11
3865 ; GFX12-NEXT: s_mov_b32 s14, s10
3866 ; GFX12-NEXT: s_mov_b32 s15, s11
3867 ; GFX12-NEXT: s_wait_kmcnt 0x0
3868 ; GFX12-NEXT: s_mov_b32 s12, s2
3869 ; GFX12-NEXT: s_mov_b32 s13, s3
3870 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
3871 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
3872 ; GFX12-NEXT: s_mov_b32 s8, s0
3873 ; GFX12-NEXT: s_mov_b32 s9, s1
3874 ; GFX12-NEXT: s_wait_loadcnt 0x1
3875 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3876 ; GFX12-NEXT: s_wait_loadcnt 0x0
3877 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3878 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v0
3879 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3880 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
3881 ; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
3882 ; GFX12-NEXT: s_wait_alu 0xfffd
3883 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3884 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
3885 ; GFX12-NEXT: s_endpgm
3886 ptr addrspace(1) %r,
3887 ptr addrspace(1) %a,
3888 ptr addrspace(1) %b) {
3890 %a.val = load <2 x half>, ptr addrspace(1) %a
3891 %b.val = load <2 x half>, ptr addrspace(1) %b
; Predicate under test: `ule` (unordered or less than or equal); the checks
; above expect it to be emitted as v_cmp_ngt_*.
3892 %r.val = fcmp ule <2 x half> %a.val, %b.val
3893 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
3894 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; fcmp_v2f16_nle: loads one <2 x half> from %a and one from %b, compares them
; lane-wise with `fcmp ugt`, sign-extends the <2 x i1> result to <2 x i32> and
; stores it to %r.
; The autogenerated assertions below expect one v_cmp_nle per lane (the second
; lane is extracted with a 16-bit right shift): f32 compares after
; v_cvt_f32_f16 on the SI run, f16 compares on every other run.
3898 define amdgpu_kernel void @fcmp_v2f16_nle(
3899 ; SI-LABEL: fcmp_v2f16_nle:
3900 ; SI: ; %bb.0: ; %entry
3901 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
3902 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
3903 ; SI-NEXT: s_mov_b32 s11, 0xf000
3904 ; SI-NEXT: s_mov_b32 s10, -1
3905 ; SI-NEXT: s_mov_b32 s14, s10
3906 ; SI-NEXT: s_mov_b32 s15, s11
3907 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3908 ; SI-NEXT: s_mov_b32 s12, s2
3909 ; SI-NEXT: s_mov_b32 s13, s3
3910 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
3911 ; SI-NEXT: s_mov_b32 s6, s10
3912 ; SI-NEXT: s_mov_b32 s7, s11
3913 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
3914 ; SI-NEXT: s_mov_b32 s8, s0
3915 ; SI-NEXT: s_mov_b32 s9, s1
3916 ; SI-NEXT: s_waitcnt vmcnt(1)
3917 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
3918 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3919 ; SI-NEXT: s_waitcnt vmcnt(0)
3920 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
3921 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3922 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
3923 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
3924 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, v2, v3
3925 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3926 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, v4, v1
3927 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3928 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3931 ; VI-LABEL: fcmp_v2f16_nle:
3932 ; VI: ; %bb.0: ; %entry
3933 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
3934 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
3935 ; VI-NEXT: s_mov_b32 s7, 0xf000
3936 ; VI-NEXT: s_mov_b32 s6, -1
3937 ; VI-NEXT: s_mov_b32 s10, s6
3938 ; VI-NEXT: s_mov_b32 s11, s7
3939 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3940 ; VI-NEXT: s_mov_b32 s12, s2
3941 ; VI-NEXT: s_mov_b32 s13, s3
3942 ; VI-NEXT: s_mov_b32 s14, s6
3943 ; VI-NEXT: s_mov_b32 s15, s7
3944 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
3945 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
3946 ; VI-NEXT: s_mov_b32 s4, s0
3947 ; VI-NEXT: s_mov_b32 s5, s1
3948 ; VI-NEXT: s_waitcnt vmcnt(1)
3949 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3950 ; VI-NEXT: s_waitcnt vmcnt(0)
3951 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3952 ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v0
3953 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
3954 ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2
3955 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
3956 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3959 ; GFX11-TRUE16-LABEL: fcmp_v2f16_nle:
3960 ; GFX11-TRUE16: ; %bb.0: ; %entry
3961 ; GFX11-TRUE16-NEXT: s_clause 0x1
3962 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3963 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3964 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
3965 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
3966 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
3967 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
3968 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
3969 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
3970 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
3971 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
3972 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
3973 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
3974 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
3975 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
3976 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
3977 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
3978 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
3979 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
3980 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3981 ; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1.l, v0.l
3982 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3983 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3984 ; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3.l, v2.l
3985 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3986 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
3987 ; GFX11-TRUE16-NEXT: s_endpgm
3989 ; GFX11-FAKE16-LABEL: fcmp_v2f16_nle:
3990 ; GFX11-FAKE16: ; %bb.0: ; %entry
3991 ; GFX11-FAKE16-NEXT: s_clause 0x1
3992 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
3993 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
3994 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
3995 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
3996 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
3997 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
3998 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
3999 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
4000 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
4001 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
4002 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
4003 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
4004 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
4005 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
4006 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
4007 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
4008 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
4009 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
4010 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
4011 ; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v0
4012 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
4013 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
4014 ; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
4015 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4016 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
4017 ; GFX11-FAKE16-NEXT: s_endpgm
4019 ; GFX12-LABEL: fcmp_v2f16_nle:
4020 ; GFX12: ; %bb.0: ; %entry
4021 ; GFX12-NEXT: s_clause 0x1
4022 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
4023 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
4024 ; GFX12-NEXT: s_mov_b32 s10, -1
4025 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
4026 ; GFX12-NEXT: s_mov_b32 s6, s10
4027 ; GFX12-NEXT: s_mov_b32 s7, s11
4028 ; GFX12-NEXT: s_mov_b32 s14, s10
4029 ; GFX12-NEXT: s_mov_b32 s15, s11
4030 ; GFX12-NEXT: s_wait_kmcnt 0x0
4031 ; GFX12-NEXT: s_mov_b32 s12, s2
4032 ; GFX12-NEXT: s_mov_b32 s13, s3
4033 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
4034 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
4035 ; GFX12-NEXT: s_mov_b32 s8, s0
4036 ; GFX12-NEXT: s_mov_b32 s9, s1
4037 ; GFX12-NEXT: s_wait_loadcnt 0x1
4038 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
4039 ; GFX12-NEXT: s_wait_loadcnt 0x0
4040 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
4041 ; GFX12-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v0
4042 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
4043 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
4044 ; GFX12-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
4045 ; GFX12-NEXT: s_wait_alu 0xfffd
4046 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4047 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
4048 ; GFX12-NEXT: s_endpgm
4049 ptr addrspace(1) %r,
4050 ptr addrspace(1) %a,
4051 ptr addrspace(1) %b) {
4053 %a.val = load <2 x half>, ptr addrspace(1) %a
4054 %b.val = load <2 x half>, ptr addrspace(1) %b
; ugt = unordered or greater than: true when either lane operand is NaN or
; a > b, i.e. "not (a <= b)", which is why the assertions expect v_cmp_nle.
4055 %r.val = fcmp ugt <2 x half> %a.val, %b.val
; sext turns each i1 lane into 0 or -1, matching the v_cndmask 0, -1 selects.
4056 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
4057 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; fcmp_v2f16_neq: loads one <2 x half> from %a and one from %b, compares them
; lane-wise with `fcmp une`, sign-extends the <2 x i1> result to <2 x i32> and
; stores it to %r.
; The autogenerated assertions below expect one v_cmp_neq per lane (the second
; lane is extracted with a 16-bit right shift): f32 compares after
; v_cvt_f32_f16 on the SI run, f16 compares on every other run.
4061 define amdgpu_kernel void @fcmp_v2f16_neq(
4062 ; SI-LABEL: fcmp_v2f16_neq:
4063 ; SI: ; %bb.0: ; %entry
4064 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
4065 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
4066 ; SI-NEXT: s_mov_b32 s11, 0xf000
4067 ; SI-NEXT: s_mov_b32 s10, -1
4068 ; SI-NEXT: s_mov_b32 s14, s10
4069 ; SI-NEXT: s_mov_b32 s15, s11
4070 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4071 ; SI-NEXT: s_mov_b32 s12, s2
4072 ; SI-NEXT: s_mov_b32 s13, s3
4073 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
4074 ; SI-NEXT: s_mov_b32 s6, s10
4075 ; SI-NEXT: s_mov_b32 s7, s11
4076 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
4077 ; SI-NEXT: s_mov_b32 s8, s0
4078 ; SI-NEXT: s_mov_b32 s9, s1
4079 ; SI-NEXT: s_waitcnt vmcnt(1)
4080 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
4081 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4082 ; SI-NEXT: s_waitcnt vmcnt(0)
4083 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
4084 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4085 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
4086 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4087 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, v2, v3
4088 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
4089 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, v4, v1
4090 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
4091 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
4094 ; VI-LABEL: fcmp_v2f16_neq:
4095 ; VI: ; %bb.0: ; %entry
4096 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4097 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
4098 ; VI-NEXT: s_mov_b32 s7, 0xf000
4099 ; VI-NEXT: s_mov_b32 s6, -1
4100 ; VI-NEXT: s_mov_b32 s10, s6
4101 ; VI-NEXT: s_mov_b32 s11, s7
4102 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4103 ; VI-NEXT: s_mov_b32 s12, s2
4104 ; VI-NEXT: s_mov_b32 s13, s3
4105 ; VI-NEXT: s_mov_b32 s14, s6
4106 ; VI-NEXT: s_mov_b32 s15, s7
4107 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
4108 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
4109 ; VI-NEXT: s_mov_b32 s4, s0
4110 ; VI-NEXT: s_mov_b32 s5, s1
4111 ; VI-NEXT: s_waitcnt vmcnt(1)
4112 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
4113 ; VI-NEXT: s_waitcnt vmcnt(0)
4114 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
4115 ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v1, v0
4116 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
4117 ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v3, v2
4118 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
4119 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4122 ; GFX11-TRUE16-LABEL: fcmp_v2f16_neq:
4123 ; GFX11-TRUE16: ; %bb.0: ; %entry
4124 ; GFX11-TRUE16-NEXT: s_clause 0x1
4125 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
4126 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
4127 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
4128 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
4129 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
4130 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
4131 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
4132 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
4133 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
4134 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
4135 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
4136 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
4137 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
4138 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
4139 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
4140 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
4141 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
4142 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
4143 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
4144 ; GFX11-TRUE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v1.l, v0.l
4145 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
4146 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
4147 ; GFX11-TRUE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3.l, v2.l
4148 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4149 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
4150 ; GFX11-TRUE16-NEXT: s_endpgm
4152 ; GFX11-FAKE16-LABEL: fcmp_v2f16_neq:
4153 ; GFX11-FAKE16: ; %bb.0: ; %entry
4154 ; GFX11-FAKE16-NEXT: s_clause 0x1
4155 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
4156 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
4157 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
4158 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
4159 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
4160 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
4161 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
4162 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
4163 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
4164 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
4165 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
4166 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
4167 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
4168 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
4169 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
4170 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
4171 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
4172 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
4173 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
4174 ; GFX11-FAKE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v1, v0
4175 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
4176 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
4177 ; GFX11-FAKE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2
4178 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4179 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
4180 ; GFX11-FAKE16-NEXT: s_endpgm
4182 ; GFX12-LABEL: fcmp_v2f16_neq:
4183 ; GFX12: ; %bb.0: ; %entry
4184 ; GFX12-NEXT: s_clause 0x1
4185 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
4186 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
4187 ; GFX12-NEXT: s_mov_b32 s10, -1
4188 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
4189 ; GFX12-NEXT: s_mov_b32 s6, s10
4190 ; GFX12-NEXT: s_mov_b32 s7, s11
4191 ; GFX12-NEXT: s_mov_b32 s14, s10
4192 ; GFX12-NEXT: s_mov_b32 s15, s11
4193 ; GFX12-NEXT: s_wait_kmcnt 0x0
4194 ; GFX12-NEXT: s_mov_b32 s12, s2
4195 ; GFX12-NEXT: s_mov_b32 s13, s3
4196 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
4197 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
4198 ; GFX12-NEXT: s_mov_b32 s8, s0
4199 ; GFX12-NEXT: s_mov_b32 s9, s1
4200 ; GFX12-NEXT: s_wait_loadcnt 0x1
4201 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
4202 ; GFX12-NEXT: s_wait_loadcnt 0x0
4203 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
4204 ; GFX12-NEXT: v_cmp_neq_f16_e32 vcc_lo, v1, v0
4205 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
4206 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
4207 ; GFX12-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2
4208 ; GFX12-NEXT: s_wait_alu 0xfffd
4209 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4210 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
4211 ; GFX12-NEXT: s_endpgm
4212 ptr addrspace(1) %r,
4213 ptr addrspace(1) %a,
4214 ptr addrspace(1) %b) {
4216 %a.val = load <2 x half>, ptr addrspace(1) %a
4217 %b.val = load <2 x half>, ptr addrspace(1) %b
; une = unordered or not equal: true when either lane operand is NaN or
; a != b, i.e. "not (a == b)", which is why the assertions expect v_cmp_neq.
4218 %r.val = fcmp une <2 x half> %a.val, %b.val
; sext turns each i1 lane into 0 or -1, matching the v_cndmask 0, -1 selects.
4219 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
4220 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; fcmp_v2f16_nlt: loads one <2 x half> from %a and one from %b, compares them
; lane-wise with `fcmp uge`, sign-extends the <2 x i1> result to <2 x i32> and
; stores it to %r.
; The autogenerated assertions below expect one v_cmp_nlt per lane (the second
; lane is extracted with a 16-bit right shift): f32 compares after
; v_cvt_f32_f16 on the SI run, f16 compares on every other run.
4224 define amdgpu_kernel void @fcmp_v2f16_nlt(
4225 ; SI-LABEL: fcmp_v2f16_nlt:
4226 ; SI: ; %bb.0: ; %entry
4227 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
4228 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
4229 ; SI-NEXT: s_mov_b32 s11, 0xf000
4230 ; SI-NEXT: s_mov_b32 s10, -1
4231 ; SI-NEXT: s_mov_b32 s14, s10
4232 ; SI-NEXT: s_mov_b32 s15, s11
4233 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4234 ; SI-NEXT: s_mov_b32 s12, s2
4235 ; SI-NEXT: s_mov_b32 s13, s3
4236 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
4237 ; SI-NEXT: s_mov_b32 s6, s10
4238 ; SI-NEXT: s_mov_b32 s7, s11
4239 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
4240 ; SI-NEXT: s_mov_b32 s8, s0
4241 ; SI-NEXT: s_mov_b32 s9, s1
4242 ; SI-NEXT: s_waitcnt vmcnt(1)
4243 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
4244 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
4245 ; SI-NEXT: s_waitcnt vmcnt(0)
4246 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
4247 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4248 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
4249 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4250 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v3
4251 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
4252 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1
4253 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
4254 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
4257 ; VI-LABEL: fcmp_v2f16_nlt:
4258 ; VI: ; %bb.0: ; %entry
4259 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4260 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
4261 ; VI-NEXT: s_mov_b32 s7, 0xf000
4262 ; VI-NEXT: s_mov_b32 s6, -1
4263 ; VI-NEXT: s_mov_b32 s10, s6
4264 ; VI-NEXT: s_mov_b32 s11, s7
4265 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4266 ; VI-NEXT: s_mov_b32 s12, s2
4267 ; VI-NEXT: s_mov_b32 s13, s3
4268 ; VI-NEXT: s_mov_b32 s14, s6
4269 ; VI-NEXT: s_mov_b32 s15, s7
4270 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
4271 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
4272 ; VI-NEXT: s_mov_b32 s4, s0
4273 ; VI-NEXT: s_mov_b32 s5, s1
4274 ; VI-NEXT: s_waitcnt vmcnt(1)
4275 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
4276 ; VI-NEXT: s_waitcnt vmcnt(0)
4277 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
4278 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0
4279 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
4280 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2
4281 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
4282 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4285 ; GFX11-TRUE16-LABEL: fcmp_v2f16_nlt:
4286 ; GFX11-TRUE16: ; %bb.0: ; %entry
4287 ; GFX11-TRUE16-NEXT: s_clause 0x1
4288 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
4289 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
4290 ; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
4291 ; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
4292 ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
4293 ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
4294 ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
4295 ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
4296 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
4297 ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
4298 ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
4299 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
4300 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
4301 ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
4302 ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
4303 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
4304 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
4305 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
4306 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
4307 ; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l
4308 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
4309 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
4310 ; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3.l, v2.l
4311 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4312 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
4313 ; GFX11-TRUE16-NEXT: s_endpgm
4315 ; GFX11-FAKE16-LABEL: fcmp_v2f16_nlt:
4316 ; GFX11-FAKE16: ; %bb.0: ; %entry
4317 ; GFX11-FAKE16-NEXT: s_clause 0x1
4318 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
4319 ; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
4320 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
4321 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
4322 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
4323 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
4324 ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
4325 ; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
4326 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
4327 ; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
4328 ; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
4329 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0
4330 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
4331 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
4332 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
4333 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
4334 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
4335 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
4336 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
4337 ; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0
4338 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
4339 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
4340 ; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2
4341 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4342 ; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
4343 ; GFX11-FAKE16-NEXT: s_endpgm
4345 ; GFX12-LABEL: fcmp_v2f16_nlt:
4346 ; GFX12: ; %bb.0: ; %entry
4347 ; GFX12-NEXT: s_clause 0x1
4348 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
4349 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
4350 ; GFX12-NEXT: s_mov_b32 s10, -1
4351 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
4352 ; GFX12-NEXT: s_mov_b32 s6, s10
4353 ; GFX12-NEXT: s_mov_b32 s7, s11
4354 ; GFX12-NEXT: s_mov_b32 s14, s10
4355 ; GFX12-NEXT: s_mov_b32 s15, s11
4356 ; GFX12-NEXT: s_wait_kmcnt 0x0
4357 ; GFX12-NEXT: s_mov_b32 s12, s2
4358 ; GFX12-NEXT: s_mov_b32 s13, s3
4359 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
4360 ; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
4361 ; GFX12-NEXT: s_mov_b32 s8, s0
4362 ; GFX12-NEXT: s_mov_b32 s9, s1
4363 ; GFX12-NEXT: s_wait_loadcnt 0x1
4364 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
4365 ; GFX12-NEXT: s_wait_loadcnt 0x0
4366 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
4367 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0
4368 ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
4369 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
4370 ; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2
4371 ; GFX12-NEXT: s_wait_alu 0xfffd
4372 ; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
4373 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
4374 ; GFX12-NEXT: s_endpgm
4375 ptr addrspace(1) %r,
4376 ptr addrspace(1) %a,
4377 ptr addrspace(1) %b) {
4379 %a.val = load <2 x half>, ptr addrspace(1) %a
4380 %b.val = load <2 x half>, ptr addrspace(1) %b
; uge = unordered or greater than or equal: true when either lane operand is
; NaN or a >= b, i.e. "not (a < b)", which is why the assertions expect
; v_cmp_nlt.
4381 %r.val = fcmp uge <2 x half> %a.val, %b.val
; sext turns each i1 lane into 0 or -1, matching the v_cndmask 0, -1 selects.
4382 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
4383 store <2 x i32> %r.val.sext, ptr addrspace(1) %r
; Declaration of the half-precision fabs intrinsic, tagged with attribute
; group #1.
; NOTE(review): no call to it appears in the v2f16 tests directly above;
; presumably it is used by tests earlier in the file - confirm.
4387 declare half @llvm.fabs.f16(half) #1
; Attribute groups. #1 is referenced by the declaration above.
; NOTE(review): the kernels visible nearby carry no "#0" reference - confirm
; it is still used elsewhere in the file.
4389 attributes #0 = { nounwind }
4390 attributes #1 = { nounwind readnone }