; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; Scalar f16 ordered less-than (fcmp olt). Both operands are loaded with
; volatile loads; the i1 result is sign-extended to i32 and stored through %r.
; The SI run expects both operands converted to f32 before the compare, while
; the VI run expects a native f16 compare.
; GCN-LABEL: {{^}}fcmp_f16_lt:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_lt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp olt half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 ordered less-than (fcmp olt) where both operands first pass
; through llvm.fabs.f16. The checks expect the absolute value to appear as a
; |src| source modifier: on the f32 conversions for the SI run, and directly
; on the f16 compare (e64 encoding) for the VI run.
; GCN-LABEL: {{^}}fcmp_f16_lt_abs:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]

; SI: v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]|
; SI: v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]|

; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]|

; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_lt_abs(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %a.abs = call half @llvm.fabs.f16(half %a.val)
  %b.abs = call half @llvm.fabs.f16(half %b.val)
  %r.val = fcmp olt half %a.abs, %b.abs
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 ordered equal (fcmp oeq). The i1 result is sign-extended to i32
; and stored through %r. The SI run expects an f32 compare after conversion;
; the VI run expects a native f16 compare.
; GCN-LABEL: {{^}}fcmp_f16_eq:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_eq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_eq(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp oeq half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 ordered less-or-equal (fcmp ole). The i1 result is sign-extended
; to i32 and stored through %r. The SI run expects an f32 compare after
; conversion; the VI run expects a native f16 compare.
; GCN-LABEL: {{^}}fcmp_f16_le:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_le_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_le_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_le(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ole half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 ordered greater-than (fcmp ogt). The i1 result is sign-extended
; to i32 and stored through %r. The SI run expects an f32 compare after
; conversion; the VI run expects a native f16 compare.
; GCN-LABEL: {{^}}fcmp_f16_gt:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_gt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_gt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ogt half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 ordered not-equal (fcmp one), checked as the "lg"
; (less-or-greater) compare. The i1 result is sign-extended to i32 and stored
; through %r. The SI run expects an f32 compare after conversion; the VI run
; expects a native f16 compare.
; GCN-LABEL: {{^}}fcmp_f16_lg:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_lg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_lg(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp one half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 ordered greater-or-equal (fcmp oge). The i1 result is
; sign-extended to i32 and stored through %r. The SI run expects an f32
; compare after conversion; the VI run expects a native f16 compare.
; GCN-LABEL: {{^}}fcmp_f16_ge:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_ge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_ge(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp oge half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 "ordered" test (fcmp ord): true when neither operand is NaN.
; The i1 result is sign-extended to i32 and stored through %r. The SI run
; expects an f32 compare after conversion; the VI run expects a native f16
; compare.
; GCN-LABEL: {{^}}fcmp_f16_o:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_o_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_o_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_o(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ord half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 "unordered" test (fcmp uno): true when either operand is NaN.
; The i1 result is sign-extended to i32 and stored through %r. The SI run
; expects an f32 compare after conversion; the VI run expects a native f16
; compare.
; GCN-LABEL: {{^}}fcmp_f16_u:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_u_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_u_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_u(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp uno half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 unordered-or-less-than (fcmp ult), checked as the "nge"
; (not greater-or-equal) compare. The i1 result is sign-extended to i32 and
; stored through %r. The SI run expects an f32 compare after conversion; the
; VI run expects a native f16 compare.
; GCN-LABEL: {{^}}fcmp_f16_nge:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_nge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_nge(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ult half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 unordered-or-equal (fcmp ueq), checked as the "nlg"
; (not less-or-greater) compare. The i1 result is sign-extended to i32 and
; stored through %r. The SI run expects an f32 compare after conversion; the
; VI run expects a native f16 compare.
; GCN-LABEL: {{^}}fcmp_f16_nlg:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_nlg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_nlg(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ueq half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 unordered-or-less-or-equal (fcmp ule), checked as the "ngt"
; (not greater-than) compare. The i1 result is sign-extended to i32 and
; stored through %r. The SI run expects an f32 compare after conversion; the
; VI run expects a native f16 compare.
; GCN-LABEL: {{^}}fcmp_f16_ngt:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_ngt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_ngt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ule half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 unordered-or-greater-than (fcmp ugt), checked as the "nle"
; (not less-or-equal) compare. The i1 result is sign-extended to i32 and
; stored through %r. The SI run expects an f32 compare after conversion; the
; VI run expects a native f16 compare.
; GCN-LABEL: {{^}}fcmp_f16_nle:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_nle(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp ugt half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 unordered-or-not-equal (fcmp une), checked as the "neq" compare.
; The i1 result is sign-extended to i32 and stored through %r. The SI run
; expects an f32 compare after conversion; the VI run expects a native f16
; compare.
; GCN-LABEL: {{^}}fcmp_f16_neq:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_neq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_neq(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp une half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; Scalar f16 unordered-or-greater-or-equal (fcmp uge), checked as the "nlt"
; (not less-than) compare. The i1 result is sign-extended to i32 and stored
; through %r. The SI run expects an f32 compare after conversion; the VI run
; expects a native f16 compare.
; GCN-LABEL: {{^}}fcmp_f16_nlt:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]

define amdgpu_kernel void @fcmp_f16_nlt(
    i32 addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fcmp uge half %a.val, %b.val
  %r.val.sext = sext i1 %r.val to i32
  store i32 %r.val.sext, i32 addrspace(1)* %r
  ret void
}
; <2 x half> ordered less-than (fcmp olt). The <2 x i1> result is
; sign-extended to <2 x i32> and stored through %r. The checks expect two
; scalar compares: f32 in the SI run, f16 in the VI run.
; GCN-LABEL: {{^}}fcmp_v2f16_lt:
; SI: v_cmp_lt_f32_e32 vcc,
; SI: v_cmp_lt_f32_e32 vcc,

; VI: v_cmp_lt_f16_e32 vcc,
; VI: v_cmp_lt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp olt <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> ordered equal (fcmp oeq). The <2 x i1> result is sign-extended
; to <2 x i32> and stored through %r. The checks expect two scalar compares:
; f32 in the SI run, f16 in the VI run.
; GCN-LABEL: {{^}}fcmp_v2f16_eq:
; SI: v_cmp_eq_f32_e32 vcc,
; SI: v_cmp_eq_f32_e32 vcc,

; VI: v_cmp_eq_f16_e32 vcc,
; VI: v_cmp_eq_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_eq(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp oeq <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> ordered less-or-equal (fcmp ole). The <2 x i1> result is
; sign-extended to <2 x i32> and stored through %r. The checks expect two
; scalar compares: f32 in the SI run, f16 in the VI run.
; GCN-LABEL: {{^}}fcmp_v2f16_le:
; SI: v_cmp_le_f32_e32 vcc
; SI: v_cmp_le_f32_e32 vcc
; VI: v_cmp_le_f16_e32 vcc
; VI: v_cmp_le_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_le(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ole <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> ordered greater-than (fcmp ogt). The <2 x i1> result is
; sign-extended to <2 x i32> and stored through %r. The checks expect two
; scalar compares: f32 in the SI run, f16 in the VI run.
; GCN-LABEL: {{^}}fcmp_v2f16_gt:
; SI: v_cmp_gt_f32_e32 vcc,
; SI: v_cmp_gt_f32_e32 vcc,

; VI: v_cmp_gt_f16_e32 vcc,
; VI: v_cmp_gt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_gt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ogt <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> ordered not-equal (fcmp one), checked as the "lg" compare. The
; <2 x i1> result is sign-extended to <2 x i32> and stored through %r. The
; checks expect two scalar compares: f32 in the SI run, f16 in the VI run.
; GCN-LABEL: {{^}}fcmp_v2f16_lg:
; SI: v_cmp_lg_f32_e32 vcc,
; SI: v_cmp_lg_f32_e32 vcc,

; VI: v_cmp_lg_f16_e32 vcc,
; VI: v_cmp_lg_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lg(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp one <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> ordered greater-or-equal (fcmp oge). The <2 x i1> result is
; sign-extended to <2 x i32> and stored through %r. The checks expect two
; scalar compares: f32 in the SI run, f16 in the VI run.
; GCN-LABEL: {{^}}fcmp_v2f16_ge:
; SI: v_cmp_ge_f32_e32 vcc,
; SI: v_cmp_ge_f32_e32 vcc,

; VI: v_cmp_ge_f16_e32 vcc,
; VI: v_cmp_ge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ge(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp oge <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> "ordered" test (fcmp ord): per lane, true when neither operand
; is NaN. The <2 x i1> result is sign-extended to <2 x i32> and stored through
; %r. The checks expect two scalar compares: f32 in the SI run, f16 in the VI
; run.
; GCN-LABEL: {{^}}fcmp_v2f16_o:
; SI: v_cmp_o_f32_e32 vcc,
; SI: v_cmp_o_f32_e32 vcc,

; VI: v_cmp_o_f16_e32 vcc,
; VI: v_cmp_o_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_o(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ord <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> "unordered" test (fcmp uno): per lane, true when either operand
; is NaN. The <2 x i1> result is sign-extended to <2 x i32> and stored through
; %r. The checks expect two scalar compares: f32 in the SI run, f16 in the VI
; run.
; GCN-LABEL: {{^}}fcmp_v2f16_u:
; SI: v_cmp_u_f32_e32 vcc,
; SI: v_cmp_u_f32_e32 vcc,

; VI: v_cmp_u_f16_e32 vcc,
; VI: v_cmp_u_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_u(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp uno <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> unordered-or-less-than (fcmp ult), checked as the "nge" compare.
; The <2 x i1> result is sign-extended to <2 x i32> and stored through %r. The
; checks expect two scalar compares: f32 in the SI run, f16 in the VI run.
; GCN-LABEL: {{^}}fcmp_v2f16_nge:
; SI: v_cmp_nge_f32_e32 vcc,
; SI: v_cmp_nge_f32_e32 vcc,

; VI: v_cmp_nge_f16_e32 vcc,
; VI: v_cmp_nge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_nge(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ult <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> unordered-or-equal (fcmp ueq), checked as the "nlg" compare. The
; <2 x i1> result is sign-extended to <2 x i32> and stored through %r. The
; checks expect two scalar compares: f32 in the SI run, f16 in the VI run.
; GCN-LABEL: {{^}}fcmp_v2f16_nlg:
; SI: v_cmp_nlg_f32_e32 vcc
; SI: v_cmp_nlg_f32_e32 vcc

; VI: v_cmp_nlg_f16_e32 vcc
; VI: v_cmp_nlg_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_nlg(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ueq <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> unordered-or-less-or-equal (fcmp ule), checked as the "ngt"
; compare. The <2 x i1> result is sign-extended to <2 x i32> and stored
; through %r. The checks expect two scalar compares: f32 in the SI run, f16
; in the VI run.
; GCN-LABEL: {{^}}fcmp_v2f16_ngt:
; SI: v_cmp_ngt_f32_e32 vcc,
; SI: v_cmp_ngt_f32_e32 vcc,

; VI: v_cmp_ngt_f16_e32 vcc,
; VI: v_cmp_ngt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ngt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ule <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> unordered-or-greater-than (fcmp ugt), checked as the "nle"
; compare. The <2 x i1> result is sign-extended to <2 x i32> and stored
; through %r. The checks expect two scalar VGPR-operand compares: f32 in the
; SI run, f16 in the VI run.
; GCN-LABEL: {{^}}fcmp_v2f16_nle:
; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}

; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_nle(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp ugt <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> unordered-or-not-equal (fcmp une), checked as the "neq" compare.
; The <2 x i1> result is sign-extended to <2 x i32> and stored through %r. The
; checks expect two scalar VGPR-operand compares: f32 in the SI run, f16 in
; the VI run.
; GCN-LABEL: {{^}}fcmp_v2f16_neq:
; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}

; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_neq(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp une <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; <2 x half> unordered-or-greater-or-equal (fcmp uge), checked as the "nlt"
; compare. This is the most detailed vector test: it follows each packed
; dword load, the 16-bit right shift that extracts the high half, the
; per-lane compare, and the final 64-bit store of both sign-extended results.
; Note that the VI checks list the compare operands as B then A, while the SI
; checks list A then B.
; GCN-LABEL: {{^}}fcmp_v2f16_nlt:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]

; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]]
; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[B_V2_F16]], v[[A_V2_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]

; VI: v_cmp_nlt_f16_e32 vcc, v[[B_F16_1]], v[[A_F16_1]]
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}

define amdgpu_kernel void @fcmp_v2f16_nlt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fcmp uge <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
  ret void
}
; Intrinsic used by @fcmp_f16_lt_abs to take the absolute value of a half.
declare half @llvm.fabs.f16(half) #1

; Attribute groups. #1 is attached to the fabs declaration above; #0 is not
; referenced by any definition visible in this part of the file.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }