1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; Declarations of the two minnum intrinsic overloads called by the kernels in
; this file: a scalar half variant and a <2 x half> vector variant.
4 declare half @llvm.minnum.f16(half %a, half %b)
5 declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
; Kernel minnum_f16: loads one half from %a and one from %b, calls
; llvm.minnum.f16 on the two values and stores the result to %r.
;
; Expected output, as encoded by the check lines below:
;  * both run configurations load the operands with buffer_load_ushort and
;    store the result with buffer_store_short;
;  * under the SI prefix (tahiti run line) each operand is converted to f32,
;    the minimum is taken with v_min_f32 and the result is converted back to
;    f16;
;  * under the VI prefix (fiji run line) a single v_min_f16 on the loaded
;    registers is expected.
7 ; GCN-LABEL: {{^}}minnum_f16:
8 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
9 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
10 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
11 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
12 ; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
13 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
14 ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
15 ; GCN: buffer_store_short v[[R_F16]]
17 define amdgpu_kernel void @minnum_f16(
18 half addrspace(1)* %r,
19 half addrspace(1)* %a,
20 half addrspace(1)* %b) {
22 %a.val = load half, half addrspace(1)* %a
23 %b.val = load half, half addrspace(1)* %b
24 %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
25 store half %r.val, half addrspace(1)* %r
; Kernel minnum_f16_imm_a: same as the scalar case, but the first intrinsic
; operand is the constant 3.0 and only %b is loaded from memory.
;
; Expected output, as encoded by the check lines below:
;  * under the SI prefix the constant appears as the literal 0x40400000
;    (3.0 encoded as f32) in v_min_f32, after %b has been converted to f32;
;  * under the VI prefix the constant appears as the literal 0x4200
;    (3.0 encoded as f16) in v_min_f16.
29 ; GCN-LABEL: {{^}}minnum_f16_imm_a:
30 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
31 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
32 ; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
33 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
34 ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
35 ; GCN: buffer_store_short v[[R_F16]]
37 define amdgpu_kernel void @minnum_f16_imm_a(
38 half addrspace(1)* %r,
39 half addrspace(1)* %b) {
41 %b.val = load half, half addrspace(1)* %b
42 %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
43 store half %r.val, half addrspace(1)* %r
; Kernel minnum_f16_imm_b: the second intrinsic operand is the constant 4.0
; and only %a is loaded from memory.
;
; Expected output, as encoded by the check lines below: for both the SI and
; the VI prefix the constant is printed directly as 4.0 and is expected as the
; first source operand of the min instruction, even though it is the second
; operand of the intrinsic call.
47 ; GCN-LABEL: {{^}}minnum_f16_imm_b:
48 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
49 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
50 ; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
51 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
52 ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
53 ; GCN: buffer_store_short v[[R_F16]]
55 define amdgpu_kernel void @minnum_f16_imm_b(
56 half addrspace(1)* %r,
57 half addrspace(1)* %a) {
59 %a.val = load half, half addrspace(1)* %a
60 %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
61 store half %r.val, half addrspace(1)* %r
; Kernel minnum_v2f16: loads a <2 x half> from %a and from %b, calls
; llvm.minnum.v2f16 on them and stores the <2 x half> result to %r.
;
; Expected output, as encoded by the check lines below:
;  * both run configurations load each vector with one buffer_load_dword and
;    store the result with one buffer_store_dword;
;  * under the SI prefix each dword is split (a 16-bit right shift yields the
;    second element), all four elements are converted to f32, two v_min_f32
;    are issued, both results are converted back to f16, and the second result
;    is shifted left by 16 and OR-ed with the first;
;  * under the VI prefix one v_min_f16_e32 on the full registers and one
;    v_min_f16_sdwa selecting WORD_1 for both sources and the destination are
;    expected (in either order), joined with v_or_b32.
65 ; GCN-LABEL: {{^}}minnum_v2f16:
66 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
67 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
69 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
70 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
71 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
72 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
73 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
74 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
75 ; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
76 ; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
77 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
78 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
79 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
81 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
83 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
84 ; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
86 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
88 ; GCN: buffer_store_dword v[[R_V2_F16]]
90 define amdgpu_kernel void @minnum_v2f16(
91 <2 x half> addrspace(1)* %r,
92 <2 x half> addrspace(1)* %a,
93 <2 x half> addrspace(1)* %b) {
95 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
96 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
97 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
98 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; Kernel minnum_v2f16_imm_a: the first intrinsic operand is the constant
; vector <3.0, 4.0>; only %b is loaded from memory.
;
; Expected output, as encoded by the check lines below:
;  * under the SI prefix the first element is combined with the literal
;    0x40400000 (3.0 as f32) and the second element with the operand 4.0; the
;    second result is shifted left by 16 before the final OR;
;  * under the VI prefix 0x4400 (4.0 as f16) is first moved into a register
;    and used as src1 of a v_min_f16_sdwa that reads WORD_1 of the loaded
;    value, while a v_min_f16_e32 takes the literal 0x4200 (3.0 as f16);
;  * both configurations finish with v_or_b32 of the two partial results and
;    a buffer_store_dword.
102 ; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
103 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
105 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
106 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
107 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
108 ; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
109 ; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
110 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
111 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
113 ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
114 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
115 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
117 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
119 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
120 ; GCN: buffer_store_dword v[[R_V2_F16]]
122 define amdgpu_kernel void @minnum_v2f16_imm_a(
123 <2 x half> addrspace(1)* %r,
124 <2 x half> addrspace(1)* %b) {
126 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
127 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
128 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; Kernel minnum_v2f16_imm_b: the second intrinsic operand is the constant
; vector <4.0, 3.0>; only %a is loaded from memory.
;
; Expected output, as encoded by the check lines below:
;  * under the SI prefix the first element is combined with the operand 4.0
;    and the second element with the literal 0x40400000 (3.0 as f32); the
;    second result is shifted left by 16 before the final OR;
;  * under the VI prefix 0x4200 (3.0 as f16) is first moved into a register
;    and used as src1 of a v_min_f16_sdwa that reads WORD_1 of the loaded
;    value, while a v_min_f16_e32 takes the operand 4.0;
;  * both configurations finish with v_or_b32 of the two partial results and
;    a buffer_store_dword.
;
; Every FileCheck variable used below is captured inside this function. The
; only value loaded here is A_V2_F16, so the SDWA check must reference it and
; not a variable left over from the checks of an earlier function.
132 ; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
133 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
134 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
135 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
136 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
137 ; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
138 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
139 ; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
140 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
141 ; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
142 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
143 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
145 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
147 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
148 ; GCN: buffer_store_dword v[[R_V2_F16]]
150 define amdgpu_kernel void @minnum_v2f16_imm_b(
151 <2 x half> addrspace(1)* %r,
152 <2 x half> addrspace(1)* %a) {
154 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
155 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
156 store <2 x half> %r.val, <2 x half> addrspace(1)* %r