1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s
4 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
5 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-CI %s
7 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s
8 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GISEL-GFX9 %s
9 ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
10 ; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-CI %s
; Extends three half arguments to float, combines them with llvm.fmuladd.f32,
; truncates the result back to half and inserts it into element 1 of a
; <2 x half> whose element 0 is undef.
; The GFX11 / GFX9 check lines expect a single v_fma_mixhi_f16 /
; v_mad_mixhi_f16; the VI and CI check lines expect explicit v_cvt
; instructions around v_mac_f32.
12 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %src1, half %src2) #0 {
13 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
15 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16 ; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
17 ; GFX11-NEXT: s_setpc_b64 s[30:31]
19 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
21 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22 ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
23 ; GFX9-NEXT: s_setpc_b64 s[30:31]
25 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
27 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
29 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
30 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
31 ; VI-NEXT: v_mac_f32_e32 v2, v0, v1
32 ; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
33 ; VI-NEXT: s_setpc_b64 s[30:31]
35 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
37 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
39 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
40 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
41 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
43 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
45 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
47 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
48 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
49 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
50 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2
51 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
52 %src0.ext = fpext half %src0 to float
53 %src1.ext = fpext half %src1 to float
54 %src2.ext = fpext half %src2 to float
55 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
56 %cvt.result = fptrunc float %result to half
57 %vec.result = insertelement <2 x half> undef, half %cvt.result, i32 1
58 ret <2 x half> %vec.result
; Same fpext / fmuladd / fptrunc chain as the undef-low variant, but the
; truncated result is inserted into element 1 of a vector whose element 0 is
; the constant half 1.0.
; The GFX11 / GFX9 check lines expect the constant (0x3c00) to be moved into
; the destination register before the mixhi instruction writes it.
61 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %src1, half %src2) #0 {
62 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
64 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
65 ; GFX11-NEXT: v_mov_b32_e32 v3, 0x3c00
66 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
67 ; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
68 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
69 ; GFX11-NEXT: s_setpc_b64 s[30:31]
71 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
73 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00
75 ; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
76 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
77 ; GFX9-NEXT: s_setpc_b64 s[30:31]
79 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
81 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
83 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
84 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
85 ; VI-NEXT: v_mac_f32_e32 v2, v0, v1
86 ; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
87 ; VI-NEXT: v_or_b32_e32 v0, 0x3c00, v0
88 ; VI-NEXT: s_setpc_b64 s[30:31]
90 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
92 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
94 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
95 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
96 ; SDAG-CI-NEXT: v_mov_b32_e32 v0, 1.0
97 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
99 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
101 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
103 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
104 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
105 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
106 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2
107 ; GISEL-CI-NEXT: v_mov_b32_e32 v0, 0x3c00
108 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
109 %src0.ext = fpext half %src0 to float
110 %src1.ext = fpext half %src1 to float
111 %src2.ext = fpext half %src2 to float
112 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
113 %cvt.result = fptrunc float %result to half
114 %vec.result = insertelement <2 x half> <half 1.0, half undef>, half %cvt.result, i32 1
115 ret <2 x half> %vec.result
; Same fpext / fmuladd / fptrunc chain, but element 0 of the returned vector
; comes from the extra %lo argument and the truncated result is inserted into
; element 1.
; The GFX11 / GFX9 check lines expect the mixhi instruction to write directly
; into v3 and then copy v3 to v0.
118 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 {
119 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
121 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
122 ; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
123 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
124 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
125 ; GFX11-NEXT: s_setpc_b64 s[30:31]
127 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
129 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130 ; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
131 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
132 ; GFX9-NEXT: s_setpc_b64 s[30:31]
134 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
136 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
138 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
139 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
140 ; VI-NEXT: v_mac_f32_e32 v2, v0, v1
141 ; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
142 ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
143 ; VI-NEXT: s_setpc_b64 s[30:31]
145 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
147 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
148 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
149 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
150 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
151 ; SDAG-CI-NEXT: v_mov_b32_e32 v0, v3
152 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
154 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
156 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
158 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
159 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
160 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
161 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v2
162 ; GISEL-CI-NEXT: v_mov_b32_e32 v0, v3
163 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
164 %src0.ext = fpext half %src0 to float
165 %src1.ext = fpext half %src1 to float
166 %src2.ext = fpext half %src2 to float
167 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
168 %cvt.result = fptrunc float %result to half
169 %vec = insertelement <2 x half> undef, half %lo, i32 0
170 %vec.result = insertelement <2 x half> %vec, half %cvt.result, i32 1
171 ret <2 x half> %vec.result
; Integer-packing variant: the truncated half result is bitcast to i16,
; zero-extended to i32 and shifted left by 16, instead of being placed in the
; high half with insertelement.
; The SDAG and GISel GFX11 / GFX9 check lines expect a mixlo instruction
; followed by a shift (GISel additionally masks with 0xffff), not a mixhi.
; Note: despite its name, %shr holds the result of a shl.
; NOTE(review): the instruction returning the i32 is not visible in this
; excerpt.
174 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 {
175 ; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
176 ; SDAG-GFX11: ; %bb.0:
177 ; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178 ; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
179 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
180 ; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
181 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
183 ; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
184 ; SDAG-GFX9: ; %bb.0:
185 ; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186 ; SDAG-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
187 ; SDAG-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
188 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
190 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
192 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
194 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
195 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
196 ; VI-NEXT: v_mac_f32_e32 v2, v0, v1
197 ; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
198 ; VI-NEXT: s_setpc_b64 s[30:31]
200 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
202 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
203 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
204 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
205 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
206 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
208 ; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
209 ; GISEL-GFX11: ; %bb.0:
210 ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
211 ; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
212 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
213 ; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
214 ; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
215 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
217 ; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
218 ; GISEL-GFX9: ; %bb.0:
219 ; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220 ; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
221 ; GISEL-GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
222 ; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
223 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
225 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
227 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
229 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
230 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
231 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
232 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
233 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
234 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
235 %src0.ext = fpext half %src0 to float
236 %src1.ext = fpext half %src1 to float
237 %src2.ext = fpext half %src2 to float
238 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
239 %cvt.result = fptrunc float %result to half
240 %bc = bitcast half %cvt.result to i16
241 %ext = zext i16 %bc to i32
242 %shr = shl i32 %ext, 16
; Like the zext integer-packing variant, except the i16 bit pattern of the
; truncated result is sign-extended to i32 before the shift left by 16.
; This function has separate SDAG-VI / GISEL-VI check blocks because the two
; selectors' expected output differs on VI: GISel is expected to emit an SDWA
; shift with a sext() source modifier.
; Note: despite its name, %shr holds the result of a shl.
; NOTE(review): the instruction returning the i32 is not visible in this
; excerpt.
246 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 {
247 ; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
248 ; SDAG-GFX11: ; %bb.0:
249 ; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250 ; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
251 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
252 ; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
253 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
255 ; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
256 ; SDAG-GFX9: ; %bb.0:
257 ; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
258 ; SDAG-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
259 ; SDAG-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
260 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
262 ; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
264 ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
266 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
267 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
268 ; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1
269 ; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
270 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
272 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
274 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
275 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
276 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
277 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
278 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
280 ; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
281 ; GISEL-GFX11: ; %bb.0:
282 ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283 ; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
284 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
285 ; GISEL-GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
286 ; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
287 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
289 ; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
290 ; GISEL-GFX9: ; %bb.0:
291 ; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292 ; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
293 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 16
294 ; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
295 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
297 ; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
299 ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
301 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
302 ; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
303 ; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
304 ; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
305 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16
306 ; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
307 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
309 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
311 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
313 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
314 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
315 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
316 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
317 ; GISEL-CI-NEXT: v_bfe_i32 v0, v0, 0, 16
318 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
319 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
320 %src0.ext = fpext half %src0 to float
321 %src1.ext = fpext half %src1 to float
322 %src2.ext = fpext half %src2 to float
323 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
324 %cvt.result = fptrunc float %result to half
325 %bc = bitcast half %cvt.result to i16
326 %ext = sext i16 %bc to i32
327 %shr = shl i32 %ext, 16
; Clamp-before-convert variant: the f32 fmuladd result is clamped with
; maxnum(x, 0.0) then minnum(x, 1.0) while still a float, and only then
; truncated to half and inserted into element 1 of an undef <2 x half>.
; The GFX11 / GFX9 check lines expect the clamp on the f32 mix instruction
; (v_fma_mix_f32 / v_mad_mix_f32 ... clamp) followed by a separate f16
; conversion, not a mixhi instruction.
331 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %src0, half %src1, half %src2) #0 {
332 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
334 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335 ; GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
336 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
337 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
338 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
339 ; GFX11-NEXT: s_setpc_b64 s[30:31]
341 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
343 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344 ; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
345 ; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
346 ; GFX9-NEXT: s_setpc_b64 s[30:31]
348 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
350 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
351 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
352 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
353 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
354 ; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
355 ; VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
356 ; VI-NEXT: s_setpc_b64 s[30:31]
358 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
360 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
361 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
362 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
363 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
364 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
366 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
368 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
369 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
370 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
371 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
372 ; GISEL-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
373 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0
374 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
375 %src0.ext = fpext half %src0 to float
376 %src1.ext = fpext half %src1 to float
377 %src2.ext = fpext half %src2 to float
378 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
379 %max = call float @llvm.maxnum.f32(float %result, float 0.0)
380 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
381 %cvt.result = fptrunc float %clamp to half
382 %vec.result = insertelement <2 x half> undef, half %cvt.result, i32 1
383 ret <2 x half> %vec.result
; Clamp-after-convert variant: the fmuladd result is truncated to half first,
; then clamped with the f16 maxnum(x, 0.0) / minnum(x, 1.0) intrinsics, and the
; clamped half is inserted into element 1 of an undef <2 x half>.
; The GFX11 / GFX9 check lines expect the whole sequence to become one mixhi
; instruction carrying the clamp modifier.
386 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half %src0, half %src1, half %src2) #0 {
387 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
389 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
390 ; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
391 ; GFX11-NEXT: s_setpc_b64 s[30:31]
393 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
395 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
396 ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
397 ; GFX9-NEXT: s_setpc_b64 s[30:31]
399 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
401 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
402 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
403 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
404 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
405 ; VI-NEXT: v_mac_f32_e32 v2, v0, v1
406 ; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
407 ; VI-NEXT: s_setpc_b64 s[30:31]
409 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
411 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
412 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
413 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
414 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp
415 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
417 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
419 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
420 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
421 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
422 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
423 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
424 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
425 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
426 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
427 ; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1
428 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
429 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0
430 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
431 ; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1
432 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0
433 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
434 %src0.ext = fpext half %src0 to float
435 %src1.ext = fpext half %src1 to float
436 %src2.ext = fpext half %src2 to float
437 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
438 %cvt.result = fptrunc float %result to half
439 %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
440 %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
441 %vec.result = insertelement <2 x half> undef, half %clamp, i32 1
442 ret <2 x half> %vec.result
; Multi-use version of the clamp-after-convert case: the unclamped truncated
; half is also written with a volatile store (to an undef addrspace(1)
; pointer), so it has a second user besides the maxnum/minnum clamp.
; The GFX11 / GFX9 check lines expect two mix instructions: an unclamped mixlo
; feeding the store and a clamped mixhi producing the returned vector.
445 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
446 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
448 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
449 ; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
450 ; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
451 ; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
452 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
453 ; GFX11-NEXT: s_setpc_b64 s[30:31]
455 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
457 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458 ; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
459 ; GFX9-NEXT: global_store_short v[0:1], v3, off
460 ; GFX9-NEXT: s_waitcnt vmcnt(0)
461 ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
462 ; GFX9-NEXT: s_setpc_b64 s[30:31]
464 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
466 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
468 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
469 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
470 ; VI-NEXT: v_mac_f32_e32 v2, v0, v1
471 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v2
472 ; VI-NEXT: flat_store_short v[0:1], v0
473 ; VI-NEXT: s_waitcnt vmcnt(0)
474 ; VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
475 ; VI-NEXT: s_setpc_b64 s[30:31]
477 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
479 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
480 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
481 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
482 ; SDAG-CI-NEXT: s_mov_b32 s7, 0xf000
483 ; SDAG-CI-NEXT: s_mov_b32 s6, -1
484 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp
485 ; SDAG-CI-NEXT: buffer_store_short v0, off, s[4:7], 0
486 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0)
487 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
489 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
491 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
492 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
493 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
494 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
495 ; GISEL-CI-NEXT: s_mov_b32 s6, -1
496 ; GISEL-CI-NEXT: s_mov_b32 s7, 0xf000
497 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
498 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
499 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
500 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0
501 ; GISEL-CI-NEXT: buffer_store_short v0, off, s[4:7], 0
502 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0)
503 ; GISEL-CI-NEXT: v_max_f32_e32 v1, v2, v1
504 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
505 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
506 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
507 ; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
508 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
509 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
510 %src0.ext = fpext half %src0 to float
511 %src1.ext = fpext half %src1 to float
512 %src2.ext = fpext half %src2 to float
513 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
514 %cvt.result = fptrunc float %result to half
515 store volatile half %cvt.result, ptr addrspace(1) undef
516 %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
517 %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
518 %vec.result = insertelement <2 x half> undef, half %clamp, i32 1
519 ret <2 x half> %vec.result
; Intrinsic declarations referenced by the test functions in this file; all of
; them carry attribute group #1.
522 declare half @llvm.minnum.f16(half, half) #1
523 declare half @llvm.maxnum.f16(half, half) #1
524 declare float @llvm.minnum.f32(float, float) #1
525 declare float @llvm.maxnum.f32(float, float) #1
526 declare float @llvm.fmuladd.f32(float, float, float) #1
; NOTE(review): no function visible in this excerpt calls the v2f32 variant.
527 declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
; #0 is attached to every test function: nounwind, with the f32 denormal mode
; string set to "preserve-sign,preserve-sign".
; NOTE(review): presumably required for the mad/fma-mix patterns to be
; selected - confirm against the AMDGPU backend before changing it.
529 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; #1 is attached to the intrinsic declarations.
530 attributes #1 = { nounwind readnone speculatable }