1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; Denormal mode shouldn't matter for f16, check with and without flushing.
3 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-IEEE %s
4 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-FLUSH %s
6 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8,GFX8-IEEE %s
7 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8,GFX8-FLUSH %s
9 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-IEEE %s
10 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-FLUSH %s
12 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-IEEE %s
13 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-FLUSH %s
15 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-IEEE %s
16 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FLUSH %s
; Scalar half fdiv with no fast-math flags.
; Expected code per the checks below:
;  - tahiti: both operands are converted to f32 and divided with the
;    v_div_scale / v_rcp / v_fma chain / v_div_fmas / v_div_fixup sequence,
;    then converted back to f16. The preserve-sign run additionally expects
;    s_setreg_imm32_b32 writes to HW_REG_MODE around the FMA chain and before
;    the final convert.
;  - fiji and newer: an f32 reciprocal of the denominator is multiplied by the
;    numerator (separate v_mul_f32 + convert, or a single mixlo instruction)
;    and the result is passed through v_div_fixup_f16.
18 define half @v_fdiv_f16(half %a, half %b) {
19 ; GFX6-IEEE-LABEL: v_fdiv_f16:
21 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
23 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
24 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
25 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
26 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
27 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
28 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
29 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
30 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
31 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
32 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
33 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
34 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
35 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
36 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
38 ; GFX6-FLUSH-LABEL: v_fdiv_f16:
39 ; GFX6-FLUSH: ; %bb.0:
40 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
42 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
43 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
44 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
45 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
46 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
47 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
48 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
49 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
50 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
51 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
52 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
53 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
54 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
55 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
56 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
57 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
58 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
60 ; GFX8-LABEL: v_fdiv_f16:
62 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
64 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
65 ; GFX8-NEXT: v_rcp_f32_e32 v2, v2
66 ; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
67 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
68 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
69 ; GFX8-NEXT: s_setpc_b64 s[30:31]
71 ; GFX9-IEEE-LABEL: v_fdiv_f16:
73 ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
75 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
76 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
77 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2
78 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
79 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
80 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
82 ; GFX9-FLUSH-LABEL: v_fdiv_f16:
83 ; GFX9-FLUSH: ; %bb.0:
84 ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
86 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
87 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
88 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
89 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
91 ; GFX10-LABEL: v_fdiv_f16:
93 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
94 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
95 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
96 ; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
97 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
98 ; GFX10-NEXT: s_setpc_b64 s[30:31]
100 ; GFX11-LABEL: v_fdiv_f16:
102 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
103 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
104 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
105 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
106 ; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
107 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0
108 ; GFX11-NEXT: s_setpc_b64 s[30:31]
109 %fdiv = fdiv half %a, %b
; Scalar half fdiv carrying the afn fast-math flag.
; Expected code per the checks below: tahiti converts to f32, uses
; v_rcp_f32 + v_mul_f32 and converts back; fiji and newer use v_rcp_f16
; followed by v_mul_f16 with no div_fixup. The gfx1100 run also expects an
; s_waitcnt_depctr between the reciprocal and the multiply.
113 define half @v_fdiv_f16_afn(half %a, half %b) {
114 ; GFX6-LABEL: v_fdiv_f16_afn:
116 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
118 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
119 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1
120 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
121 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
122 ; GFX6-NEXT: s_setpc_b64 s[30:31]
124 ; GFX89-LABEL: v_fdiv_f16_afn:
126 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127 ; GFX89-NEXT: v_rcp_f16_e32 v1, v1
128 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
129 ; GFX89-NEXT: s_setpc_b64 s[30:31]
131 ; GFX10-LABEL: v_fdiv_f16_afn:
133 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134 ; GFX10-NEXT: v_rcp_f16_e32 v1, v1
135 ; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
136 ; GFX10-NEXT: s_setpc_b64 s[30:31]
138 ; GFX11-LABEL: v_fdiv_f16_afn:
140 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
142 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
143 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
144 ; GFX11-NEXT: s_setpc_b64 s[30:31]
145 %fdiv = fdiv afn half %a, %b
; Scalar half fdiv, "ulp25" variant.
; NOTE(review): despite the name, the fdiv visible in this function carries no
; !fpmath metadata and no fast-math flags, and the expected instruction
; sequences below are the same as for the plain v_fdiv_f16 case. Confirm
; whether an accuracy annotation was intended here.
149 define half @v_fdiv_f16_ulp25(half %a, half %b) {
150 ; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25:
151 ; GFX6-IEEE: ; %bb.0:
152 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
154 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
155 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
156 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
157 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
158 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
159 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
160 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
161 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
162 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
163 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
164 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
165 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
166 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
167 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
169 ; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25:
170 ; GFX6-FLUSH: ; %bb.0:
171 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
173 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
174 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
175 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
176 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
177 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
178 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
179 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
180 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
181 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
182 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
183 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
184 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
185 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
186 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
187 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
188 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
189 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
191 ; GFX8-LABEL: v_fdiv_f16_ulp25:
193 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
195 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
196 ; GFX8-NEXT: v_rcp_f32_e32 v2, v2
197 ; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
198 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
199 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
200 ; GFX8-NEXT: s_setpc_b64 s[30:31]
202 ; GFX9-IEEE-LABEL: v_fdiv_f16_ulp25:
203 ; GFX9-IEEE: ; %bb.0:
204 ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
205 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
206 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
207 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
208 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2
209 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
210 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
211 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
213 ; GFX9-FLUSH-LABEL: v_fdiv_f16_ulp25:
214 ; GFX9-FLUSH: ; %bb.0:
215 ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
216 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
217 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
218 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
219 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
220 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
222 ; GFX10-LABEL: v_fdiv_f16_ulp25:
224 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
225 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
226 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
227 ; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
228 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
229 ; GFX10-NEXT: s_setpc_b64 s[30:31]
231 ; GFX11-LABEL: v_fdiv_f16_ulp25:
233 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
235 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
236 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
237 ; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
238 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0
239 ; GFX11-NEXT: s_setpc_b64 s[30:31]
240 %fdiv = fdiv half %a, %b
; Negated reciprocal: divides the constant -1.0 by %x with no fast-math flags.
; Expected code per the checks below: tahiti converts the -1.0 constant and %x
; to f32 and runs the full div_scale / fma / div_fmas / div_fixup expansion;
; fiji and newer emit a single v_rcp_f16_e64 whose source operand is negated.
244 define half @v_neg_rcp_f16(half %x) {
245 ; GFX6-IEEE-LABEL: v_neg_rcp_f16:
246 ; GFX6-IEEE: ; %bb.0:
247 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
249 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
250 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
251 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
252 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
253 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
254 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
255 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
256 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
257 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
258 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
259 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
260 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
261 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
262 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
264 ; GFX6-FLUSH-LABEL: v_neg_rcp_f16:
265 ; GFX6-FLUSH: ; %bb.0:
266 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
267 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
268 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
269 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
270 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
271 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
272 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
273 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
274 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
275 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
276 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
277 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
278 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
279 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
280 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
281 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
282 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
283 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
284 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
286 ; GFX89-LABEL: v_neg_rcp_f16:
288 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289 ; GFX89-NEXT: v_rcp_f16_e64 v0, -v0
290 ; GFX89-NEXT: s_setpc_b64 s[30:31]
292 ; GFX10PLUS-LABEL: v_neg_rcp_f16:
293 ; GFX10PLUS: ; %bb.0:
294 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295 ; GFX10PLUS-NEXT: v_rcp_f16_e64 v0, -v0
296 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
297 %fdiv = fdiv half -1.0, %x
; Reciprocal: divides the constant 1.0 by %x with no fast-math flags.
; Expected code per the checks below: tahiti converts the 1.0 constant and %x
; to f32 and runs the full div_scale / fma / div_fmas / div_fixup expansion;
; fiji and newer emit a single v_rcp_f16.
301 define half @v_rcp_f16(half %x) {
302 ; GFX6-IEEE-LABEL: v_rcp_f16:
303 ; GFX6-IEEE: ; %bb.0:
304 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
306 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
307 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
308 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
309 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
310 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
311 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
312 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
313 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
314 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
315 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
316 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
317 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
318 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
319 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
321 ; GFX6-FLUSH-LABEL: v_rcp_f16:
322 ; GFX6-FLUSH: ; %bb.0:
323 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
324 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
325 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
326 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
327 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
328 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
329 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
330 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
331 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
332 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
333 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
334 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
335 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
336 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
337 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
338 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
339 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
340 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
341 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
343 ; GFX89-LABEL: v_rcp_f16:
345 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346 ; GFX89-NEXT: v_rcp_f16_e32 v0, v0
347 ; GFX89-NEXT: s_setpc_b64 s[30:31]
349 ; GFX10PLUS-LABEL: v_rcp_f16:
350 ; GFX10PLUS: ; %bb.0:
351 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
352 ; GFX10PLUS-NEXT: v_rcp_f16_e32 v0, v0
353 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
354 %fdiv = fdiv half 1.0, %x
; Reciprocal (1.0 / %x) carrying only the arcp fast-math flag.
; Expected code per the checks below: tahiti still uses the full f32
; div_scale / fma / div_fmas / div_fixup expansion; fiji and newer emit a
; single v_rcp_f16.
358 define half @v_rcp_f16_arcp(half %x) {
359 ; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
360 ; GFX6-IEEE: ; %bb.0:
361 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
363 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
364 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
365 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
366 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
367 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
368 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
369 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
370 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
371 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
372 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
373 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
374 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
375 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
376 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
378 ; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
379 ; GFX6-FLUSH: ; %bb.0:
380 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
381 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
382 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
383 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
384 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
385 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
386 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
387 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
388 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
389 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
390 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
391 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
392 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
393 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
394 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
395 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
396 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
397 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
398 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
400 ; GFX89-LABEL: v_rcp_f16_arcp:
402 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
403 ; GFX89-NEXT: v_rcp_f16_e32 v0, v0
404 ; GFX89-NEXT: s_setpc_b64 s[30:31]
406 ; GFX10PLUS-LABEL: v_rcp_f16_arcp:
407 ; GFX10PLUS: ; %bb.0:
408 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409 ; GFX10PLUS-NEXT: v_rcp_f16_e32 v0, v0
410 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
411 %fdiv = fdiv arcp half 1.0, %x
; Reciprocal (1.0 / %x) carrying both the arcp and afn fast-math flags.
; Expected code per the checks below: tahiti converts to f32 and uses
; v_rcp_f32 followed by a multiply with the converted 1.0 constant; fiji and
; newer emit a single v_rcp_f16.
415 define half @v_rcp_f16_arcp_afn(half %x) {
416 ; GFX6-LABEL: v_rcp_f16_arcp_afn:
418 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
420 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
421 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0
422 ; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
423 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
424 ; GFX6-NEXT: s_setpc_b64 s[30:31]
426 ; GFX89-LABEL: v_rcp_f16_arcp_afn:
428 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
429 ; GFX89-NEXT: v_rcp_f16_e32 v0, v0
430 ; GFX89-NEXT: s_setpc_b64 s[30:31]
432 ; GFX10PLUS-LABEL: v_rcp_f16_arcp_afn:
433 ; GFX10PLUS: ; %bb.0:
434 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
435 ; GFX10PLUS-NEXT: v_rcp_f16_e32 v0, v0
436 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
437 %fdiv = fdiv arcp afn half 1.0, %x
; Reciprocal (1.0 / %x), "ulp25" variant.
; NOTE(review): despite the name, the fdiv visible in this function carries no
; !fpmath metadata and no fast-math flags, and the expected instruction
; sequences below are the same as for the plain v_rcp_f16 case. Confirm
; whether an accuracy annotation was intended here.
441 define half @v_rcp_f16_ulp25(half %x) {
442 ; GFX6-IEEE-LABEL: v_rcp_f16_ulp25:
443 ; GFX6-IEEE: ; %bb.0:
444 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
446 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
447 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
448 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
449 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
450 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
451 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
452 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
453 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
454 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
455 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
456 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
457 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
458 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
459 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
461 ; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25:
462 ; GFX6-FLUSH: ; %bb.0:
463 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
464 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
465 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
466 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
467 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
468 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
469 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
470 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
471 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
472 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
473 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
474 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
475 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
476 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
477 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
478 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
479 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
480 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
481 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
483 ; GFX89-LABEL: v_rcp_f16_ulp25:
485 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486 ; GFX89-NEXT: v_rcp_f16_e32 v0, v0
487 ; GFX89-NEXT: s_setpc_b64 s[30:31]
489 ; GFX10PLUS-LABEL: v_rcp_f16_ulp25:
490 ; GFX10PLUS: ; %bb.0:
491 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
492 ; GFX10PLUS-NEXT: v_rcp_f16_e32 v0, v0
493 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
494 %fdiv = fdiv half 1.0, %x
; Scalar half fdiv with the afn flag, "ulp25" variant.
; NOTE(review): the fdiv visible in this function carries the afn flag but no
; !fpmath metadata, and the expected instruction sequences below are the same
; as for v_fdiv_f16_afn. Confirm whether an accuracy annotation was intended.
498 define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
499 ; GFX6-LABEL: v_fdiv_f16_afn_ulp25:
501 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
502 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
503 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
504 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1
505 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
506 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
507 ; GFX6-NEXT: s_setpc_b64 s[30:31]
509 ; GFX89-LABEL: v_fdiv_f16_afn_ulp25:
511 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512 ; GFX89-NEXT: v_rcp_f16_e32 v1, v1
513 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
514 ; GFX89-NEXT: s_setpc_b64 s[30:31]
516 ; GFX10-LABEL: v_fdiv_f16_afn_ulp25:
518 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
519 ; GFX10-NEXT: v_rcp_f16_e32 v1, v1
520 ; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
521 ; GFX10-NEXT: s_setpc_b64 s[30:31]
523 ; GFX11-LABEL: v_fdiv_f16_afn_ulp25:
525 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
526 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
527 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
528 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
529 ; GFX11-NEXT: s_setpc_b64 s[30:31]
530 %fdiv = fdiv afn half %a, %b
; Scalar half fdiv with the arcp flag, "ulp25" variant.
; Expected code per the checks below: tahiti uses the full f32 div_scale /
; fma / div_fmas / div_fixup expansion; fiji and newer use v_rcp_f16 followed
; by v_mul_f16 with no div_fixup.
; NOTE(review): the fdiv visible in this function carries no !fpmath metadata
; despite the name. Confirm whether an accuracy annotation was intended.
534 define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
535 ; GFX6-IEEE-LABEL: v_fdiv_f16_arcp_ulp25:
536 ; GFX6-IEEE: ; %bb.0:
537 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
539 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
540 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
541 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
542 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
543 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
544 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
545 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
546 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
547 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
548 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
549 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
550 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
551 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
552 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
554 ; GFX6-FLUSH-LABEL: v_fdiv_f16_arcp_ulp25:
555 ; GFX6-FLUSH: ; %bb.0:
556 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
557 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
558 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
559 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
560 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
561 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
562 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
563 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
564 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
565 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
566 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
567 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
568 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
569 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
570 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
571 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
572 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
573 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
574 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
576 ; GFX89-LABEL: v_fdiv_f16_arcp_ulp25:
578 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
579 ; GFX89-NEXT: v_rcp_f16_e32 v1, v1
580 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
581 ; GFX89-NEXT: s_setpc_b64 s[30:31]
583 ; GFX10-LABEL: v_fdiv_f16_arcp_ulp25:
585 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
586 ; GFX10-NEXT: v_rcp_f16_e32 v1, v1
587 ; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
588 ; GFX10-NEXT: s_setpc_b64 s[30:31]
590 ; GFX11-LABEL: v_fdiv_f16_arcp_ulp25:
592 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
593 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
594 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
595 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
596 ; GFX11-NEXT: s_setpc_b64 s[30:31]
597 %fdiv = fdiv arcp half %a, %b
; Two-element half vector fdiv with no fast-math flags.
; Expected code per the checks below:
;  - tahiti: the four half values arrive in v0..v3; each lane is converted to
;    f32 and divided with the div_scale / fma / div_fmas / div_fixup
;    expansion, one lane after the other. The preserve-sign run expects
;    s_setreg_imm32_b32 writes to HW_REG_MODE around each FMA chain.
;  - fiji and newer: the operands arrive packed in v0 and v1; the high halves
;    are extracted with a 16-bit right shift, each lane goes through an f32
;    reciprocal-multiply and v_div_fixup_f16, and the two results are
;    recombined (shift + or on fiji, v_pack_b32_f16 on the newer targets).
601 define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
602 ; GFX6-IEEE-LABEL: v_fdiv_v2f16:
603 ; GFX6-IEEE: ; %bb.0:
604 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
605 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
606 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
607 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
608 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
609 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
610 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
611 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
612 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
613 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
614 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
615 ; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
616 ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
617 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
618 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
619 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
620 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
621 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
622 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
623 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
624 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
625 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
626 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
627 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
628 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
629 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
630 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
631 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
632 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
633 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
635 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16:
636 ; GFX6-FLUSH: ; %bb.0:
637 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
639 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
640 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
641 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
642 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
643 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
644 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
645 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
646 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
647 ; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
648 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
649 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
650 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
651 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
652 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
653 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
654 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
655 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
656 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
657 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
658 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
659 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
660 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
661 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
662 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
663 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
664 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
665 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
666 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
667 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
668 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
669 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
670 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
671 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
673 ; GFX8-LABEL: v_fdiv_v2f16:
675 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
676 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
677 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
678 ; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
679 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
680 ; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0
681 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
682 ; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2
683 ; GFX8-NEXT: v_rcp_f32_e32 v5, v5
684 ; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3
685 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
686 ; GFX8-NEXT: v_mul_f32_e32 v5, v7, v5
687 ; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5
688 ; GFX8-NEXT: v_div_fixup_f16 v0, v3, v1, v0
689 ; GFX8-NEXT: v_div_fixup_f16 v1, v5, v4, v2
690 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
691 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
692 ; GFX8-NEXT: s_setpc_b64 s[30:31]
694 ; GFX9-IEEE-LABEL: v_fdiv_v2f16:
695 ; GFX9-IEEE: ; %bb.0:
696 ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1
698 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
699 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4
700 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
701 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0
702 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
703 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v2
704 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5
705 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v6, v3
706 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
707 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v5
708 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v5, v5
709 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0
710 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v5, v4, v2
711 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
712 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
714 ; GFX9-FLUSH-LABEL: v_fdiv_v2f16:
715 ; GFX9-FLUSH: ; %bb.0:
716 ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
717 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
718 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
719 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
720 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0
721 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
722 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
723 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
724 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0
725 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
726 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5
727 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
728 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
730 ; GFX10-LABEL: v_fdiv_v2f16:
732 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
733 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
734 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
735 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
736 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
737 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
738 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4
739 ; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
740 ; GFX10-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
741 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
742 ; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
743 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
744 ; GFX10-NEXT: s_setpc_b64 s[30:31]
746 ; GFX11-LABEL: v_fdiv_v2f16:
748 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
749 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
750 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
751 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
752 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
753 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
754 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4
755 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
756 ; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
757 ; GFX11-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
758 ; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
759 ; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
760 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
761 ; GFX11-NEXT: s_setpc_b64 s[30:31]
762 %fdiv = fdiv <2 x half> %a, %b
; Vector <2 x half> division carrying the 'afn' fast-math flag.
; The expected code is a reciprocal followed by a multiply per element,
; with no div_fixup step. GFX6 (tahiti) does this in f32 through
; conversions, GFX8, GFX9 and GFX10 use v_rcp_f16 / v_mul_f16 with SDWA
; selects for the high half, and GFX11 shifts the high halves out first.
; Every target is checked under its common prefix, so the same output is
; expected from both the ieee and the preserve-sign run configurations.
766 define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
767 ; GFX6-LABEL: v_fdiv_v2f16_afn:
769 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
770 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
771 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
772 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
773 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
774 ; GFX6-NEXT: v_rcp_f32_e32 v2, v2
775 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3
776 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
777 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
778 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
779 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
780 ; GFX6-NEXT: s_setpc_b64 s[30:31]
782 ; GFX8-LABEL: v_fdiv_v2f16_afn:
784 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
785 ; GFX8-NEXT: v_rcp_f16_e32 v2, v1
786 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
787 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
788 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
789 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
790 ; GFX8-NEXT: s_setpc_b64 s[30:31]
792 ; GFX9-LABEL: v_fdiv_v2f16_afn:
794 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
795 ; GFX9-NEXT: v_rcp_f16_e32 v2, v1
796 ; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
797 ; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
798 ; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
799 ; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
800 ; GFX9-NEXT: s_setpc_b64 s[30:31]
802 ; GFX10-LABEL: v_fdiv_v2f16_afn:
804 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
805 ; GFX10-NEXT: v_rcp_f16_e32 v2, v1
806 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
807 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
808 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
809 ; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
810 ; GFX10-NEXT: s_setpc_b64 s[30:31]
812 ; GFX11-LABEL: v_fdiv_v2f16_afn:
814 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
815 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
816 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
817 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
818 ; GFX11-NEXT: v_rcp_f16_e32 v2, v2
819 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
820 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
821 ; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
822 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
823 ; GFX11-NEXT: s_setpc_b64 s[30:31]
824 %fdiv = fdiv afn <2 x half> %a, %b
; Plain <2 x half> fdiv with no fast-math flags on the instruction.
; NOTE(review) the name suggests a 2.5 ulp accuracy hint, but no !fpmath
; metadata is visible on the fdiv in this excerpt - confirm it is intended.
; GFX6 (tahiti) expands each element to the f32 div_scale / rcp / fma /
; div_fmas / div_fixup sequence, and the preserve-sign run additionally
; brackets the fma chain with s_setreg_imm32_b32 writes to HW_REG_MODE
; (presumably toggling f32 denormal handling - verify against the ISA docs).
; GFX8 and the GFX9 ieee run take rcp in f32, multiply, convert back and
; finish with v_div_fixup_f16. The GFX9 preserve-sign run, GFX10 and GFX11
; use v_mad_mixlo_f16 / v_fma_mixlo_f16 in place of the separate multiply
; and conversion.
828 define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
829 ; GFX6-IEEE-LABEL: v_fdiv_v2f16_ulp25:
830 ; GFX6-IEEE: ; %bb.0:
831 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
832 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
833 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
834 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
835 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
836 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
837 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
838 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
839 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
840 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
841 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
842 ; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
843 ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
844 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
845 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
846 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
847 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
848 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
849 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
850 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
851 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
852 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
853 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
854 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
855 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
856 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
857 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
858 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
859 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
860 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
862 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
863 ; GFX6-FLUSH: ; %bb.0:
864 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
865 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
866 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
867 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
868 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
869 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
870 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
871 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
872 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
873 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
874 ; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
875 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
876 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
877 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
878 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
879 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
880 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
881 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
882 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
883 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
884 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
885 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
886 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
887 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
888 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
889 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
890 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
891 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
892 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
893 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
894 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
895 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
896 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
897 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
898 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
900 ; GFX8-LABEL: v_fdiv_v2f16_ulp25:
902 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
903 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
904 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
905 ; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
906 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
907 ; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0
908 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
909 ; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2
910 ; GFX8-NEXT: v_rcp_f32_e32 v5, v5
911 ; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3
912 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
913 ; GFX8-NEXT: v_mul_f32_e32 v5, v7, v5
914 ; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5
915 ; GFX8-NEXT: v_div_fixup_f16 v0, v3, v1, v0
916 ; GFX8-NEXT: v_div_fixup_f16 v1, v5, v4, v2
917 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
918 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
919 ; GFX8-NEXT: s_setpc_b64 s[30:31]
921 ; GFX9-IEEE-LABEL: v_fdiv_v2f16_ulp25:
922 ; GFX9-IEEE: ; %bb.0:
923 ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
924 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1
925 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
926 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4
927 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
928 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0
929 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
930 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v2
931 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5
932 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v6, v3
933 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
934 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v5
935 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v5, v5
936 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0
937 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v5, v4, v2
938 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
939 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
941 ; GFX9-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
942 ; GFX9-FLUSH: ; %bb.0:
943 ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
944 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
945 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
946 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
947 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0
948 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
949 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
950 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
951 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0
952 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
953 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5
954 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
955 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
957 ; GFX10-LABEL: v_fdiv_v2f16_ulp25:
959 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
960 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
961 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
962 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
963 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
964 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
965 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4
966 ; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
967 ; GFX10-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
968 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
969 ; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
970 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
971 ; GFX10-NEXT: s_setpc_b64 s[30:31]
973 ; GFX11-LABEL: v_fdiv_v2f16_ulp25:
975 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
976 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
977 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
978 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
979 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
980 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
981 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4
982 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
983 ; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
984 ; GFX11-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
985 ; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
986 ; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
987 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
988 ; GFX11-NEXT: s_setpc_b64 s[30:31]
989 %fdiv = fdiv <2 x half> %a, %b
; Reciprocal written as an fdiv of a splat of half 1.0 by %x, with no
; fast-math flags. The expected code is still the general division
; expansion with 1.0 as the numerator, not a bare reciprocal. GFX6 (tahiti)
; materializes the constant with v_cvt_f32_f16_e32 from 1.0 and runs the
; f32 div_scale / fma / div_fmas / div_fixup sequence per element. GFX8 and
; later pass 1.0 as the last v_div_fixup_f16 operand, and the mixlo
; variants take 1.0 as an inline multiplicand.
993 define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
994 ; GFX6-IEEE-LABEL: v_rcp_v2f16:
995 ; GFX6-IEEE: ; %bb.0:
996 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
997 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
998 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
999 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
1000 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
1001 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1002 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
1003 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1004 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1005 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1006 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1007 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1008 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1009 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1010 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
1011 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
1012 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1013 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
1014 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
1015 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1016 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1017 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1018 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1019 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1020 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1021 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1022 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
1023 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1024 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
1026 ; GFX6-FLUSH-LABEL: v_rcp_v2f16:
1027 ; GFX6-FLUSH: ; %bb.0:
1028 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1029 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
1030 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
1031 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
1032 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
1033 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
1034 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1035 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1036 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
1037 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
1038 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
1039 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
1040 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
1041 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1042 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1043 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1044 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
1045 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
1046 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1047 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
1048 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
1049 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
1050 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
1051 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1052 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
1053 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
1054 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
1055 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
1056 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
1057 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
1058 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1059 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
1060 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
1061 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
1062 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
1064 ; GFX8-LABEL: v_rcp_v2f16:
1066 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1067 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1068 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
1069 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
1070 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1071 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1
1072 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
1073 ; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
1074 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
1075 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
1076 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
1077 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
1078 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
1079 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1080 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
1081 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1083 ; GFX9-IEEE-LABEL: v_rcp_v2f16:
1084 ; GFX9-IEEE: ; %bb.0:
1085 ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1086 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1087 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
1088 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
1089 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1090 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
1091 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
1092 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
1093 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
1094 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1095 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
1096 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
1097 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
1098 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
1099 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
1101 ; GFX9-FLUSH-LABEL: v_rcp_v2f16:
1102 ; GFX9-FLUSH: ; %bb.0:
1103 ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1104 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
1105 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1106 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
1107 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
1108 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
1109 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
1110 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
1111 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
1112 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0
1113 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
1114 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
1116 ; GFX10-LABEL: v_rcp_v2f16:
1118 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1119 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1120 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
1121 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
1122 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
1123 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
1124 ; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
1125 ; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
1126 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
1127 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
1128 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
1129 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1131 ; GFX11-LABEL: v_rcp_v2f16:
1133 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1134 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1135 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
1136 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
1137 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
1138 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
1139 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1140 ; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
1141 ; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
1142 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
1143 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
1144 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
1145 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1146 %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
1147 ret <2 x half> %fdiv
; Negated reciprocal written as an fdiv of a splat of half -1.0 by %x,
; with no fast-math flags. The expected code has the same shape as the
; general division expansion, with -1.0 appearing as the numerator
; constant. GFX6 (tahiti) converts -1.0 with v_cvt_f32_f16_e32, and GFX8
; and later pass -1.0 as the last v_div_fixup_f16 operand. No separate
; negate instruction appears in any of the expected sequences.
1150 define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
1151 ; GFX6-IEEE-LABEL: v_neg_rcp_v2f16:
1152 ; GFX6-IEEE: ; %bb.0:
1153 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1154 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
1155 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
1156 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
1157 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
1158 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1159 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
1160 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1161 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1162 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1163 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1164 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1165 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1166 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1167 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
1168 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
1169 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1170 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
1171 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
1172 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1173 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1174 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1175 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1176 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1177 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1178 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1179 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
1180 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1181 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
1183 ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16:
1184 ; GFX6-FLUSH: ; %bb.0:
1185 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1186 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
1187 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
1188 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
1189 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
1190 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
1191 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1192 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1193 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
1194 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
1195 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
1196 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
1197 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
1198 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1199 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1200 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
1201 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
1202 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
1203 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1204 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
1205 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
1206 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
1207 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
1208 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1209 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
1210 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
1211 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
1212 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
1213 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
1214 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
1215 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1216 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
1217 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
1218 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
1219 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
1221 ; GFX8-LABEL: v_neg_rcp_v2f16:
1223 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1224 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1225 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
1226 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
1227 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
1228 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1
1229 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
1230 ; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
1231 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
1232 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
1233 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
1234 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
1235 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
1236 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1237 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
1238 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1240 ; GFX9-IEEE-LABEL: v_neg_rcp_v2f16:
1241 ; GFX9-IEEE: ; %bb.0:
1242 ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1243 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1244 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
1245 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
1246 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
1247 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
1248 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
1249 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
1250 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
1251 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1252 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
1253 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
1254 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
1255 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
1256 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
1258 ; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16:
1259 ; GFX9-FLUSH: ; %bb.0:
1260 ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1261 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
1262 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1263 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
1264 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
1265 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
1266 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0]
1267 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
1268 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0]
1269 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, -1.0
1270 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
1271 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
1273 ; GFX10-LABEL: v_neg_rcp_v2f16:
1275 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1276 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1277 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
1278 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
1279 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
1280 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
1281 ; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
1282 ; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
1283 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
1284 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
1285 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
1286 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1288 ; GFX11-LABEL: v_neg_rcp_v2f16:
1290 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1291 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1292 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
1293 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
1294 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
1295 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
1296 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1297 ; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
1298 ; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
1299 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
1300 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
1301 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
1302 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1303 %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x
1304 ret <2 x half> %fdiv
; Reciprocal of the absolute value, written as llvm.fabs.v2f16 on %x
; followed by an fdiv of a splat of half 1.0 by the result.
; In the expected code the fabs shows up as a v_and_b32 with 0x7fff7fff
; on the packed value. On GFX6 (tahiti) the two halves arrive in separate
; registers, so they are first combined (shift, mask, or), masked, and then
; split again with a 16-bit right shift before the f32 division expansion.
; GFX8 and later apply the mask directly to v0 and then follow the same
; rcp / multiply / v_div_fixup_f16 shape, with 1.0 as the numerator.
1307 define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
1308 ; GFX6-IEEE-LABEL: v_rcp_v2f16_fabs:
1309 ; GFX6-IEEE: ; %bb.0:
1310 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1311 ; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1312 ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
1313 ; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
1314 ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1315 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
1316 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
1317 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1318 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
1319 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1320 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
1321 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1322 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1323 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1324 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1325 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1326 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1327 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
1328 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
1329 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
1330 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
1331 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
1332 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
1333 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
1334 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
1335 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
1336 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
1337 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
1338 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
1339 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
1340 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
1341 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
1342 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1343 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
1345 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_fabs:
1346 ; GFX6-FLUSH: ; %bb.0:
1347 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1348 ; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1349 ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
1350 ; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
1351 ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1352 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
1353 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
1354 ; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1355 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
1356 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
1357 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
1358 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1359 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1360 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
1361 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
1362 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
1363 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
1364 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
1365 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1366 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1367 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1368 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
1369 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
1370 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1371 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
1372 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
1373 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
1374 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
1375 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1376 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
1377 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
1378 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
1379 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
1380 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
1381 ; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
1382 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1383 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
1384 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
1385 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
1386 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
1388 ; GFX8-LABEL: v_rcp_v2f16_fabs:
1390 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1391 ; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1392 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1393 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
1394 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
1395 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1396 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1
1397 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
1398 ; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
1399 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
1400 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
1401 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
1402 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
1403 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
1404 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1405 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
1406 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1408 ; GFX9-IEEE-LABEL: v_rcp_v2f16_fabs:
1409 ; GFX9-IEEE: ; %bb.0:
1410 ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1411 ; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1412 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1413 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
1414 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
1415 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1416 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
1417 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
1418 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
1419 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
1420 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1421 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
1422 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
1423 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
1424 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
1425 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
1427 ; GFX9-FLUSH-LABEL: v_rcp_v2f16_fabs:
1428 ; GFX9-FLUSH: ; %bb.0:
1429 ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1430 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1431 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
1432 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1433 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
1434 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
1435 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
1436 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
1437 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
1438 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
1439 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0
1440 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
1441 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
1443 ; GFX10-LABEL: v_rcp_v2f16_fabs:
1445 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1446 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1447 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1448 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
1449 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
1450 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
1451 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
1452 ; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
1453 ; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
1454 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
1455 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
1456 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
1457 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1459 ; GFX11-LABEL: v_rcp_v2f16_fabs:
1461 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1462 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1463 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1464 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
1465 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
1466 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
1467 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
1468 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1469 ; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
1470 ; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
1471 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
1472 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
1473 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
1474 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1475 %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
1476 %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x.fabs
1477 ret <2 x half> %fdiv
; Negated reciprocal of an absolute value on <2 x half>: each lane computes
; -1.0 / fabs(x), and the fdiv carries no fast-math flags.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand. Check prefixes are bound to targets by the RUN lines.
; Expected shape of the output:
;  - the GFX6 prefixes repack the two halves, mask with 0x7fff7fff, convert to
;    f32 and run a v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup
;    sequence per lane; the FLUSH variant additionally brackets the FMAs with
;    s_setreg_imm32_b32 writes to HW_REG_MODE (presumably switching denormal
;    handling -- confirm against the backend);
;  - GFX8 and newer mask with 0x7fff7fff, take v_rcp_f32 of each lane, scale
;    by -1.0, and finish with v_div_fixup_f16 whose last operand is -1.0.
1480 define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
1481 ; GFX6-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
1482 ; GFX6-IEEE: ; %bb.0:
1483 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1484 ; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1485 ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
1486 ; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
1487 ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1488 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
1489 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
1490 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1491 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
1492 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1493 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
1494 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1495 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1496 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1497 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1498 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1499 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1500 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
1501 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
1502 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
1503 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
1504 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
1505 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
1506 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
1507 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
1508 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
1509 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
1510 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
1511 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
1512 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
1513 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
1514 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
1515 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1516 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
1518 ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
1519 ; GFX6-FLUSH: ; %bb.0:
1520 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1521 ; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1522 ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
1523 ; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
1524 ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1525 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
1526 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
1527 ; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1528 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
1529 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
1530 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
1531 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1532 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1533 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
1534 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
1535 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
1536 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
1537 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
1538 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1539 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1540 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
1541 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
1542 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
1543 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1544 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
1545 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
1546 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
1547 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
1548 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1549 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
1550 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
1551 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
1552 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
1553 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
1554 ; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
1555 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1556 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
1557 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
1558 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
1559 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
1561 ; GFX8-LABEL: v_neg_rcp_v2f16_fabs:
1563 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1564 ; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1565 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1566 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
1567 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
1568 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
1569 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1
1570 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
1571 ; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
1572 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
1573 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
1574 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
1575 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
1576 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
1577 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1578 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
1579 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1581 ; GFX9-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
1582 ; GFX9-IEEE: ; %bb.0:
1583 ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1584 ; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1585 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1586 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
1587 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
1588 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
1589 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
1590 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
1591 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
1592 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
1593 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1594 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
1595 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
1596 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
1597 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
1598 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
1600 ; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
1601 ; GFX9-FLUSH: ; %bb.0:
1602 ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1603 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1604 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
1605 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1606 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
1607 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
1608 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
1609 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0]
1610 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
1611 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0]
1612 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, -1.0
1613 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
1614 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
1616 ; GFX10-LABEL: v_neg_rcp_v2f16_fabs:
1618 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1619 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1620 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1621 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
1622 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
1623 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
1624 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
1625 ; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
1626 ; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
1627 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
1628 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
1629 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
1630 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1632 ; GFX11-LABEL: v_neg_rcp_v2f16_fabs:
1634 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1635 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
1636 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1637 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
1638 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
1639 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
1640 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
1641 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1642 ; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
1643 ; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
1644 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
1645 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
1646 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
1647 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: lane-wise fabs feeding an unflagged fdiv with a -1.0 splat
; as the numerator.
1648 %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
1649 %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x.fabs
1650 ret <2 x half> %fdiv
; Reciprocal 1.0 / x on <2 x half> where the fdiv carries only the `arcp`
; fast-math flag.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand. Check prefixes are bound to targets by the RUN lines.
; Expected shape of the output:
;  - the GFX6 prefixes still expect the per-lane f32 expansion
;    (v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup); the FLUSH
;    variant adds s_setreg_imm32_b32 writes to HW_REG_MODE around the FMAs;
;  - GFX8 and newer expect one v_rcp_f16 per lane followed by a repack
;    (v_or_b32 on GFX8, v_pack_b32_f16 on GFX9 and later).
1653 define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
1654 ; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
1655 ; GFX6-IEEE: ; %bb.0:
1656 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1657 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
1658 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
1659 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
1660 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
1661 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1662 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
1663 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1664 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1665 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1666 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1667 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1668 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1669 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1670 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
1671 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
1672 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1673 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
1674 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
1675 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1676 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1677 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1678 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1679 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1680 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1681 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1682 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
1683 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1684 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
1686 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
1687 ; GFX6-FLUSH: ; %bb.0:
1688 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1689 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
1690 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
1691 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
1692 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
1693 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
1694 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1695 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1696 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
1697 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
1698 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
1699 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
1700 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
1701 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1702 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1703 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1704 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
1705 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
1706 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1707 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
1708 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
1709 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
1710 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
1711 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1712 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
1713 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
1714 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
1715 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
1716 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
1717 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
1718 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1719 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
1720 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
1721 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
1722 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
1724 ; GFX8-LABEL: v_rcp_v2f16_arcp:
1726 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1727 ; GFX8-NEXT: v_rcp_f16_e32 v1, v0
1728 ; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
1729 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
1730 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1732 ; GFX9-LABEL: v_rcp_v2f16_arcp:
1734 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1735 ; GFX9-NEXT: v_rcp_f16_e32 v1, v0
1736 ; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1737 ; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
1738 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1740 ; GFX10-LABEL: v_rcp_v2f16_arcp:
1742 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1743 ; GFX10-NEXT: v_rcp_f16_e32 v1, v0
1744 ; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1745 ; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
1746 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1748 ; GFX11-LABEL: v_rcp_v2f16_arcp:
1750 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1751 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1752 ; GFX11-NEXT: v_rcp_f16_e32 v0, v0
1753 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
1754 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1755 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
1756 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: 1.0 splat divided by %x with the arcp flag only.
1757 %fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x
1758 ret <2 x half> %fdiv
; Reciprocal 1.0 / x on <2 x half> where the fdiv carries both the `arcp` and
; `afn` fast-math flags.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand. Check prefixes are bound to targets by the RUN lines.
; Expected shape of the output:
;  - GFX6 uses one shared prefix here (no IEEE/FLUSH split) and expects a
;    v_rcp_f32 per lane multiplied by a converted 1.0, with no v_div_scale /
;    v_div_fixup sequence;
;  - GFX8 and newer expect one v_rcp_f16 per lane followed by a repack
;    (v_or_b32 on GFX8, v_pack_b32_f16 on GFX9 and later).
1761 define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
1762 ; GFX6-LABEL: v_rcp_v2f16_arcp_afn:
1764 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1765 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
1766 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
1767 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
1768 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0
1769 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1
1770 ; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
1771 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
1772 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
1773 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
1774 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1776 ; GFX8-LABEL: v_rcp_v2f16_arcp_afn:
1778 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1779 ; GFX8-NEXT: v_rcp_f16_e32 v1, v0
1780 ; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
1781 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
1782 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1784 ; GFX9-LABEL: v_rcp_v2f16_arcp_afn:
1786 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1787 ; GFX9-NEXT: v_rcp_f16_e32 v1, v0
1788 ; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1789 ; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
1790 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1792 ; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
1794 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1795 ; GFX10-NEXT: v_rcp_f16_e32 v1, v0
1796 ; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1797 ; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
1798 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1800 ; GFX11-LABEL: v_rcp_v2f16_arcp_afn:
1802 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1803 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1804 ; GFX11-NEXT: v_rcp_f16_e32 v0, v0
1805 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
1806 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1807 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
1808 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: 1.0 splat divided by %x with both arcp and afn set.
1809 %fdiv = fdiv arcp afn <2 x half> <half 1.0, half 1.0>, %x
1810 ret <2 x half> %fdiv
; Reciprocal 1.0 / x on <2 x half> using a plain fdiv with no fast-math
; flags.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand. Check prefixes are bound to targets by the RUN lines.
; Expected shape of the output:
;  - the GFX6 prefixes expect the per-lane f32 expansion
;    (v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup); the FLUSH
;    variant adds s_setreg_imm32_b32 writes to HW_REG_MODE around the FMAs;
;  - GFX8 and newer expect v_rcp_f32 per lane, a multiply by 1.0 (separate
;    v_mul_f32 + v_cvt_f16_f32, or a fused v_mad_mixlo_f16 / v_fma_mixlo_f16),
;    then v_div_fixup_f16 and a repack.
1813 define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
1814 ; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
1815 ; GFX6-IEEE: ; %bb.0:
1816 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1817 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
1818 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
1819 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
1820 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
1821 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1822 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
1823 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1824 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1825 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1826 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1827 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1828 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1829 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1830 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
1831 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
1832 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1833 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
1834 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
1835 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1836 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1837 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1838 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1839 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1840 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1841 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1842 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
1843 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1844 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
1846 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
1847 ; GFX6-FLUSH: ; %bb.0:
1848 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1849 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
1850 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
1851 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
1852 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
1853 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
1854 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1855 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1856 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
1857 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
1858 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
1859 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
1860 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
1861 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1862 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1863 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1864 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
1865 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
1866 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1867 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
1868 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
1869 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
1870 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
1871 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1872 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
1873 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
1874 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
1875 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
1876 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
1877 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
1878 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1879 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
1880 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
1881 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
1882 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
1884 ; GFX8-LABEL: v_rcp_v2f16_ulp25:
1886 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1887 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1888 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
1889 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
1890 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1891 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1
1892 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
1893 ; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
1894 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
1895 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
1896 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
1897 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
1898 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
1899 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1900 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
1901 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1903 ; GFX9-IEEE-LABEL: v_rcp_v2f16_ulp25:
1904 ; GFX9-IEEE: ; %bb.0:
1905 ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1906 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1907 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
1908 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
1909 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1910 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
1911 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
1912 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
1913 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
1914 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1915 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
1916 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
1917 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
1918 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
1919 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
1921 ; GFX9-FLUSH-LABEL: v_rcp_v2f16_ulp25:
1922 ; GFX9-FLUSH: ; %bb.0:
1923 ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1924 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
1925 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1926 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
1927 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
1928 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
1929 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
1930 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
1931 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
1932 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0
1933 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
1934 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
1936 ; GFX10-LABEL: v_rcp_v2f16_ulp25:
1938 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1939 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1940 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
1941 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
1942 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
1943 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
1944 ; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
1945 ; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
1946 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
1947 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
1948 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
1949 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1951 ; GFX11-LABEL: v_rcp_v2f16_ulp25:
1953 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1954 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1955 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
1956 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
1957 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
1958 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
1959 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1960 ; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
1961 ; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
1962 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
1963 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
1964 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
1965 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): despite the _ulp25 suffix, the fdiv below carries no !fpmath
; metadata and no fast-math flags in this view -- confirm that is intended.
1966 %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
1967 ret <2 x half> %fdiv
; General division a / b on <2 x half> where the fdiv carries only the `afn`
; fast-math flag.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand. Check prefixes are bound to targets by the RUN lines.
; Expected shape of the output:
;  - GFX6 uses one shared prefix here (no IEEE/FLUSH split) and expects, per
;    lane, v_rcp_f32 of the divisor followed by v_mul_f32 with the dividend;
;  - GFX8 and newer expect v_rcp_f16 of each divisor lane, a v_mul_f16 with
;    the matching dividend lane, and a repack (v_or_b32 on GFX8,
;    v_pack_b32_f16 on GFX9 and later).
1970 define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
1971 ; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
1973 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1974 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
1975 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
1976 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
1977 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
1978 ; GFX6-NEXT: v_rcp_f32_e32 v2, v2
1979 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3
1980 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
1981 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
1982 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
1983 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
1984 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1986 ; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
1988 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1989 ; GFX8-NEXT: v_rcp_f16_e32 v2, v1
1990 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1991 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
1992 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1993 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
1994 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1996 ; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
1998 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1999 ; GFX9-NEXT: v_rcp_f16_e32 v2, v1
2000 ; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2001 ; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
2002 ; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2003 ; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
2004 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2006 ; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
2008 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2009 ; GFX10-NEXT: v_rcp_f16_e32 v2, v1
2010 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2011 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
2012 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2013 ; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
2014 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2016 ; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
2018 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2019 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
2020 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
2021 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2022 ; GFX11-NEXT: v_rcp_f16_e32 v2, v2
2023 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2024 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
2025 ; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
2026 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
2027 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): despite the _ulp25 suffix, the fdiv below carries no !fpmath
; metadata in this view, only the afn flag -- confirm that is intended.
2028 %fdiv = fdiv afn <2 x half> %a, %b
2029 ret <2 x half> %fdiv
; General division a / b on <2 x half> where the fdiv carries only the `arcp`
; fast-math flag.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand. Check prefixes are bound to targets by the RUN lines.
; Expected shape of the output:
;  - the GFX6 prefixes expect the per-lane f32 expansion
;    (v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup); the FLUSH
;    variant adds s_setreg_imm32_b32 writes to HW_REG_MODE around the FMAs;
;  - GFX8 and newer expect v_rcp_f16 of each divisor lane, a v_mul_f16 with
;    the matching dividend lane, and a repack (v_or_b32 on GFX8,
;    v_pack_b32_f16 on GFX9 and later).
2032 define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
2033 ; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25:
2034 ; GFX6-IEEE: ; %bb.0:
2035 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2036 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
2037 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
2038 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
2039 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
2040 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
2041 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
2042 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
2043 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
2044 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
2045 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
2046 ; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
2047 ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
2048 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
2049 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
2050 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
2051 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
2052 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
2053 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
2054 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2055 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
2056 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
2057 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
2058 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
2059 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
2060 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
2061 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
2062 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
2063 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
2064 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
2066 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
2067 ; GFX6-FLUSH: ; %bb.0:
2068 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2069 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
2070 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
2071 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
2072 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
2073 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
2074 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2075 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
2076 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
2077 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
2078 ; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
2079 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
2080 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
2081 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2082 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
2083 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
2084 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
2085 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
2086 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2087 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
2088 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
2089 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
2090 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
2091 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2092 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
2093 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
2094 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
2095 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
2096 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
2097 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
2098 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2099 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
2100 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
2101 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
2102 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
2104 ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
2106 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2107 ; GFX8-NEXT: v_rcp_f16_e32 v2, v1
2108 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2109 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
2110 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2111 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
2112 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2114 ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
2116 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2117 ; GFX9-NEXT: v_rcp_f16_e32 v2, v1
2118 ; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2119 ; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
2120 ; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2121 ; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
2122 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2124 ; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
2126 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2127 ; GFX10-NEXT: v_rcp_f16_e32 v2, v1
2128 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2129 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
2130 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2131 ; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
2132 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2134 ; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25:
2136 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2137 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
2138 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
2139 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2140 ; GFX11-NEXT: v_rcp_f16_e32 v2, v2
2141 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2142 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
2143 ; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
2144 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
2145 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): despite the _ulp25 suffix, the fdiv below carries no !fpmath
; metadata in this view, only the arcp flag -- confirm that is intended.
2146 %fdiv = fdiv arcp <2 x half> %a, %b
2147 ret <2 x half> %fdiv
; <2 x half> fdiv carrying both the afn and arcp fast-math flags, with operands
; passed as ordinary (non-inreg) arguments.
; Every checked target is expected to lower each element to a reciprocal
; followed by a multiply: tahiti does so in f32 (v_rcp_f32 + v_mul_f32 between
; f16/f32 conversions), the later targets use v_rcp_f16 + v_mul_f16.
; NOTE(review): the fdiv below carries no !fpmath metadata despite the _ulp25
; suffix in the function name - confirm this is intended.
; The assertions are autogenerated (see the file header); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2150 define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
2151 ; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
2153 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2154 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
2155 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
2156 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
2157 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
2158 ; GFX6-NEXT: v_rcp_f32_e32 v2, v2
2159 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3
2160 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
2161 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
2162 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
2163 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
2164 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2166 ; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
2168 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2169 ; GFX8-NEXT: v_rcp_f16_e32 v2, v1
2170 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2171 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
2172 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2173 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
2174 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2176 ; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
2178 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2179 ; GFX9-NEXT: v_rcp_f16_e32 v2, v1
2180 ; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2181 ; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
2182 ; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2183 ; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
2184 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2186 ; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
2188 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2189 ; GFX10-NEXT: v_rcp_f16_e32 v2, v1
2190 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2191 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
2192 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2193 ; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
2194 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2196 ; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
2198 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2199 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
2200 ; GFX11-NEXT: v_rcp_f16_e32 v1, v1
2201 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
2202 ; GFX11-NEXT: v_rcp_f16_e32 v2, v2
2203 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2204 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
2205 ; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
2206 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
2207 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2208 %fdiv = fdiv afn arcp <2 x half> %a, %b
2209 ret <2 x half> %fdiv
; Scalar-input half fdiv with no fast-math flags, in an amdgpu_ps function.
; Both operands arrive as i16 inreg values and are bitcast to half; the quotient
; is bitcast back to i16. In every expected sequence the arithmetic is done in
; VGPRs and the result is copied back to s0 with v_readfirstlane_b32.
; Expected lowering per -mcpu:
;   tahiti            - conversion to f32 and the full v_div_scale_f32 /
;                       v_div_fmas_f32 / v_div_fixup_f32 expansion; the
;                       preserve-sign variant additionally expects
;                       s_setreg_imm32_b32 writes to HW_REG_MODE around the
;                       FMA chain and before the final conversion.
;   fiji, gfx900 ieee - v_rcp_f32 + v_mul_f32, converted back to f16 and then
;                       corrected with v_div_fixup_f16.
;   gfx900 flush      - v_rcp_f32 + v_mad_mixlo_f16, then v_div_fixup_f16.
;   gfx1010, gfx1100  - v_rcp_f32 + v_fma_mixlo_f16, then v_div_fixup_f16.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2212 define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) {
2213 ; GFX6-IEEE-LABEL: s_fdiv_f16:
2214 ; GFX6-IEEE: ; %bb.0:
2215 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
2216 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s1
2217 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
2218 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
2219 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
2220 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2221 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
2222 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
2223 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
2224 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
2225 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
2226 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2227 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
2228 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2229 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
2230 ; GFX6-IEEE-NEXT: ; return to shader part epilog
2232 ; GFX6-FLUSH-LABEL: s_fdiv_f16:
2233 ; GFX6-FLUSH: ; %bb.0:
2234 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
2235 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s1
2236 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
2237 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
2238 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
2239 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2240 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2241 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
2242 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
2243 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
2244 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
2245 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
2246 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2247 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2248 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
2249 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2250 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
2251 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
2252 ; GFX6-FLUSH-NEXT: ; return to shader part epilog
2254 ; GFX8-LABEL: s_fdiv_f16:
2256 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, s1
2257 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s0
2258 ; GFX8-NEXT: v_rcp_f32_e32 v0, v0
2259 ; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0
2260 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
2261 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2262 ; GFX8-NEXT: v_div_fixup_f16 v0, v0, v1, s0
2263 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2264 ; GFX8-NEXT: ; return to shader part epilog
2266 ; GFX9-IEEE-LABEL: s_fdiv_f16:
2267 ; GFX9-IEEE: ; %bb.0:
2268 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1
2269 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
2270 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v0, v0
2271 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0
2272 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2273 ; GFX9-IEEE-NEXT: v_mov_b32_e32 v1, s1
2274 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v0, v1, s0
2275 ; GFX9-IEEE-NEXT: v_readfirstlane_b32 s0, v0
2276 ; GFX9-IEEE-NEXT: ; return to shader part epilog
2278 ; GFX9-FLUSH-LABEL: s_fdiv_f16:
2279 ; GFX9-FLUSH: ; %bb.0:
2280 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1
2281 ; GFX9-FLUSH-NEXT: v_mov_b32_e32 v1, s1
2282 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
2283 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
2284 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v1, s0
2285 ; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
2286 ; GFX9-FLUSH-NEXT: ; return to shader part epilog
2288 ; GFX10-LABEL: s_fdiv_f16:
2290 ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1
2291 ; GFX10-NEXT: v_rcp_f32_e32 v0, v0
2292 ; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
2293 ; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
2294 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2295 ; GFX10-NEXT: ; return to shader part epilog
2297 ; GFX11-LABEL: s_fdiv_f16:
2299 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1
2300 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0
2301 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2302 ; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
2303 ; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
2304 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
2305 ; GFX11-NEXT: ; return to shader part epilog
2306 %a = bitcast i16 %a.arg to half
2307 %b = bitcast i16 %b.arg to half
2308 %fdiv = fdiv half %a, %b
2309 %result = bitcast half %fdiv to i16
; Scalar-input half fdiv carrying only the arcp fast-math flag, in an amdgpu_ps
; function. Operands arrive as i16 inreg values and are bitcast to half.
; Expected lowering per -mcpu:
;   tahiti          - unchanged from the flag-free case: conversion to f32 and
;                     the full v_div_scale_f32 / v_div_fmas_f32 /
;                     v_div_fixup_f32 expansion (plus s_setreg_imm32_b32 writes
;                     to HW_REG_MODE in the preserve-sign variant).
;   fiji and later  - a single v_rcp_f16 of the divisor followed by v_mul_f16;
;                     fiji and gfx900 share one set of assertions here, and
;                     gfx1100 additionally expects s_waitcnt_depctr before the
;                     multiply.
; The result is copied back to s0 with v_readfirstlane_b32 in every case.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2313 define amdgpu_ps i16 @s_fdiv_f16_arcp(i16 inreg %a.arg, i16 inreg %b.arg) {
2314 ; GFX6-IEEE-LABEL: s_fdiv_f16_arcp:
2315 ; GFX6-IEEE: ; %bb.0:
2316 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
2317 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s1
2318 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
2319 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
2320 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
2321 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2322 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
2323 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
2324 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
2325 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
2326 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
2327 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2328 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
2329 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2330 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
2331 ; GFX6-IEEE-NEXT: ; return to shader part epilog
2333 ; GFX6-FLUSH-LABEL: s_fdiv_f16_arcp:
2334 ; GFX6-FLUSH: ; %bb.0:
2335 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
2336 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s1
2337 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
2338 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
2339 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
2340 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2341 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2342 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
2343 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
2344 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
2345 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
2346 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
2347 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2348 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2349 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
2350 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2351 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
2352 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
2353 ; GFX6-FLUSH-NEXT: ; return to shader part epilog
2355 ; GFX89-LABEL: s_fdiv_f16_arcp:
2357 ; GFX89-NEXT: v_rcp_f16_e32 v0, s1
2358 ; GFX89-NEXT: v_mul_f16_e32 v0, s0, v0
2359 ; GFX89-NEXT: v_readfirstlane_b32 s0, v0
2360 ; GFX89-NEXT: ; return to shader part epilog
2362 ; GFX10-LABEL: s_fdiv_f16_arcp:
2364 ; GFX10-NEXT: v_rcp_f16_e32 v0, s1
2365 ; GFX10-NEXT: v_mul_f16_e32 v0, s0, v0
2366 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2367 ; GFX10-NEXT: ; return to shader part epilog
2369 ; GFX11-LABEL: s_fdiv_f16_arcp:
2371 ; GFX11-NEXT: v_rcp_f16_e32 v0, s1
2372 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2373 ; GFX11-NEXT: v_mul_f16_e32 v0, s0, v0
2374 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
2375 ; GFX11-NEXT: ; return to shader part epilog
2376 %a = bitcast i16 %a.arg to half
2377 %b = bitcast i16 %b.arg to half
2378 %fdiv = fdiv arcp half %a, %b
2379 %result = bitcast half %fdiv to i16
; Scalar-input half fdiv carrying only the afn fast-math flag, in an amdgpu_ps
; function. Operands arrive as i16 inreg values and are bitcast to half.
; Every checked target is expected to emit just a reciprocal and a multiply:
;   tahiti          - v_rcp_f32 + v_mul_f32 between f16/f32 conversions; the
;                     ieee and preserve-sign variants share one set of
;                     assertions.
;   fiji and later  - v_rcp_f16 + v_mul_f16 (gfx1100 additionally expects
;                     s_waitcnt_depctr before the multiply).
; The result is copied back to s0 with v_readfirstlane_b32 in every case.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2383 define amdgpu_ps i16 @s_fdiv_f16_afn(i16 inreg %a.arg, i16 inreg %b.arg) {
2384 ; GFX6-LABEL: s_fdiv_f16_afn:
2386 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s1
2387 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s0
2388 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0
2389 ; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
2390 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
2391 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2392 ; GFX6-NEXT: ; return to shader part epilog
2394 ; GFX89-LABEL: s_fdiv_f16_afn:
2396 ; GFX89-NEXT: v_rcp_f16_e32 v0, s1
2397 ; GFX89-NEXT: v_mul_f16_e32 v0, s0, v0
2398 ; GFX89-NEXT: v_readfirstlane_b32 s0, v0
2399 ; GFX89-NEXT: ; return to shader part epilog
2401 ; GFX10-LABEL: s_fdiv_f16_afn:
2403 ; GFX10-NEXT: v_rcp_f16_e32 v0, s1
2404 ; GFX10-NEXT: v_mul_f16_e32 v0, s0, v0
2405 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2406 ; GFX10-NEXT: ; return to shader part epilog
2408 ; GFX11-LABEL: s_fdiv_f16_afn:
2410 ; GFX11-NEXT: v_rcp_f16_e32 v0, s1
2411 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2412 ; GFX11-NEXT: v_mul_f16_e32 v0, s0, v0
2413 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
2414 ; GFX11-NEXT: ; return to shader part epilog
2415 %a = bitcast i16 %a.arg to half
2416 %b = bitcast i16 %b.arg to half
2417 %fdiv = fdiv afn half %a, %b
2418 %result = bitcast half %fdiv to i16
; Scalar-input <2 x half> fdiv with no fast-math flags, in an amdgpu_ps
; function. Each operand arrives as one i32 inreg value that is bitcast to
; <2 x half>; the quotient vector is bitcast back to i32.
; In the expected code the high element of each operand is extracted with
; s_lshr_b32 by 16 and the two lanes are divided independently:
;   tahiti            - each lane goes through conversion to f32 and the full
;                       v_div_scale_f32 / v_div_fmas_f32 / v_div_fixup_f32
;                       expansion (with s_setreg_imm32_b32 writes to
;                       HW_REG_MODE in the preserve-sign variant).
;   fiji, gfx900 ieee - per lane v_rcp_f32 + v_mul_f32, conversion back to f16,
;                       then v_div_fixup_f16.
;   gfx900 flush      - per lane v_rcp_f32 + v_mad_mixlo_f16, then
;                       v_div_fixup_f16.
;   gfx1010, gfx1100  - per lane v_rcp_f32 + v_fma_mixlo_f16, then
;                       v_div_fixup_f16.
; The two results are recombined with v_lshlrev_b32 + v_or_b32 (tahiti, fiji)
; or v_pack_b32_f16 (gfx900 and later) and copied to s0 with
; v_readfirstlane_b32.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2422 define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
2423 ; GFX6-IEEE-LABEL: s_fdiv_v2f16:
2424 ; GFX6-IEEE: ; %bb.0:
2425 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
2426 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s1
2427 ; GFX6-IEEE-NEXT: s_lshr_b32 s0, s0, 16
2428 ; GFX6-IEEE-NEXT: s_lshr_b32 s1, s1, 16
2429 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
2430 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
2431 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
2432 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2433 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
2434 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
2435 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
2436 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
2437 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
2438 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, s0
2439 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v6, s1
2440 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2441 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
2442 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2443 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v6, v6, v4
2444 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v3
2445 ; GFX6-IEEE-NEXT: v_div_scale_f32 v1, vcc, v4, v6, v4
2446 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, 1.0
2447 ; GFX6-IEEE-NEXT: v_fma_f32 v2, v2, v5, v5
2448 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v1, v2
2449 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v5, v1
2450 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v2, v5
2451 ; GFX6-IEEE-NEXT: v_fma_f32 v1, -v3, v5, v1
2452 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
2453 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v6, v4
2454 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
2455 ; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2456 ; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
2457 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
2458 ; GFX6-IEEE-NEXT: ; return to shader part epilog
2460 ; GFX6-FLUSH-LABEL: s_fdiv_v2f16:
2461 ; GFX6-FLUSH: ; %bb.0:
2462 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
2463 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s1
2464 ; GFX6-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
2465 ; GFX6-FLUSH-NEXT: s_lshr_b32 s1, s1, 16
2466 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
2467 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
2468 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
2469 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2470 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2471 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
2472 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
2473 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
2474 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
2475 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
2476 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2477 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2478 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, s0
2479 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s1
2480 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
2481 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2482 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
2483 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v4, v4, v3
2484 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
2485 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v3, v4, v3
2486 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2487 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
2488 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
2489 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
2490 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v5
2491 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
2492 ; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v5
2493 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2494 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
2495 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v4, v3
2496 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
2497 ; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2498 ; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
2499 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
2500 ; GFX6-FLUSH-NEXT: ; return to shader part epilog
2502 ; GFX8-LABEL: s_fdiv_v2f16:
2504 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, s1
2505 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
2506 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s3
2507 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
2508 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, s0
2509 ; GFX8-NEXT: v_rcp_f32_e32 v0, v0
2510 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, s2
2511 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1
2512 ; GFX8-NEXT: v_mul_f32_e32 v0, v2, v0
2513 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
2514 ; GFX8-NEXT: v_mul_f32_e32 v1, v3, v1
2515 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
2516 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
2517 ; GFX8-NEXT: v_div_fixup_f16 v0, v0, v2, s0
2518 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
2519 ; GFX8-NEXT: v_div_fixup_f16 v1, v1, v2, s2
2520 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2521 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
2522 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2523 ; GFX8-NEXT: ; return to shader part epilog
2525 ; GFX9-IEEE-LABEL: s_fdiv_v2f16:
2526 ; GFX9-IEEE: ; %bb.0:
2527 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1
2528 ; GFX9-IEEE-NEXT: s_lshr_b32 s3, s1, 16
2529 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s3
2530 ; GFX9-IEEE-NEXT: s_lshr_b32 s2, s0, 16
2531 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, s0
2532 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v0, v0
2533 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, s2
2534 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
2535 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0
2536 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2537 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1
2538 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
2539 ; GFX9-IEEE-NEXT: v_mov_b32_e32 v2, s1
2540 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v0, v2, s0
2541 ; GFX9-IEEE-NEXT: v_mov_b32_e32 v2, s3
2542 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v1, v2, s2
2543 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
2544 ; GFX9-IEEE-NEXT: v_readfirstlane_b32 s0, v0
2545 ; GFX9-IEEE-NEXT: ; return to shader part epilog
2547 ; GFX9-FLUSH-LABEL: s_fdiv_v2f16:
2548 ; GFX9-FLUSH: ; %bb.0:
2549 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1
2550 ; GFX9-FLUSH-NEXT: s_lshr_b32 s2, s1, 16
2551 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s2
2552 ; GFX9-FLUSH-NEXT: s_lshr_b32 s3, s0, 16
2553 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
2554 ; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s1
2555 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
2556 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
2557 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0
2558 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2559 ; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s3
2560 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, v2
2561 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
2562 ; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
2563 ; GFX9-FLUSH-NEXT: ; return to shader part epilog
2565 ; GFX10-LABEL: s_fdiv_v2f16:
2567 ; GFX10-NEXT: s_lshr_b32 s2, s1, 16
2568 ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1
2569 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, s2
2570 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16
2571 ; GFX10-NEXT: v_rcp_f32_e32 v0, v0
2572 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1
2573 ; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
2574 ; GFX10-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2575 ; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
2576 ; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3
2577 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
2578 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2579 ; GFX10-NEXT: ; return to shader part epilog
2581 ; GFX11-LABEL: s_fdiv_v2f16:
2583 ; GFX11-NEXT: s_lshr_b32 s2, s1, 16
2584 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1
2585 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2
2586 ; GFX11-NEXT: s_lshr_b32 s3, s0, 16
2587 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0
2588 ; GFX11-NEXT: v_rcp_f32_e32 v1, v1
2589 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2590 ; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
2591 ; GFX11-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2592 ; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
2593 ; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3
2594 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
2595 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
2596 ; GFX11-NEXT: ; return to shader part epilog
2597 %a = bitcast i32 %a.arg to <2 x half>
2598 %b = bitcast i32 %b.arg to <2 x half>
2599 %fdiv = fdiv <2 x half> %a, %b
2600 %result = bitcast <2 x half> %fdiv to i32
; Scalar-input reciprocal: fdiv of the half constant 1.0 by the argument, with
; no fast-math flags, in an amdgpu_ps function. The operand arrives as an i16
; inreg value and is bitcast to half.
; Expected lowering per -mcpu:
;   tahiti          - the constant 1.0 and the operand are both converted with
;                     v_cvt_f32_f16 and fed through the full v_div_scale_f32 /
;                     v_div_fmas_f32 / v_div_fixup_f32 expansion (plus
;                     s_setreg_imm32_b32 writes to HW_REG_MODE in the
;                     preserve-sign variant).
;   fiji and later  - a single v_rcp_f16 reading s0 directly (gfx1100
;                     additionally expects s_waitcnt_depctr after it).
; The result is copied back to s0 with v_readfirstlane_b32 in every case.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2604 define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
2605 ; GFX6-IEEE-LABEL: s_rcp_f16:
2606 ; GFX6-IEEE: ; %bb.0:
2607 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, 1.0
2608 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
2609 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
2610 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
2611 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
2612 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2613 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
2614 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
2615 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
2616 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
2617 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
2618 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2619 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
2620 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2621 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
2622 ; GFX6-IEEE-NEXT: ; return to shader part epilog
2624 ; GFX6-FLUSH-LABEL: s_rcp_f16:
2625 ; GFX6-FLUSH: ; %bb.0:
2626 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, 1.0
2627 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
2628 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
2629 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
2630 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
2631 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2632 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2633 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
2634 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
2635 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
2636 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
2637 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
2638 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2639 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2640 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
2641 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2642 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
2643 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
2644 ; GFX6-FLUSH-NEXT: ; return to shader part epilog
2646 ; GFX89-LABEL: s_rcp_f16:
2648 ; GFX89-NEXT: v_rcp_f16_e32 v0, s0
2649 ; GFX89-NEXT: v_readfirstlane_b32 s0, v0
2650 ; GFX89-NEXT: ; return to shader part epilog
2652 ; GFX10-LABEL: s_rcp_f16:
2654 ; GFX10-NEXT: v_rcp_f16_e32 v0, s0
2655 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2656 ; GFX10-NEXT: ; return to shader part epilog
2658 ; GFX11-LABEL: s_rcp_f16:
2660 ; GFX11-NEXT: v_rcp_f16_e32 v0, s0
2661 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2662 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
2663 ; GFX11-NEXT: ; return to shader part epilog
2664 %a = bitcast i16 %a.arg to half
2665 %fdiv = fdiv half 1.0, %a
2666 %result = bitcast half %fdiv to i16
; Scalar-input negated reciprocal: fdiv of the half constant -1.0 by the
; argument, with no fast-math flags, in an amdgpu_ps function. The operand
; arrives as an i16 inreg value and is bitcast to half.
; Expected lowering per -mcpu:
;   tahiti          - the constant -1.0 and the operand are both converted with
;                     v_cvt_f32_f16 and fed through the full v_div_scale_f32 /
;                     v_div_fmas_f32 / v_div_fixup_f32 expansion (plus
;                     s_setreg_imm32_b32 writes to HW_REG_MODE in the
;                     preserve-sign variant).
;   fiji and later  - a single v_rcp_f16_e64 whose source operand is written
;                     as -s0, i.e. the negation is folded into the operand
;                     (gfx1100 additionally expects s_waitcnt_depctr after it).
; The result is copied back to s0 with v_readfirstlane_b32 in every case.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2670 define amdgpu_ps i16 @s_neg_rcp_f16(i16 inreg %a.arg) {
2671 ; GFX6-IEEE-LABEL: s_neg_rcp_f16:
2672 ; GFX6-IEEE: ; %bb.0:
2673 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, -1.0
2674 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
2675 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
2676 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
2677 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
2678 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2679 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
2680 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
2681 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
2682 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
2683 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
2684 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2685 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
2686 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2687 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
2688 ; GFX6-IEEE-NEXT: ; return to shader part epilog
2690 ; GFX6-FLUSH-LABEL: s_neg_rcp_f16:
2691 ; GFX6-FLUSH: ; %bb.0:
2692 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, -1.0
2693 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
2694 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
2695 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
2696 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
2697 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2698 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2699 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
2700 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
2701 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
2702 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
2703 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
2704 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2705 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2706 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
2707 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2708 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
2709 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
2710 ; GFX6-FLUSH-NEXT: ; return to shader part epilog
2712 ; GFX89-LABEL: s_neg_rcp_f16:
2714 ; GFX89-NEXT: v_rcp_f16_e64 v0, -s0
2715 ; GFX89-NEXT: v_readfirstlane_b32 s0, v0
2716 ; GFX89-NEXT: ; return to shader part epilog
2718 ; GFX10-LABEL: s_neg_rcp_f16:
2720 ; GFX10-NEXT: v_rcp_f16_e64 v0, -s0
2721 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2722 ; GFX10-NEXT: ; return to shader part epilog
2724 ; GFX11-LABEL: s_neg_rcp_f16:
2726 ; GFX11-NEXT: v_rcp_f16_e64 v0, -s0
2727 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2728 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
2729 ; GFX11-NEXT: ; return to shader part epilog
2730 %a = bitcast i16 %a.arg to half
2731 %fdiv = fdiv half -1.0, %a
2732 %result = bitcast half %fdiv to i16
; Scalar-input reciprocal square root: a contract-flagged llvm.sqrt.f16 call
; whose result divides the half constant 1.0 in a contract-flagged fdiv, in an
; amdgpu_ps function. The operand arrives as an i16 inreg value and is bitcast
; to half.
; Expected lowering per -mcpu:
;   tahiti          - the pair is not fused: v_sqrt_f32 on the converted
;                     operand, a round trip through f16 (v_cvt_f16_f32 then
;                     v_cvt_f32_f16), and then the full v_div_scale_f32 /
;                     v_div_fmas_f32 / v_div_fixup_f32 expansion (plus
;                     s_setreg_imm32_b32 writes to HW_REG_MODE in the
;                     preserve-sign variant).
;   fiji and later  - the sqrt and fdiv collapse into a single v_rsq_f16
;                     reading s0 directly (gfx1100 additionally expects
;                     s_waitcnt_depctr after it).
; The result is copied back to s0 with v_readfirstlane_b32 in every case.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2736 define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
2737 ; GFX6-IEEE-LABEL: s_rsq_f16:
2738 ; GFX6-IEEE: ; %bb.0:
2739 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
2740 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
2741 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
2742 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2743 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
2744 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
2745 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
2746 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
2747 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2748 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
2749 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
2750 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
2751 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
2752 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
2753 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2754 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
2755 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2756 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
2757 ; GFX6-IEEE-NEXT: ; return to shader part epilog
2759 ; GFX6-FLUSH-LABEL: s_rsq_f16:
2760 ; GFX6-FLUSH: ; %bb.0:
2761 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
2762 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
2763 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
2764 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2765 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
2766 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
2767 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
2768 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
2769 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
2770 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2771 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
2772 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
2773 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
2774 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
2775 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
2776 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
2777 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2778 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
2779 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
2780 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
2781 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
2782 ; GFX6-FLUSH-NEXT: ; return to shader part epilog
2784 ; GFX89-LABEL: s_rsq_f16:
2786 ; GFX89-NEXT: v_rsq_f16_e32 v0, s0
2787 ; GFX89-NEXT: v_readfirstlane_b32 s0, v0
2788 ; GFX89-NEXT: ; return to shader part epilog
2790 ; GFX10-LABEL: s_rsq_f16:
2792 ; GFX10-NEXT: v_rsq_f16_e32 v0, s0
2793 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2794 ; GFX10-NEXT: ; return to shader part epilog
2796 ; GFX11-LABEL: s_rsq_f16:
2798 ; GFX11-NEXT: v_rsq_f16_e32 v0, s0
2799 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2800 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
2801 ; GFX11-NEXT: ; return to shader part epilog
2802 %a = bitcast i16 %a.arg to half
2803 %sqrt = call contract half @llvm.sqrt.f16(half %a)
2804 %fdiv = fdiv contract half 1.0, %sqrt
2805 %result = bitcast half %fdiv to i16
2809 define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
2810 ; GFX6-IEEE-LABEL: s_rsq_v2f16:
2811 ; GFX6-IEEE: ; %bb.0:
2812 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
2813 ; GFX6-IEEE-NEXT: s_lshr_b32 s0, s0, 16
2814 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
2815 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
2816 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
2817 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
2818 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2819 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
2820 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
2821 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
2822 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
2823 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v3
2824 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
2825 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, s[0:1], v1, v1, v2
2826 ; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v5, 1.0
2827 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v9, v5, v5
2828 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5
2829 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v6
2830 ; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
2831 ; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v5, v9
2832 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
2833 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
2834 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
2835 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v6, v8, 1.0
2836 ; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], v2, v1, v2
2837 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
2838 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
2839 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
2840 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4
2841 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
2842 ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[0:1]
2843 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
2844 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
2845 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
2846 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
2847 ; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2848 ; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
2849 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
2850 ; GFX6-IEEE-NEXT: ; return to shader part epilog
2852 ; GFX6-FLUSH-LABEL: s_rsq_v2f16:
2853 ; GFX6-FLUSH: ; %bb.0:
2854 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
2855 ; GFX6-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
2856 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
2857 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
2858 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
2859 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
2860 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2861 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
2862 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
2863 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
2864 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
2865 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
2866 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
2867 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2868 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
2869 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
2870 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
2871 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
2872 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
2873 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
2874 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2875 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
2876 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
2877 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
2878 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
2879 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
2880 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v4
2881 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
2882 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
2883 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2884 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
2885 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
2886 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
2887 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
2888 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
2889 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
2890 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2891 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
2892 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
2893 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
2894 ; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2895 ; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
2896 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
2897 ; GFX6-FLUSH-NEXT: ; return to shader part epilog
2899 ; GFX8-LABEL: s_rsq_v2f16:
2901 ; GFX8-NEXT: v_sqrt_f16_e32 v0, s0
2902 ; GFX8-NEXT: s_lshr_b32 s0, s0, 16
2903 ; GFX8-NEXT: v_sqrt_f16_e32 v1, s0
2904 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
2905 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v0
2906 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
2907 ; GFX8-NEXT: v_rcp_f32_e32 v2, v2
2908 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
2909 ; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
2910 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
2911 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
2912 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
2913 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
2914 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
2915 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2916 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
2917 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
2918 ; GFX8-NEXT: ; return to shader part epilog
2920 ; GFX9-IEEE-LABEL: s_rsq_v2f16:
2921 ; GFX9-IEEE: ; %bb.0:
2922 ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
2923 ; GFX9-IEEE-NEXT: s_lshr_b32 s0, s0, 16
2924 ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
2925 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
2926 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
2927 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
2928 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
2929 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
2930 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
2931 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
2932 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
2933 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
2934 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
2935 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
2936 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
2937 ; GFX9-IEEE-NEXT: v_readfirstlane_b32 s0, v0
2938 ; GFX9-IEEE-NEXT: ; return to shader part epilog
2940 ; GFX9-FLUSH-LABEL: s_rsq_v2f16:
2941 ; GFX9-FLUSH: ; %bb.0:
2942 ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
2943 ; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
2944 ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
2945 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
2946 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
2947 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
2948 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
2949 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
2950 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
2951 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v3, 0 op_sel_hi:[1,0,0]
2952 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
2953 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
2954 ; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
2955 ; GFX9-FLUSH-NEXT: ; return to shader part epilog
2957 ; GFX10-LABEL: s_rsq_v2f16:
2959 ; GFX10-NEXT: s_lshr_b32 s1, s0, 16
2960 ; GFX10-NEXT: v_sqrt_f16_e32 v0, s0
2961 ; GFX10-NEXT: v_sqrt_f16_e32 v1, s1
2962 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
2963 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
2964 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
2965 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
2966 ; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
2967 ; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
2968 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
2969 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
2970 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
2971 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
2972 ; GFX10-NEXT: ; return to shader part epilog
2974 ; GFX11-LABEL: s_rsq_v2f16:
2976 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16
2977 ; GFX11-NEXT: v_sqrt_f16_e32 v0, s0
2978 ; GFX11-NEXT: v_sqrt_f16_e32 v1, s1
2979 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2980 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
2981 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
2982 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
2983 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
2984 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2985 ; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
2986 ; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
2987 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
2988 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
2989 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
2990 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
2991 ; GFX11-NEXT: ; return to shader part epilog
2992 %a = bitcast i32 %a.arg to <2 x half>
2993 %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
2994 %fdiv = fdiv contract <2 x half> <half -1.0, half -1.0>, %sqrt
2995 %result = bitcast <2 x half> %fdiv to i32
; Reciprocal square root: 1.0 / sqrt(%a), with the `contract` flag on both the
; sqrt call and the fdiv.
; The GFX89 and GFX10PLUS checks expect the pair to be selected as a single
; v_rsq_f16. The GFX6 checks expect the sqrt to be done in f32 (v_cvt_f32_f16,
; v_sqrt_f32, v_cvt_f16_f32) followed by the expanded f32 division sequence
; (v_div_scale, v_rcp, v_fma chain, v_div_fmas, v_div_fixup); the GFX6 flush
; variant additionally expects s_setreg_imm32_b32 writes to HW_REG_MODE.
2999 define half @v_rsq_f16(half %a) {
3000 ; GFX6-IEEE-LABEL: v_rsq_f16:
3001 ; GFX6-IEEE: ; %bb.0:
3002 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3003 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3004 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3005 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3006 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3007 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3008 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3009 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
3010 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3011 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3012 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
3013 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
3014 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
3015 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
3016 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
3017 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3018 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3019 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3020 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
3022 ; GFX6-FLUSH-LABEL: v_rsq_f16:
3023 ; GFX6-FLUSH: ; %bb.0:
3024 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3025 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3026 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3027 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
3028 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
3029 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3030 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3031 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3032 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
3033 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3034 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3035 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3036 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
3037 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
3038 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
3039 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
3040 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
3041 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3042 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3043 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3044 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3045 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
3047 ; GFX89-LABEL: v_rsq_f16:
3049 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3050 ; GFX89-NEXT: v_rsq_f16_e32 v0, v0
3051 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3053 ; GFX10PLUS-LABEL: v_rsq_f16:
3054 ; GFX10PLUS: ; %bb.0:
3055 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3056 ; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0
3057 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
3058 %sqrt = call contract half @llvm.sqrt.f16(half %a)
3059 %fdiv = fdiv contract half 1.0, %sqrt
; Negated reciprocal square root: -1.0 / sqrt(%a), with `contract` on both the
; sqrt call and the fdiv.
; The GFX89, GFX10 and GFX11 checks do not expect v_rsq_f16 here; they expect
; v_sqrt_f16 followed by v_rcp_f16 with a negated source operand (-v0). GFX11
; also expects an s_waitcnt_depctr between the two. The GFX6 checks expect
; -1.0 converted to f32 as the numerator of the expanded f32 division.
3063 define half @v_neg_rsq_f16(half %a) {
3064 ; GFX6-IEEE-LABEL: v_neg_rsq_f16:
3065 ; GFX6-IEEE: ; %bb.0:
3066 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3067 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3068 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
3069 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3070 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3071 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3072 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3073 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
3074 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3075 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3076 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
3077 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
3078 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
3079 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
3080 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
3081 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3082 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3083 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3084 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
3086 ; GFX6-FLUSH-LABEL: v_neg_rsq_f16:
3087 ; GFX6-FLUSH: ; %bb.0:
3088 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3089 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3090 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
3091 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
3092 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
3093 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3094 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3095 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3096 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
3097 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3098 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3099 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3100 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
3101 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
3102 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
3103 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
3104 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
3105 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3106 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3107 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3108 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3109 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
3111 ; GFX89-LABEL: v_neg_rsq_f16:
3113 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3114 ; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
3115 ; GFX89-NEXT: v_rcp_f16_e64 v0, -v0
3116 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3118 ; GFX10-LABEL: v_neg_rsq_f16:
3120 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3121 ; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
3122 ; GFX10-NEXT: v_rcp_f16_e64 v0, -v0
3123 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3125 ; GFX11-LABEL: v_neg_rsq_f16:
3127 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3128 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
3129 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3130 ; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
3131 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3132 %sqrt = call contract half @llvm.sqrt.f16(half %a)
3133 %fdiv = fdiv contract half -1.0, %sqrt
; Reciprocal square root where the sqrt result has a second use: the function
; returns { sqrt(%a), 1.0 / sqrt(%a) }, with `contract` on both operations.
; The GFX89, GFX10 and GFX11 checks expect both a v_sqrt_f16 (for element 0)
; and a v_rsq_f16 (for element 1), each reading the original input in v0, with
; the sqrt result then moved into v0. The GFX6 checks keep the f16 sqrt result
; in v0 and run the expanded f32 division on a separate copy (v2), leaving the
; quotient in v1.
3137 define { half, half } @v_rsq_f16_multi_use(half %a) {
3138 ; GFX6-IEEE-LABEL: v_rsq_f16_multi_use:
3139 ; GFX6-IEEE: ; %bb.0:
3140 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3141 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3142 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3143 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3144 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3145 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
3146 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
3147 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
3148 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
3149 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
3150 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
3151 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
3152 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
3153 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
3154 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
3155 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
3156 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v2, v1
3157 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
3158 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
3160 ; GFX6-FLUSH-LABEL: v_rsq_f16_multi_use:
3161 ; GFX6-FLUSH: ; %bb.0:
3162 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3163 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3164 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3165 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
3166 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
3167 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3168 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
3169 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
3170 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
3171 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
3172 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3173 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
3174 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
3175 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
3176 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
3177 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
3178 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
3179 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3180 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
3181 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
3182 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
3183 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
3185 ; GFX89-LABEL: v_rsq_f16_multi_use:
3187 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3188 ; GFX89-NEXT: v_sqrt_f16_e32 v2, v0
3189 ; GFX89-NEXT: v_rsq_f16_e32 v1, v0
3190 ; GFX89-NEXT: v_mov_b32_e32 v0, v2
3191 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3193 ; GFX10-LABEL: v_rsq_f16_multi_use:
3195 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3196 ; GFX10-NEXT: v_sqrt_f16_e32 v2, v0
3197 ; GFX10-NEXT: v_rsq_f16_e32 v1, v0
3198 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
3199 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3201 ; GFX11-LABEL: v_rsq_f16_multi_use:
3203 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3204 ; GFX11-NEXT: v_sqrt_f16_e32 v2, v0
3205 ; GFX11-NEXT: v_rsq_f16_e32 v1, v0
3206 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3207 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
3208 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3209 %sqrt = call contract half @llvm.sqrt.f16(half %a)
3210 %insert.0 = insertvalue { half, half } poison, half %sqrt, 0
3211 %fdiv = fdiv contract half 1.0, %sqrt
3212 %insert.1 = insertvalue { half, half } %insert.0, half %fdiv, 1
3213 ret { half, half } %insert.1
; 1.0 / sqrt(%a) where the sqrt call carries no `contract` flag; only the fdiv
; has it.
; The GFX89, GFX10 and GFX11 checks expect no v_rsq_f16 in this case; they
; expect a separate v_sqrt_f16 followed by v_rcp_f16. The GFX6 checks expect
; the f32 sqrt plus the expanded f32 division sequence.
3216 define half @v_rsq_f16_missing_contract0(half %a) {
3217 ; GFX6-IEEE-LABEL: v_rsq_f16_missing_contract0:
3218 ; GFX6-IEEE: ; %bb.0:
3219 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3220 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3221 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3222 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3223 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3224 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3225 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3226 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
3227 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3228 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3229 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
3230 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
3231 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
3232 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
3233 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
3234 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3235 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3236 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3237 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
3239 ; GFX6-FLUSH-LABEL: v_rsq_f16_missing_contract0:
3240 ; GFX6-FLUSH: ; %bb.0:
3241 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3242 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3243 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3244 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
3245 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
3246 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3247 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3248 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3249 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
3250 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3251 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3252 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3253 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
3254 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
3255 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
3256 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
3257 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
3258 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3259 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3260 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3261 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3262 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
3264 ; GFX89-LABEL: v_rsq_f16_missing_contract0:
3266 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3267 ; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
3268 ; GFX89-NEXT: v_rcp_f16_e32 v0, v0
3269 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3271 ; GFX10-LABEL: v_rsq_f16_missing_contract0:
3273 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3274 ; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
3275 ; GFX10-NEXT: v_rcp_f16_e32 v0, v0
3276 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3278 ; GFX11-LABEL: v_rsq_f16_missing_contract0:
3280 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3281 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
3282 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3283 ; GFX11-NEXT: v_rcp_f16_e32 v0, v0
3284 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3285 %sqrt = call half @llvm.sqrt.f16(half %a)
3286 %fdiv = fdiv contract half 1.0, %sqrt
; 1.0 / sqrt(%a) where the fdiv carries no `contract` flag; only the sqrt call
; has it.
; The GFX89, GFX10 and GFX11 checks expect no v_rsq_f16 in this case; they
; expect a separate v_sqrt_f16 followed by v_rcp_f16. The GFX6 checks expect
; the f32 sqrt plus the expanded f32 division sequence.
3290 define half @v_rsq_f16_missing_contract1(half %a) {
3291 ; GFX6-IEEE-LABEL: v_rsq_f16_missing_contract1:
3292 ; GFX6-IEEE: ; %bb.0:
3293 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3294 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3295 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3296 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3297 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3298 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3299 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3300 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
3301 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3302 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3303 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
3304 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
3305 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
3306 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
3307 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
3308 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3309 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3310 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3311 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
3313 ; GFX6-FLUSH-LABEL: v_rsq_f16_missing_contract1:
3314 ; GFX6-FLUSH: ; %bb.0:
3315 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3316 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3317 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3318 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
3319 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
3320 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3321 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3322 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3323 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
3324 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3325 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3326 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3327 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
3328 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
3329 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
3330 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
3331 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
3332 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3333 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3334 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3335 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3336 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
3338 ; GFX89-LABEL: v_rsq_f16_missing_contract1:
3340 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3341 ; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
3342 ; GFX89-NEXT: v_rcp_f16_e32 v0, v0
3343 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3345 ; GFX10-LABEL: v_rsq_f16_missing_contract1:
3347 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3348 ; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
3349 ; GFX10-NEXT: v_rcp_f16_e32 v0, v0
3350 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3352 ; GFX11-LABEL: v_rsq_f16_missing_contract1:
3354 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3355 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
3356 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3357 ; GFX11-NEXT: v_rcp_f16_e32 v0, v0
3358 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3359 %sqrt = call contract half @llvm.sqrt.f16(half %a)
3360 %fdiv = fdiv half 1.0, %sqrt
; -1.0 / sqrt(%a) where the sqrt call carries no `contract` flag; only the
; fdiv has it.
; The GFX89, GFX10 and GFX11 checks expect v_sqrt_f16 followed by v_rcp_f16
; with a negated source operand (-v0). The GFX6 checks expect -1.0 converted
; to f32 as the numerator of the expanded f32 division sequence.
3364 define half @v_neg_rsq_f16_missing_contract0(half %a) {
3365 ; GFX6-IEEE-LABEL: v_neg_rsq_f16_missing_contract0:
3366 ; GFX6-IEEE: ; %bb.0:
3367 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3368 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3369 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
3370 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3371 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3372 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3373 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3374 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
3375 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3376 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3377 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
3378 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
3379 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
3380 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
3381 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
3382 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3383 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3384 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3385 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
3387 ; GFX6-FLUSH-LABEL: v_neg_rsq_f16_missing_contract0:
3388 ; GFX6-FLUSH: ; %bb.0:
3389 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3390 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3391 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
3392 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
3393 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
3394 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3395 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3396 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3397 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
3398 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3399 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3400 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3401 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
3402 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
3403 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
3404 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
3405 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
3406 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3407 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3408 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3409 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3410 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
3412 ; GFX89-LABEL: v_neg_rsq_f16_missing_contract0:
3414 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3415 ; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
3416 ; GFX89-NEXT: v_rcp_f16_e64 v0, -v0
3417 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3419 ; GFX10-LABEL: v_neg_rsq_f16_missing_contract0:
3421 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3422 ; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
3423 ; GFX10-NEXT: v_rcp_f16_e64 v0, -v0
3424 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3426 ; GFX11-LABEL: v_neg_rsq_f16_missing_contract0:
3428 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3429 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
3430 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3431 ; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
3432 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3433 %sqrt = call half @llvm.sqrt.f16(half %a)
3434 %fdiv = fdiv contract half -1.0, %sqrt
; -1.0 / sqrt(%a) where the fdiv carries no `contract` flag; only the sqrt
; call has it.
; The GFX89, GFX10 and GFX11 checks expect v_sqrt_f16 followed by v_rcp_f16
; with a negated source operand (-v0). The GFX6 checks expect -1.0 converted
; to f32 as the numerator of the expanded f32 division sequence.
3438 define half @v_neg_rsq_f16_missing_contract1(half %a) {
3439 ; GFX6-IEEE-LABEL: v_neg_rsq_f16_missing_contract1:
3440 ; GFX6-IEEE: ; %bb.0:
3441 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3442 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3443 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
3444 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3445 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3446 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3447 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3448 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
3449 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3450 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3451 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
3452 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
3453 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
3454 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
3455 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
3456 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3457 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3458 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3459 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
3461 ; GFX6-FLUSH-LABEL: v_neg_rsq_f16_missing_contract1:
3462 ; GFX6-FLUSH: ; %bb.0:
3463 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3464 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3465 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
3466 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
3467 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
3468 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3469 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3470 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3471 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
3472 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3473 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3474 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3475 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
3476 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
3477 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
3478 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
3479 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
3480 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3481 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3482 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3483 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3484 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
3486 ; GFX89-LABEL: v_neg_rsq_f16_missing_contract1:
3488 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3489 ; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
3490 ; GFX89-NEXT: v_rcp_f16_e64 v0, -v0
3491 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3493 ; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
3495 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3496 ; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
3497 ; GFX10-NEXT: v_rcp_f16_e64 v0, -v0
3498 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3500 ; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
3502 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3503 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
3504 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3505 ; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
3506 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3507 %sqrt = call contract half @llvm.sqrt.f16(half %a)
3508 %fdiv = fdiv half -1.0, %sqrt
; -1.0 / sqrt(fabs(%a)): the input goes through llvm.fabs.f16 (no fast-math
; flags on that call) before a `contract` sqrt and a `contract` fdiv.
; The checks expect the fabs to appear as an |v0| source modifier on the first
; instruction that reads the input: v_sqrt_f16_e64 on GFX89/GFX10/GFX11 and
; v_cvt_f32_f16_e64 on GFX6. The GFX89/GFX10/GFX11 checks then expect
; v_rcp_f16 with a negated source operand (-v0) rather than v_rsq_f16.
3512 define half @v_neg_rsq_f16_fabs(half %a) {
3513 ; GFX6-IEEE-LABEL: v_neg_rsq_f16_fabs:
3514 ; GFX6-IEEE: ; %bb.0:
3515 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3516 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e64 v0, |v0|
3517 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
3518 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3519 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3520 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3521 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3522 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
3523 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3524 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3525 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
3526 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
3527 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
3528 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
3529 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
3530 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3531 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3532 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3533 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
3535 ; GFX6-FLUSH-LABEL: v_neg_rsq_f16_fabs:
3536 ; GFX6-FLUSH: ; %bb.0:
3537 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3538 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e64 v0, |v0|
3539 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
3540 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
3541 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
3542 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3543 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3544 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3545 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
3546 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3547 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3548 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3549 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
3550 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
3551 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
3552 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
3553 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
3554 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3555 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3556 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3557 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3558 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
3560 ; GFX89-LABEL: v_neg_rsq_f16_fabs:
3562 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3563 ; GFX89-NEXT: v_sqrt_f16_e64 v0, |v0|
3564 ; GFX89-NEXT: v_rcp_f16_e64 v0, -v0
3565 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3567 ; GFX10-LABEL: v_neg_rsq_f16_fabs:
3569 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3570 ; GFX10-NEXT: v_sqrt_f16_e64 v0, |v0|
3571 ; GFX10-NEXT: v_rcp_f16_e64 v0, -v0
3572 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3574 ; GFX11-LABEL: v_neg_rsq_f16_fabs:
3576 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3577 ; GFX11-NEXT: v_sqrt_f16_e64 v0, |v0|
3578 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3579 ; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
3580 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3581 %a.fabs = call half @llvm.fabs.f16(half %a)
3582 %sqrt = call contract half @llvm.sqrt.f16(half %a.fabs)
3583 %fdiv = fdiv contract half -1.0, %sqrt
; 1.0 / sqrt(%a) where the fdiv carries both `contract` and `arcp`, and the
; sqrt call carries `contract`.
; The GFX89 and GFX10PLUS checks expect a single v_rsq_f16. The GFX6 checks
; expect the f32 sqrt plus the expanded f32 division sequence
; (v_div_scale, v_rcp, v_fma chain, v_div_fmas, v_div_fixup).
3587 define half @v_rsq_f16_arcp(half %a) {
3588 ; GFX6-IEEE-LABEL: v_rsq_f16_arcp:
3589 ; GFX6-IEEE: ; %bb.0:
3590 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3591 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3592 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3593 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3594 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3595 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3596 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3597 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
3598 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3599 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3600 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
3601 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
3602 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
3603 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
3604 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
3605 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3606 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3607 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3608 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
3610 ; GFX6-FLUSH-LABEL: v_rsq_f16_arcp:
3611 ; GFX6-FLUSH: ; %bb.0:
3612 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3613 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3614 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3615 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
3616 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
3617 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3618 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3619 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3620 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
3621 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3622 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3623 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3624 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
3625 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
3626 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
3627 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
3628 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
3629 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3630 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3631 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3632 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3633 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
3635 ; GFX89-LABEL: v_rsq_f16_arcp:
3637 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3638 ; GFX89-NEXT: v_rsq_f16_e32 v0, v0
3639 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3641 ; GFX10PLUS-LABEL: v_rsq_f16_arcp:
3642 ; GFX10PLUS: ; %bb.0:
3643 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3644 ; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0
3645 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
3646 %sqrt = call contract half @llvm.sqrt.f16(half %a)
3647 %fdiv = fdiv contract arcp half 1.0, %sqrt
; -1.0 / sqrt(%a) in half precision: the sqrt call carries `contract` and the
; fdiv carries `contract arcp`.
; Expected codegen per the checks below:
;  - tahiti (both denormal modes): the input is converted to f32, the
;    v_sqrt_f32 result is rounded through f16 and back, and the division is
;    expanded into the v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup
;    sequence. The preserve-sign run additionally contains
;    s_setreg_imm32_b32 writes to HW_REG_MODE around that sequence.
;  - fiji, gfx900, gfx1010, gfx1100: v_sqrt_f16 followed by v_rcp_f16 with a
;    negated source operand; no v_rsq_f16 is formed for the negated numerator.
;    gfx1100 also has an s_waitcnt_depctr between the two instructions.
3651 define half @v_neg_rsq_f16_arcp(half %a) {
3652 ; GFX6-IEEE-LABEL: v_neg_rsq_f16_arcp:
3653 ; GFX6-IEEE: ; %bb.0:
3654 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3655 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3656 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
3657 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3658 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3659 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3660 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3661 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
3662 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3663 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3664 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
3665 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
3666 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
3667 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
3668 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
3669 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3670 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3671 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3672 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
3674 ; GFX6-FLUSH-LABEL: v_neg_rsq_f16_arcp:
3675 ; GFX6-FLUSH: ; %bb.0:
3676 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3677 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3678 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
3679 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
3680 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
3681 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3682 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3683 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
3684 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
3685 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
3686 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3687 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
3688 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
3689 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
3690 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
3691 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
3692 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
3693 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3694 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
3695 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
3696 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3697 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
3699 ; GFX89-LABEL: v_neg_rsq_f16_arcp:
3701 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3702 ; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
3703 ; GFX89-NEXT: v_rcp_f16_e64 v0, -v0
3704 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3706 ; GFX10-LABEL: v_neg_rsq_f16_arcp:
3708 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3709 ; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
3710 ; GFX10-NEXT: v_rcp_f16_e64 v0, -v0
3711 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3713 ; GFX11-LABEL: v_neg_rsq_f16_arcp:
3715 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3716 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
3717 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3718 ; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
3719 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3720 %sqrt = call contract half @llvm.sqrt.f16(half %a)
3721 %fdiv = fdiv contract arcp half -1.0, %sqrt
; 1.0 / sqrt(%a) in half precision with `contract` on the sqrt call and
; `afn contract` on the fdiv.
; Expected codegen per the checks below:
;  - tahiti: one check body shared by both denormal modes. The input is
;    converted to f32, v_sqrt_f32 is rounded through f16 and back, and the
;    division is a plain v_rcp_f32 + v_mul_f32 rather than the full
;    v_div_scale / v_div_fmas / v_div_fixup expansion.
;  - fiji, gfx900, gfx1010, gfx1100: the sqrt/fdiv pair becomes a single
;    v_rsq_f16.
3725 define half @v_rsq_f16_afn(half %a) {
3726 ; GFX6-LABEL: v_rsq_f16_afn:
3728 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3729 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
3730 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3731 ; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
3732 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
3733 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
3734 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0
3735 ; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
3736 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
3737 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3739 ; GFX89-LABEL: v_rsq_f16_afn:
3741 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3742 ; GFX89-NEXT: v_rsq_f16_e32 v0, v0
3743 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3745 ; GFX10PLUS-LABEL: v_rsq_f16_afn:
3746 ; GFX10PLUS: ; %bb.0:
3747 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3748 ; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0
3749 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
3750 %sqrt = call contract half @llvm.sqrt.f16(half %a)
3751 %fdiv = fdiv afn contract half 1.0, %sqrt
; 1.0 / sqrt(%a) in half precision with only `afn` on the fdiv and no
; fast-math flags on the sqrt call (i.e. no `contract` on either instruction).
; Expected codegen per the checks below:
;  - tahiti: one check body shared by both denormal modes; f32 v_sqrt_f32
;    rounded through f16 and back, then v_rcp_f32 + v_mul_f32.
;  - fiji, gfx900, gfx1010, gfx1100: v_sqrt_f16 followed by v_rcp_f16; no
;    v_rsq_f16 is formed. gfx1100 also has an s_waitcnt_depctr between the two
;    instructions.
3755 define half @v_rsq_f16_afn_nocontract(half %a) {
3756 ; GFX6-LABEL: v_rsq_f16_afn_nocontract:
3758 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3759 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
3760 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
3761 ; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
3762 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
3763 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
3764 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0
3765 ; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
3766 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
3767 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3769 ; GFX89-LABEL: v_rsq_f16_afn_nocontract:
3771 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3772 ; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
3773 ; GFX89-NEXT: v_rcp_f16_e32 v0, v0
3774 ; GFX89-NEXT: s_setpc_b64 s[30:31]
3776 ; GFX10-LABEL: v_rsq_f16_afn_nocontract:
3778 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3779 ; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
3780 ; GFX10-NEXT: v_rcp_f16_e32 v0, v0
3781 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3783 ; GFX11-LABEL: v_rsq_f16_afn_nocontract:
3785 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3786 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
3787 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3788 ; GFX11-NEXT: v_rcp_f16_e32 v0, v0
3789 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3790 %sqrt = call half @llvm.sqrt.f16(half %a)
3791 %fdiv = fdiv afn half 1.0, %sqrt
; <2 x half> splat(1.0) / sqrt(%a), with `contract` on both the sqrt call and
; the fdiv (no afn or arcp).
; Expected codegen per the checks below:
;  - tahiti: each lane is handled in f32 with the full v_div_scale / v_rcp /
;    v_fma / v_div_fmas / v_div_fixup expansion. The two denormal modes have
;    separate check bodies; the preserve-sign one contains s_setreg_imm32_b32
;    writes to HW_REG_MODE and materializes the 1.0 numerator a second time.
;  - fiji (both modes) and gfx900 with ieee denormals: per-lane v_sqrt_f16,
;    then v_rcp_f32, v_mul_f32 by the numerator, v_cvt_f16_f32 and
;    v_div_fixup_f16. fiji repacks with v_lshlrev_b32 + v_or_b32, gfx900 with
;    v_pack_b32_f16.
;  - gfx900 with preserve-sign: v_mad_mixlo_f16 takes the place of the
;    v_mul_f32 / v_cvt_f16_f32 pair.
;  - gfx1010 and gfx1100: v_fma_mixlo_f16 is used; gfx1100 extracts the high
;    lane with v_lshrrev_b32 where the other targets use an SDWA v_sqrt_f16.
; None of the check bodies contains v_rsq_f16.
3795 define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
3796 ; GFX6-IEEE-LABEL: v_rsq_v2f16:
3797 ; GFX6-IEEE: ; %bb.0:
3798 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3799 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3800 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
3801 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
3802 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3803 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
3804 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3805 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
3806 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3807 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
3808 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
3809 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
3810 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
3811 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
3812 ; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
3813 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
3814 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
3815 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
3816 ; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
3817 ; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
3818 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
3819 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
3820 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
3821 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
3822 ; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
3823 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
3824 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
3825 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
3826 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
3827 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
3828 ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
3829 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
3830 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
3831 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3832 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
3833 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
3835 ; GFX6-FLUSH-LABEL: v_rsq_v2f16:
3836 ; GFX6-FLUSH: ; %bb.0:
3837 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3838 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3839 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
3840 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
3841 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
3842 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
3843 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
3844 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3845 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
3846 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
3847 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
3848 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
3849 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
3850 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3851 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
3852 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
3853 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
3854 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
3855 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
3856 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
3857 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3858 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
3859 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
3860 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
3861 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
3862 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
3863 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
3864 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
3865 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
3866 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3867 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
3868 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
3869 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
3870 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
3871 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
3872 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
3873 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3874 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
3875 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
3876 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
3877 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
3879 ; GFX8-LABEL: v_rsq_v2f16:
3881 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3882 ; GFX8-NEXT: v_sqrt_f16_e32 v1, v0
3883 ; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
3884 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
3885 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
3886 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
3887 ; GFX8-NEXT: v_rcp_f32_e32 v2, v2
3888 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
3889 ; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
3890 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
3891 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
3892 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
3893 ; GFX8-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
3894 ; GFX8-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
3895 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3896 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
3897 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3899 ; GFX9-IEEE-LABEL: v_rsq_v2f16:
3900 ; GFX9-IEEE: ; %bb.0:
3901 ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3902 ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
3903 ; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
3904 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
3905 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
3906 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
3907 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
3908 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
3909 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
3910 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
3911 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
3912 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
3913 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
3914 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
3915 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0
3916 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
3918 ; GFX9-FLUSH-LABEL: v_rsq_v2f16:
3919 ; GFX9-FLUSH: ; %bb.0:
3920 ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3921 ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
3922 ; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
3923 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
3924 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
3925 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
3926 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
3927 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
3928 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
3929 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
3930 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
3931 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
3932 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
3934 ; GFX10-LABEL: v_rsq_v2f16:
3936 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3937 ; GFX10-NEXT: v_sqrt_f16_e32 v1, v0
3938 ; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
3939 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
3940 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
3941 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
3942 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
3943 ; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
3944 ; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
3945 ; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
3946 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
3947 ; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
3948 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3950 ; GFX11-LABEL: v_rsq_v2f16:
3952 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3953 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
3954 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
3955 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
3956 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3957 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
3958 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
3959 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
3960 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
3961 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3962 ; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
3963 ; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
3964 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
3965 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
3966 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
3967 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3968 %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
3969 %fdiv = fdiv contract <2 x half> <half 1.0, half 1.0>, %sqrt
3970 ret <2 x half> %fdiv
; <2 x half> splat(-1.0) / sqrt(%a), with `contract` on both the sqrt call and
; the fdiv (no afn or arcp).
; Expected codegen per the checks below:
;  - tahiti: each lane is handled in f32 with the full v_div_scale / v_rcp /
;    v_fma / v_div_fmas / v_div_fixup expansion. The two denormal modes have
;    separate check bodies; the preserve-sign one contains s_setreg_imm32_b32
;    writes to HW_REG_MODE and materializes the -1.0 numerator a second time.
;  - fiji (both modes) and gfx900 with ieee denormals: per-lane v_sqrt_f16,
;    then v_rcp_f32, v_mul_f32 by the numerator, v_cvt_f16_f32 and
;    v_div_fixup_f16. fiji repacks with v_lshlrev_b32 + v_or_b32, gfx900 with
;    v_pack_b32_f16.
;  - gfx900 with preserve-sign: v_mad_mixlo_f16 takes the place of the
;    v_mul_f32 / v_cvt_f16_f32 pair.
;  - gfx1010 and gfx1100: v_fma_mixlo_f16 is used; gfx1100 extracts the high
;    lane with v_lshrrev_b32 where the other targets use an SDWA v_sqrt_f16.
; None of the check bodies contains v_rsq_f16; the -1.0 numerator appears as an
; inline constant operand.
3973 define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
3974 ; GFX6-IEEE-LABEL: v_neg_rsq_v2f16:
3975 ; GFX6-IEEE: ; %bb.0:
3976 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3977 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3978 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
3979 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
3980 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
3981 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
3982 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
3983 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
3984 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
3985 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
3986 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
3987 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
3988 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
3989 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
3990 ; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
3991 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
3992 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
3993 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
3994 ; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
3995 ; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
3996 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
3997 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
3998 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
3999 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
4000 ; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
4001 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
4002 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
4003 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
4004 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
4005 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
4006 ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
4007 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
4008 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
4009 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
4010 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
4011 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
4013 ; GFX6-FLUSH-LABEL: v_neg_rsq_v2f16:
4014 ; GFX6-FLUSH: ; %bb.0:
4015 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4016 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
4017 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
4018 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
4019 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
4020 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
4021 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
4022 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
4023 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
4024 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
4025 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
4026 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
4027 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
4028 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
4029 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
4030 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
4031 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
4032 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
4033 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
4034 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
4035 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
4036 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
4037 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
4038 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
4039 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
4040 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
4041 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
4042 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
4043 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
4044 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
4045 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
4046 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
4047 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
4048 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
4049 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
4050 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
4051 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
4052 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
4053 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
4054 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
4055 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
4057 ; GFX8-LABEL: v_neg_rsq_v2f16:
4059 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4060 ; GFX8-NEXT: v_sqrt_f16_e32 v1, v0
4061 ; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
4062 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
4063 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
4064 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
4065 ; GFX8-NEXT: v_rcp_f32_e32 v2, v2
4066 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
4067 ; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
4068 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
4069 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
4070 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
4071 ; GFX8-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
4072 ; GFX8-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
4073 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
4074 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
4075 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4077 ; GFX9-IEEE-LABEL: v_neg_rsq_v2f16:
4078 ; GFX9-IEEE: ; %bb.0:
4079 ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4080 ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
4081 ; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
4082 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
4083 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
4084 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
4085 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
4086 ; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
4087 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
4088 ; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
4089 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
4090 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
4091 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
4092 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
4093 ; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0
4094 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
4096 ; GFX9-FLUSH-LABEL: v_neg_rsq_v2f16:
4097 ; GFX9-FLUSH: ; %bb.0:
4098 ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4099 ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
4100 ; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
4101 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
4102 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
4103 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
4104 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
4105 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
4106 ; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
4107 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
4108 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
4109 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
4110 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
4112 ; GFX10-LABEL: v_neg_rsq_v2f16:
4114 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4115 ; GFX10-NEXT: v_sqrt_f16_e32 v1, v0
4116 ; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
4117 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
4118 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
4119 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
4120 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
4121 ; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
4122 ; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
4123 ; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
4124 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
4125 ; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
4126 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4128 ; GFX11-LABEL: v_neg_rsq_v2f16:
4130 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4131 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
4132 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
4133 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
4134 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
4135 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
4136 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
4137 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
4138 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
4139 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
4140 ; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
4141 ; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
4142 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
4143 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
4144 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
4145 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4146 %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
4147 %fdiv = fdiv contract <2 x half> <half -1.0, half -1.0>, %sqrt
4148 ret <2 x half> %fdiv
; Intrinsic declarations for the scalar and <2 x half> variants of fabs and
; sqrt. The sqrt intrinsics are called by the rsq tests above; the fabs
; intrinsics are presumably used by tests earlier in the file (not confirmed
; from this part of the file).
4151 declare half @llvm.fabs.f16(half)
4152 declare half @llvm.sqrt.f16(half)
4153 declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
4154 declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
4156 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
4157 ; GFX10-FLUSH: {{.*}}
4158 ; GFX10-IEEE: {{.*}}
4159 ; GFX11-FLUSH: {{.*}}
4160 ; GFX11-IEEE: {{.*}}
4161 ; GFX8-FLUSH: {{.*}}