1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
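2 ; Denormal mode shouldn't matter for f16, so check with and without flushing. Only the GFX6 output differs between the two modes (separate IEEE/FLUSH check prefixes); the other targets share one set of checks for both runs.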
3 ; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-IEEE %s
4 ; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-FLUSH %s
6 ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
7 ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
9 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
10 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
12 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
13 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; Baseline case: scalar half fdiv with no fast-math flags and no !fpmath
; metadata. The two tahiti runs are checked separately per denormal mode
; (IEEE vs FLUSH prefixes); the FLUSH checks are the IEEE checks plus
; s_setreg_imm32_b32 writes to HW_REG_MODE. The fiji and gfx900 runs share
; the GFX89 checks for this function.
; NOTE(review): the HW_REG_MODE writes presumably toggle FP32 denormal
; handling around the FMA sequence - confirm against the ISA documentation.
15 define half @v_fdiv_f16(half %a, half %b) {
16 ; GFX6-IEEE-LABEL: v_fdiv_f16:
18 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
20 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
21 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
22 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
23 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
24 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
25 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
26 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
27 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
28 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
29 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
30 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
31 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
32 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
33 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
35 ; GFX6-FLUSH-LABEL: v_fdiv_f16:
36 ; GFX6-FLUSH: ; %bb.0:
37 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
39 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
40 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
41 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
42 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
43 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
44 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
45 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
46 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
47 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
48 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
49 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
50 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
51 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
52 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
53 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
54 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
55 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
57 ; GFX89-LABEL: v_fdiv_f16:
59 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60 ; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
61 ; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
62 ; GFX89-NEXT: v_rcp_f32_e32 v2, v2
63 ; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
64 ; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
65 ; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
66 ; GFX89-NEXT: s_setpc_b64 s[30:31]
68 ; GFX10-LABEL: v_fdiv_f16:
70 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
71 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
72 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
73 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
74 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
75 ; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2
76 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
77 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
78 ; GFX10-NEXT: s_setpc_b64 s[30:31]
79 %fdiv = fdiv half %a, %b
; Scalar half fdiv with the afn fast-math flag. Both tahiti runs share one
; check block (convert to f32, v_rcp_f32, v_mul_f32, convert back), while the
; fiji/gfx900/gfx1010 checks expect v_rcp_f16 followed by v_mul_f16.
83 define half @v_fdiv_f16_afn(half %a, half %b) {
84 ; GFX6-LABEL: v_fdiv_f16_afn:
86 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
88 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
89 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1
90 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
91 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
92 ; GFX6-NEXT: s_setpc_b64 s[30:31]
94 ; GFX89-LABEL: v_fdiv_f16_afn:
96 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97 ; GFX89-NEXT: v_rcp_f16_e32 v1, v1
98 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
99 ; GFX89-NEXT: s_setpc_b64 s[30:31]
101 ; GFX10-LABEL: v_fdiv_f16_afn:
103 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
104 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
105 ; GFX10-NEXT: v_rcp_f16_e32 v1, v1
106 ; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
107 ; GFX10-NEXT: s_setpc_b64 s[30:31]
108 %fdiv = fdiv afn half %a, %b
; Scalar half fdiv carrying !fpmath !0 and no fast-math flags. The expected
; instruction sequence in every check block is the same as for v_fdiv_f16.
; NOTE(review): !0 is defined outside this view; the "ulp25" suffix suggests
; a 2.5 ULP accuracy bound - confirm against the metadata definition.
112 define half @v_fdiv_f16_ulp25(half %a, half %b) {
113 ; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25:
114 ; GFX6-IEEE: ; %bb.0:
115 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
117 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
118 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
119 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
120 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
121 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
122 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
123 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
124 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
125 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
126 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
127 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
128 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
129 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
130 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
132 ; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25:
133 ; GFX6-FLUSH: ; %bb.0:
134 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
136 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
137 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
138 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
139 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
140 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
141 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
142 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
143 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
144 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
145 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
146 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
147 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
148 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
149 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
150 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
151 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
152 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
154 ; GFX89-LABEL: v_fdiv_f16_ulp25:
156 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157 ; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
158 ; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
159 ; GFX89-NEXT: v_rcp_f32_e32 v2, v2
160 ; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
161 ; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
162 ; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
163 ; GFX89-NEXT: s_setpc_b64 s[30:31]
165 ; GFX10-LABEL: v_fdiv_f16_ulp25:
167 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
169 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
170 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
171 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
172 ; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2
173 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
174 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
175 ; GFX10-NEXT: s_setpc_b64 s[30:31]
176 %fdiv = fdiv half %a, %b, !fpmath !0
; Reciprocal written as "fdiv half 1.0, %x" with no fast-math flags. The
; constant numerator shows up in the checks as the literal 1.0 operand of
; v_cvt_f32_f16 (and, for fiji/gfx900/gfx1010, of v_div_fixup_f16); the
; surrounding sequence is the full division expansion.
180 define half @v_rcp_f16(half %x) {
181 ; GFX6-IEEE-LABEL: v_rcp_f16:
182 ; GFX6-IEEE: ; %bb.0:
183 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
184 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
185 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
186 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
187 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
188 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
189 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
190 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
191 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
192 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
193 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
194 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
195 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
196 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
197 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
198 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
200 ; GFX6-FLUSH-LABEL: v_rcp_f16:
201 ; GFX6-FLUSH: ; %bb.0:
202 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
203 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
204 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
205 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
206 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
207 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
208 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
209 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
210 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
211 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
212 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
213 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
214 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
215 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
216 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
217 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
218 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
219 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
220 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
222 ; GFX89-LABEL: v_rcp_f16:
224 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
225 ; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
226 ; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0
227 ; GFX89-NEXT: v_rcp_f32_e32 v1, v1
228 ; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
229 ; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
230 ; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
231 ; GFX89-NEXT: s_setpc_b64 s[30:31]
233 ; GFX10-LABEL: v_rcp_f16:
235 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
237 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
238 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0
239 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1
240 ; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1
241 ; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
242 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
243 ; GFX10-NEXT: s_setpc_b64 s[30:31]
244 %fdiv = fdiv half 1.0, %x
; Reciprocal "fdiv half 1.0, %x" with only the arcp fast-math flag. The
; expected instruction sequence in every check block is the same as for
; v_rcp_f16, i.e. arcp on its own does not change the checked output.
248 define half @v_rcp_f16_arcp(half %x) {
249 ; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
250 ; GFX6-IEEE: ; %bb.0:
251 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
253 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
254 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
255 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
256 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
257 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
258 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
259 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
260 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
261 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
262 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
263 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
264 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
265 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
266 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
268 ; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
269 ; GFX6-FLUSH: ; %bb.0:
270 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
272 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
273 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
274 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
275 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
276 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
277 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
278 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
279 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
280 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
281 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
282 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
283 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
284 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
285 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
286 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
287 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
288 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
290 ; GFX89-LABEL: v_rcp_f16_arcp:
292 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293 ; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
294 ; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0
295 ; GFX89-NEXT: v_rcp_f32_e32 v1, v1
296 ; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
297 ; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
298 ; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
299 ; GFX89-NEXT: s_setpc_b64 s[30:31]
301 ; GFX10-LABEL: v_rcp_f16_arcp:
303 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
305 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
306 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0
307 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1
308 ; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1
309 ; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
310 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
311 ; GFX10-NEXT: s_setpc_b64 s[30:31]
312 %fdiv = fdiv arcp half 1.0, %x
; Reciprocal "fdiv half 1.0, %x" with both arcp and afn. The fiji, gfx900 and
; gfx1010 checks expect a lone v_rcp_f16; the tahiti checks (shared by both
; denormal modes) expect convert-to-f32, v_rcp_f32, a multiply by the
; converted 1.0, and a convert back to f16.
316 define half @v_rcp_f16_arcp_afn(half %x) {
317 ; GFX6-LABEL: v_rcp_f16_arcp_afn:
319 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
320 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
321 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
322 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0
323 ; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
324 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
325 ; GFX6-NEXT: s_setpc_b64 s[30:31]
327 ; GFX89-LABEL: v_rcp_f16_arcp_afn:
329 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
330 ; GFX89-NEXT: v_rcp_f16_e32 v0, v0
331 ; GFX89-NEXT: s_setpc_b64 s[30:31]
333 ; GFX10-LABEL: v_rcp_f16_arcp_afn:
335 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
336 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
337 ; GFX10-NEXT: v_rcp_f16_e32 v0, v0
338 ; GFX10-NEXT: s_setpc_b64 s[30:31]
339 %fdiv = fdiv arcp afn half 1.0, %x
; Reciprocal "fdiv half 1.0, %x" with !fpmath !0 and no fast-math flags. The
; fiji, gfx900 and gfx1010 checks expect a lone v_rcp_f16, whereas the tahiti
; checks keep the full v_div_scale / v_div_fmas / v_div_fixup expansion.
; NOTE(review): !0 is defined outside this view - confirm the accuracy bound
; it encodes.
343 define half @v_rcp_f16_ulp25(half %x) {
344 ; GFX6-IEEE-LABEL: v_rcp_f16_ulp25:
345 ; GFX6-IEEE: ; %bb.0:
346 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
347 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
348 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
349 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
350 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
351 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
352 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
353 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
354 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
355 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
356 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
357 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
358 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
359 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
360 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
361 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
363 ; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25:
364 ; GFX6-FLUSH: ; %bb.0:
365 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
366 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
367 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
368 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
369 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
370 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
371 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
372 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
373 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
374 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
375 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
376 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
377 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
378 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
379 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
380 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
381 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
382 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
383 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
385 ; GFX89-LABEL: v_rcp_f16_ulp25:
387 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
388 ; GFX89-NEXT: v_rcp_f16_e32 v0, v0
389 ; GFX89-NEXT: s_setpc_b64 s[30:31]
391 ; GFX10-LABEL: v_rcp_f16_ulp25:
393 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
394 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
395 ; GFX10-NEXT: v_rcp_f16_e32 v0, v0
396 ; GFX10-NEXT: s_setpc_b64 s[30:31]
397 %fdiv = fdiv half 1.0, %x, !fpmath !0
; General half fdiv with afn combined with !fpmath !0. The expected
; instruction sequence in every check block is the same as for
; v_fdiv_f16_afn, so the metadata does not change the checked output here.
401 define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
402 ; GFX6-LABEL: v_fdiv_f16_afn_ulp25:
404 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
405 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
406 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
407 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1
408 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
409 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
410 ; GFX6-NEXT: s_setpc_b64 s[30:31]
412 ; GFX89-LABEL: v_fdiv_f16_afn_ulp25:
414 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
415 ; GFX89-NEXT: v_rcp_f16_e32 v1, v1
416 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
417 ; GFX89-NEXT: s_setpc_b64 s[30:31]
419 ; GFX10-LABEL: v_fdiv_f16_afn_ulp25:
421 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
422 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
423 ; GFX10-NEXT: v_rcp_f16_e32 v1, v1
424 ; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
425 ; GFX10-NEXT: s_setpc_b64 s[30:31]
426 %fdiv = fdiv afn half %a, %b, !fpmath !0
; General (non-reciprocal) half fdiv with arcp combined with !fpmath !0. The
; expected instruction sequence in every check block is the same as for
; v_fdiv_f16_ulp25, i.e. adding arcp does not change the checked output.
430 define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
431 ; GFX6-IEEE-LABEL: v_fdiv_f16_arcp_ulp25:
432 ; GFX6-IEEE: ; %bb.0:
433 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
435 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
436 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
437 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
438 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
439 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
440 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
441 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
442 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
443 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
444 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
445 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
446 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
447 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
448 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
450 ; GFX6-FLUSH-LABEL: v_fdiv_f16_arcp_ulp25:
451 ; GFX6-FLUSH: ; %bb.0:
452 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
453 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
454 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
455 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
456 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
457 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
458 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
459 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
460 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
461 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
462 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
463 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
464 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
465 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
466 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
467 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
468 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
469 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
470 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
472 ; GFX89-LABEL: v_fdiv_f16_arcp_ulp25:
474 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
475 ; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
476 ; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
477 ; GFX89-NEXT: v_rcp_f32_e32 v2, v2
478 ; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
479 ; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
480 ; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
481 ; GFX89-NEXT: s_setpc_b64 s[30:31]
483 ; GFX10-LABEL: v_fdiv_f16_arcp_ulp25:
485 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
487 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
488 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
489 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
490 ; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2
491 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
492 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
493 ; GFX10-NEXT: s_setpc_b64 s[30:31]
494 %fdiv = fdiv arcp half %a, %b, !fpmath !0
; <2 x half> fdiv with no fast-math flags and no !fpmath metadata.
; From this function on, fiji and gfx900 are checked with separate prefixes
; because the expected code differs in how the two 16-bit results are merged
; back into v0: fiji uses SDWA forms of v_lshlrev_b32 / v_or_b32, gfx900 uses
; v_lshlrev_b32 plus v_and_or_b32 with a 0xffff mask.
; The tahiti checks operate on the elements in separate registers (v0 with v2,
; then v1 with v3) and run the full f32 division sequence once per element.
498 define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
499 ; GFX6-IEEE-LABEL: v_fdiv_v2f16:
500 ; GFX6-IEEE: ; %bb.0:
501 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
502 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
503 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
504 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
505 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
506 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
507 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
508 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
509 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
510 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
511 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
512 ; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
513 ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
514 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
515 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
516 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
517 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
518 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
519 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
520 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
521 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
522 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
523 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
524 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
525 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
526 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
527 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
528 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
529 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
530 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
532 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16:
533 ; GFX6-FLUSH: ; %bb.0:
534 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
535 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
536 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
537 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
538 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
539 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
540 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
541 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
542 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
543 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
544 ; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
545 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
546 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
547 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
548 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
549 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
550 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
551 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
552 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
553 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
554 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
555 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
556 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
557 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
558 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
559 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
560 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
561 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
562 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
563 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
564 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
565 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
566 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
567 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
568 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
570 ; GFX8-LABEL: v_fdiv_v2f16:
572 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
573 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
574 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
575 ; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
576 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
577 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
578 ; GFX8-NEXT: v_rcp_f32_e32 v2, v2
579 ; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6
580 ; GFX8-NEXT: v_rcp_f32_e32 v5, v5
581 ; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
582 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
583 ; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
584 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
585 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
586 ; GFX8-NEXT: v_mov_b32_e32 v2, 16
587 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
588 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
589 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
590 ; GFX8-NEXT: s_setpc_b64 s[30:31]
592 ; GFX9-LABEL: v_fdiv_v2f16:
594 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
595 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
596 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
597 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
598 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
599 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
600 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2
601 ; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
602 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5
603 ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
604 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
605 ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
606 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
607 ; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
608 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
609 ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
610 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
611 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
612 ; GFX9-NEXT: s_setpc_b64 s[30:31]
614 ; GFX10-LABEL: v_fdiv_v2f16:
616 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
617 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
618 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
619 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1
620 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
621 ; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0
622 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
623 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4
624 ; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5
625 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
626 ; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4
627 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3
628 ; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
629 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
630 ; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
631 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
632 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
633 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
634 ; GFX10-NEXT: s_setpc_b64 s[30:31]
635 %fdiv = fdiv <2 x half> %a, %b
; <2 x half> fdiv with the afn fast-math flag. The fiji, gfx900 and gfx1010
; checks expect v_rcp_f16 / v_mul_f16 for the low half and the SDWA forms
; (src0_sel:WORD_1) for the high half, followed by a target-specific merge of
; the two halves into v0. The tahiti checks (shared by both denormal modes)
; expect two independent f32 rcp + mul sequences.
639 define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
640 ; GFX6-LABEL: v_fdiv_v2f16_afn:
642 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
643 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
644 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
645 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
646 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
647 ; GFX6-NEXT: v_rcp_f32_e32 v2, v2
648 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3
649 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
650 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
651 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
652 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
653 ; GFX6-NEXT: s_setpc_b64 s[30:31]
655 ; GFX8-LABEL: v_fdiv_v2f16_afn:
657 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
658 ; GFX8-NEXT: v_rcp_f16_e32 v2, v1
659 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
660 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
661 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
662 ; GFX8-NEXT: v_mov_b32_e32 v1, 16
663 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
664 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
665 ; GFX8-NEXT: s_setpc_b64 s[30:31]
667 ; GFX9-LABEL: v_fdiv_v2f16_afn:
669 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
670 ; GFX9-NEXT: v_rcp_f16_e32 v2, v1
671 ; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
672 ; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
673 ; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
674 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
675 ; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
676 ; GFX9-NEXT: s_setpc_b64 s[30:31]
678 ; GFX10-LABEL: v_fdiv_v2f16_afn:
680 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
681 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
682 ; GFX10-NEXT: v_rcp_f16_e32 v2, v1
683 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
684 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
685 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
686 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
687 ; GFX10-NEXT: s_setpc_b64 s[30:31]
688 %fdiv = fdiv afn <2 x half> %a, %b
692 define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
693 ; GFX6-IEEE-LABEL: v_fdiv_v2f16_ulp25:
694 ; GFX6-IEEE: ; %bb.0:
695 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
696 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
697 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
698 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
699 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
700 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
701 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
702 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
703 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
704 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
705 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
706 ; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
707 ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
708 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
709 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
710 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
711 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
712 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
713 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
714 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
715 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
716 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
717 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
718 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
719 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
720 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
721 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
722 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
723 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
724 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
726 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
727 ; GFX6-FLUSH: ; %bb.0:
728 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
729 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
730 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
731 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
732 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
733 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
734 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
735 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
736 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
737 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
738 ; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
739 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
740 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
741 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
742 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
743 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
744 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
745 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
746 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
747 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
748 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
749 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
750 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
751 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
752 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
753 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
754 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
755 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
756 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
757 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
758 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
759 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
760 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
761 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
762 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
764 ; GFX8-LABEL: v_fdiv_v2f16_ulp25:
766 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
767 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
768 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
769 ; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
770 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
771 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
772 ; GFX8-NEXT: v_rcp_f32_e32 v2, v2
773 ; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6
774 ; GFX8-NEXT: v_rcp_f32_e32 v5, v5
775 ; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
776 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
777 ; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
778 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
779 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
780 ; GFX8-NEXT: v_mov_b32_e32 v2, 16
781 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
782 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
783 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
784 ; GFX8-NEXT: s_setpc_b64 s[30:31]
786 ; GFX9-LABEL: v_fdiv_v2f16_ulp25:
788 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
789 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
790 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
791 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
792 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
793 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
794 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2
795 ; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
796 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5
797 ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
798 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
799 ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
800 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
801 ; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
802 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
803 ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
804 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
805 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
806 ; GFX9-NEXT: s_setpc_b64 s[30:31]
808 ; GFX10-LABEL: v_fdiv_v2f16_ulp25:
810 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
811 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
812 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
813 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1
814 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
815 ; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0
816 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
817 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4
818 ; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5
819 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
820 ; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4
821 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3
822 ; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
823 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
824 ; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
825 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
826 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
827 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
828 ; GFX10-NEXT: s_setpc_b64 s[30:31]
829 %fdiv = fdiv <2 x half> %a, %b, !fpmath !0
; Reciprocal of a <2 x half> written as a plain fdiv: <1.0, 1.0> / %x, with no
; fast-math flags and no !fpmath metadata on the instruction.
; Expected code, as pinned by the autogenerated checks below:
;  - GFX6 (IEEE and FLUSH variants): each element is converted to f32 and divided
;    with a v_div_scale / v_rcp_f32 / v_fma chain / v_div_fmas / v_div_fixup
;    sequence, then converted back with v_cvt_f16_f32. The FLUSH variant also
;    writes HW_REG_MODE with s_setreg_imm32_b32 around each v_fma chain
;    (presumably switching the f32 denormal mode; confirm against the ISA docs).
;  - GFX8, GFX9, GFX10: v_rcp_f32 + v_mul_f32 per element, rounded to f16 and
;    finished with v_div_fixup_f16 before the two halves are repacked into v0.
; Do not edit the check lines by hand; regenerate them with the update script.
833 define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
834 ; GFX6-IEEE-LABEL: v_rcp_v2f16:
835 ; GFX6-IEEE: ; %bb.0:
836 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
837 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
838 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
839 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
840 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
841 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
842 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
843 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
844 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
845 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
846 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
847 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
848 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
849 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
850 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
851 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
852 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
853 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
854 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
855 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
856 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
857 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
858 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
859 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
860 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
861 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
862 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
863 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
864 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
866 ; GFX6-FLUSH-LABEL: v_rcp_v2f16:
867 ; GFX6-FLUSH: ; %bb.0:
868 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
869 ; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00
870 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6
871 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
872 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
873 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
874 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
875 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
876 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
877 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
878 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
879 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
880 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
881 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
882 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
883 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
884 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
885 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
886 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
887 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
888 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
889 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
890 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
891 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
892 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
893 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
894 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
895 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
896 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
897 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
898 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
899 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
900 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
901 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
902 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
903 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
905 ; GFX8-LABEL: v_rcp_v2f16:
907 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
908 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
909 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
910 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
911 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
912 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1
913 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
914 ; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
915 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
916 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
917 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
918 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
919 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
920 ; GFX8-NEXT: v_mov_b32_e32 v2, 16
921 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
922 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
923 ; GFX8-NEXT: s_setpc_b64 s[30:31]
925 ; GFX9-LABEL: v_rcp_v2f16:
927 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
928 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
929 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
930 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
931 ; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
932 ; GFX9-NEXT: v_rcp_f32_e32 v1, v1
933 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3
934 ; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
935 ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
936 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
937 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
938 ; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
939 ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
940 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
941 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
942 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
943 ; GFX9-NEXT: s_setpc_b64 s[30:31]
945 ; GFX10-LABEL: v_rcp_v2f16:
947 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
948 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
949 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
950 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
951 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0
952 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
953 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
954 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
955 ; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
956 ; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
957 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
958 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
959 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
960 ; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
961 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
962 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
963 ; GFX10-NEXT: s_setpc_b64 s[30:31]
964 %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
; Reciprocal of a <2 x half> (<1.0, 1.0> / %x) where the fdiv carries only the
; 'arcp' fast-math flag and no !fpmath metadata.
; The checks below show that 'arcp' on its own does not select the bare
; v_rcp_f16 form on any tested target:
;  - GFX6 (IEEE and FLUSH variants) still use the f32 v_div_scale / v_rcp_f32 /
;    v_fma / v_div_fmas / v_div_fixup expansion per element; the FLUSH variant
;    brackets each v_fma chain with s_setreg_imm32_b32 writes to HW_REG_MODE.
;  - GFX8, GFX9, GFX10 still use v_rcp_f32 + v_mul_f32 followed by
;    v_div_fixup_f16, then repack the two results into v0.
; Do not edit the check lines by hand; regenerate them with the update script.
968 define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
969 ; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
970 ; GFX6-IEEE: ; %bb.0:
971 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
972 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
973 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
974 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
975 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
976 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
977 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
978 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
979 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
980 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
981 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
982 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
983 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
984 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
985 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
986 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
987 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
988 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
989 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
990 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
991 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
992 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
993 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
994 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
995 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
996 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
997 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
998 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
999 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
1001 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
1002 ; GFX6-FLUSH: ; %bb.0:
1003 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1004 ; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00
1005 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6
1006 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
1007 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
1008 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
1009 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
1010 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1011 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1012 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
1013 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
1014 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
1015 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
1016 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
1017 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1018 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1019 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
1020 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
1021 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
1022 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1023 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
1024 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
1025 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
1026 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
1027 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1028 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
1029 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
1030 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
1031 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
1032 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
1033 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
1034 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1035 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
1036 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
1037 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
1038 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
1040 ; GFX8-LABEL: v_rcp_v2f16_arcp:
1042 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1043 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1044 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
1045 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
1046 ; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1047 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1
1048 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3
1049 ; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
1050 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
1051 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
1052 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
1053 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
1054 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
1055 ; GFX8-NEXT: v_mov_b32_e32 v2, 16
1056 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1057 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1058 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1060 ; GFX9-LABEL: v_rcp_v2f16_arcp:
1062 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1063 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1064 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
1065 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
1066 ; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1067 ; GFX9-NEXT: v_rcp_f32_e32 v1, v1
1068 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3
1069 ; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
1070 ; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
1071 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
1072 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
1073 ; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
1074 ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
1075 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
1076 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1077 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
1078 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1080 ; GFX10-LABEL: v_rcp_v2f16_arcp:
1082 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1083 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1084 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1085 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
1086 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0
1087 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
1088 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
1089 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
1090 ; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
1091 ; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
1092 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
1093 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
1094 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
1095 ; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
1096 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1097 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
1098 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1099 %fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x
1100 ret <2 x half> %fdiv
; Reciprocal of a <2 x half> (<1.0, 1.0> / %x) where the fdiv carries both the
; 'arcp' and 'afn' fast-math flags.
; Expected code, as pinned by the autogenerated checks below:
;  - GFX6 (one shared check prefix for both denormal modes): convert to f32,
;    v_rcp_f32 each element, multiply by the converted 1.0 constant, convert back.
;    No v_div_scale / v_div_fixup sequence is emitted.
;  - GFX8, GFX9, GFX10: a bare v_rcp_f16 per element (the high half is reached
;    through an SDWA src0_sel:WORD_1 operand), then the halves are merged into v0.
; Do not edit the check lines by hand; regenerate them with the update script.
1103 define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
1104 ; GFX6-LABEL: v_rcp_v2f16_arcp_afn:
1106 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1107 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
1108 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
1109 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
1110 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0
1111 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1
1112 ; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
1113 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
1114 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
1115 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
1116 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1118 ; GFX8-LABEL: v_rcp_v2f16_arcp_afn:
1120 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1121 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1122 ; GFX8-NEXT: v_rcp_f16_e32 v0, v0
1123 ; GFX8-NEXT: v_mov_b32_e32 v2, 16
1124 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1125 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1126 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1128 ; GFX9-LABEL: v_rcp_v2f16_arcp_afn:
1130 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1131 ; GFX9-NEXT: v_rcp_f16_e32 v1, v0
1132 ; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
1133 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
1134 ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
1135 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1137 ; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
1139 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1140 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1141 ; GFX10-NEXT: v_rcp_f16_e32 v1, v0
1142 ; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
1143 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
1144 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1145 %fdiv = fdiv arcp afn <2 x half> <half 1.0, half 1.0>, %x
1146 ret <2 x half> %fdiv
; Reciprocal of a <2 x half> (<1.0, 1.0> / %x) with no fast-math flags but with
; '!fpmath !0' attached. The !0 node is defined outside this excerpt; the
; function name suggests a 2.5 ulp accuracy bound (TODO confirm against the
; metadata at the end of the file).
; Expected code, as pinned by the autogenerated checks below:
;  - GFX6 (IEEE and FLUSH variants): still the full f32 v_div_scale / v_rcp_f32 /
;    v_fma / v_div_fmas / v_div_fixup expansion per element; the FLUSH variant
;    brackets each v_fma chain with s_setreg_imm32_b32 writes to HW_REG_MODE.
;  - GFX8, GFX9, GFX10: a bare v_rcp_f16 per element with no v_div_fixup_f16,
;    followed by repacking of the two halves into v0.
; Do not edit the check lines by hand; regenerate them with the update script.
1149 define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
1150 ; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
1151 ; GFX6-IEEE: ; %bb.0:
1152 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1153 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
1154 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
1155 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
1156 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
1157 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1158 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
1159 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1160 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1161 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1162 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1163 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1164 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1165 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1166 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
1167 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
1168 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
1169 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
1170 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
1171 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1172 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1173 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1174 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
1175 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1176 ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
1177 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1178 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
1179 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1180 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
1182 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
1183 ; GFX6-FLUSH: ; %bb.0:
1184 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1185 ; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00
1186 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6
1187 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
1188 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
1189 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
1190 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
1191 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1192 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
1193 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
1194 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
1195 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
1196 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
1197 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
1198 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1199 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
1200 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
1201 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
1202 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
1203 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1204 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
1205 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
1206 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
1207 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
1208 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1209 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
1210 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
1211 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
1212 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
1213 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
1214 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
1215 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1216 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
1217 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
1218 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
1219 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
1221 ; GFX8-LABEL: v_rcp_v2f16_ulp25:
1223 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1224 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1225 ; GFX8-NEXT: v_rcp_f16_e32 v0, v0
1226 ; GFX8-NEXT: v_mov_b32_e32 v2, 16
1227 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1228 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1229 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1231 ; GFX9-LABEL: v_rcp_v2f16_ulp25:
1233 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1234 ; GFX9-NEXT: v_rcp_f16_e32 v1, v0
1235 ; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
1236 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
1237 ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
1238 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1240 ; GFX10-LABEL: v_rcp_v2f16_ulp25:
1242 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1243 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1244 ; GFX10-NEXT: v_rcp_f16_e32 v1, v0
1245 ; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
1246 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
1247 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1248 %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x, !fpmath !0
1249 ret <2 x half> %fdiv
; General <2 x half> division %a / %b where the fdiv carries the 'afn'
; fast-math flag together with '!fpmath !0'. The !0 node is defined outside this
; excerpt; the function name suggests a 2.5 ulp accuracy bound (TODO confirm).
; Expected code, as pinned by the autogenerated checks below:
;  - GFX6 (one shared check prefix for both denormal modes): convert all four
;    halves to f32, v_rcp_f32 the divisors, v_mul_f32 by the dividends, and
;    convert the results back. No v_div_scale / v_div_fixup sequence is emitted.
;  - GFX8, GFX9, GFX10: v_rcp_f16 on each divisor half followed by v_mul_f16
;    (SDWA forms reach the high halves), then the results are merged into v0.
; Do not edit the check lines by hand; regenerate them with the update script.
1252 define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
1253 ; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
1255 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1256 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
1257 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
1258 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
1259 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
1260 ; GFX6-NEXT: v_rcp_f32_e32 v2, v2
1261 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3
1262 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
1263 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
1264 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
1265 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
1266 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1268 ; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
1270 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1271 ; GFX8-NEXT: v_rcp_f16_e32 v2, v1
1272 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1273 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
1274 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1275 ; GFX8-NEXT: v_mov_b32_e32 v1, 16
1276 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1277 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1278 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1280 ; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
1282 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1283 ; GFX9-NEXT: v_rcp_f16_e32 v2, v1
1284 ; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1285 ; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
1286 ; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1287 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
1288 ; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
1289 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1291 ; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
1293 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1294 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1295 ; GFX10-NEXT: v_rcp_f16_e32 v2, v1
1296 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1297 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
1298 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1299 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
1300 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1301 %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
1302 ret <2 x half> %fdiv
; General <2 x half> division %a / %b where the fdiv carries the 'arcp'
; fast-math flag together with '!fpmath !0'. The !0 node is defined outside this
; excerpt; the function name suggests a 2.5 ulp accuracy bound (TODO confirm).
; Expected code, as pinned by the autogenerated checks below:
;  - GFX6 (IEEE and FLUSH variants): the full f32 v_div_scale / v_rcp_f32 /
;    v_fma / v_div_fmas / v_div_fixup expansion per element; the FLUSH variant
;    brackets each v_fma chain with s_setreg_imm32_b32 writes to HW_REG_MODE.
;  - GFX8, GFX9, GFX10: v_rcp_f32 + v_mul_f32 per element, rounded to f16 and
;    finished with v_div_fixup_f16 before the two halves are repacked into v0.
;    The shorter v_rcp_f16 + v_mul_f16 form is not used here.
; Do not edit the check lines by hand; regenerate them with the update script.
1305 define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
1306 ; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25:
1307 ; GFX6-IEEE: ; %bb.0:
1308 ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1309 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
1310 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
1311 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
1312 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
1313 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
1314 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
1315 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
1316 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
1317 ; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
1318 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
1319 ; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
1320 ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
1321 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
1322 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
1323 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
1324 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
1325 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
1326 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
1327 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
1328 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
1329 ; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
1330 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
1331 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
1332 ; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
1333 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
1334 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
1335 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
1336 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
1337 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
1339 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
1340 ; GFX6-FLUSH: ; %bb.0:
1341 ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1342 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
1343 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
1344 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
1345 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
1346 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
1347 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1348 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
1349 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
1350 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
1351 ; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
1352 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
1353 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
1354 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1355 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
1356 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
1357 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
1358 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
1359 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1360 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
1361 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
1362 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
1363 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
1364 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1365 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
1366 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
1367 ; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
1368 ; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
1369 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
1370 ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
1371 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1372 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
1373 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
1374 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
1375 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
1377 ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
1379 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1380 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
1381 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
1382 ; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
1383 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
1384 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
1385 ; GFX8-NEXT: v_rcp_f32_e32 v2, v2
1386 ; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6
1387 ; GFX8-NEXT: v_rcp_f32_e32 v5, v5
1388 ; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
1389 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
1390 ; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
1391 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
1392 ; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
1393 ; GFX8-NEXT: v_mov_b32_e32 v2, 16
1394 ; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
1395 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1396 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1397 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1399 ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
1401 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1402 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
1403 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
1404 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
1405 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
1406 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
1407 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2
1408 ; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
1409 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5
1410 ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
1411 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
1412 ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
1413 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
1414 ; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
1415 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
1416 ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
1417 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1418 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
1419 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1421 ; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
1423 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1424 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1425 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
1426 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1
1427 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
1428 ; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0
1429 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
1430 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4
1431 ; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5
1432 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
1433 ; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4
1434 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3
1435 ; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
1436 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
1437 ; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0
1438 ; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5
1439 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
1440 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
1441 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1442 %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0
1443 ret <2 x half> %fdiv
; Test: <2 x half> fdiv carrying both the `afn` and `arcp` fast-math flags
; together with the `!fpmath !0` attachment.
;
; What the expected sequences below show:
;   * GFX6 target - every lane is widened with v_cvt_f32_f16, divided as
;     v_rcp_f32 followed by v_mul_f32, then narrowed with v_cvt_f16_f32.
;   * GFX8, GFX9 and GFX10 targets - the low half uses v_rcp_f16 / v_mul_f16
;     and the high half uses the SDWA forms selecting WORD_1; the two results
;     are then repacked into v0 (v_lshlrev_b32_sdwa + v_or_b32_sdwa on GFX8,
;     v_and_or_b32 on GFX9 and GFX10).
;   * No v_div_fixup_f16 appears in any expected sequence for this function.
;
; The assertion lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1446 define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
1447 ; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
1449 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1450 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
1451 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
1452 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
1453 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
1454 ; GFX6-NEXT: v_rcp_f32_e32 v2, v2
1455 ; GFX6-NEXT: v_rcp_f32_e32 v3, v3
1456 ; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
1457 ; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
1458 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
1459 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
1460 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1462 ; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
1464 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1465 ; GFX8-NEXT: v_rcp_f16_e32 v2, v1
1466 ; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1467 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
1468 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1469 ; GFX8-NEXT: v_mov_b32_e32 v1, 16
1470 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1471 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1472 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1474 ; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
1476 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1477 ; GFX9-NEXT: v_rcp_f16_e32 v2, v1
1478 ; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1479 ; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
1480 ; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1481 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
1482 ; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
1483 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1485 ; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
1487 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1488 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1489 ; GFX10-NEXT: v_rcp_f16_e32 v2, v1
1490 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1491 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
1492 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1493 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
1494 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; The single instruction under test: both fast-math flags and the !fpmath
; attachment sit on this one fdiv.
1495 %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0
1496 ret <2 x half> %fdiv
; Metadata node referenced as `!fpmath !0` by the fdiv instructions in the
; "ulp25" tests above. Its single operand is the float 2.5; per the LLVM
; Language Reference, an fpmath operand states the maximum acceptable error of
; the annotated result, measured in ULPs.
1499 !0 = !{float 2.500000e+00}