1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
4 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=R600,EG %s
5 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=R600,CM %s
; Plain reciprocal: 1.0 / %x with no fast-math flags and no !fpmath metadata.
; The SI and VI checks both expect the full division expansion
; (two v_div_scale, v_rcp, a v_fma/v_mul refinement chain, v_div_fmas,
; v_div_fixup); the two targets differ in where v_rcp_f32 is placed and in
; register assignment. Attribute group #3 is defined outside this excerpt;
; the "_ieee" suffix suggests it selects IEEE denormal handling (confirm).
7 define float @v_rcp_f32_ieee(float %x) #3 {
8 ; SI-LABEL: v_rcp_f32_ieee:
10 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11 ; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
12 ; SI-NEXT: v_rcp_f32_e32 v2, v1
13 ; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
14 ; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
15 ; SI-NEXT: v_fma_f32 v2, v4, v2, v2
16 ; SI-NEXT: v_mul_f32_e32 v4, v3, v2
17 ; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
18 ; SI-NEXT: v_fma_f32 v4, v5, v2, v4
19 ; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
20 ; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
21 ; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
22 ; SI-NEXT: s_setpc_b64 s[30:31]
24 ; VI-LABEL: v_rcp_f32_ieee:
26 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
28 ; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
29 ; VI-NEXT: v_rcp_f32_e32 v3, v1
30 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
31 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
32 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
33 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
34 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
35 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
36 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
37 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
38 ; VI-NEXT: s_setpc_b64 s[30:31]
40 ; R600-LABEL: v_rcp_f32_ieee:
44 %rcp = fdiv float 1.0, %x
; Same IR as v_rcp_f32_ieee (1.0 / %x, no flags, no !fpmath) but under
; attribute group #4. The shared GCN checks expect the division to collapse to
; a single v_rcp_f32_e32. Group #4 is defined outside this excerpt; the
; "_unsafe" suffix suggests it enables unsafe FP math (confirm).
48 define float @v_rcp_f32_ieee_unsafe(float %x) #4 {
49 ; GCN-LABEL: v_rcp_f32_ieee_unsafe:
51 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52 ; GCN-NEXT: v_rcp_f32_e32 v0, v0
53 ; GCN-NEXT: s_setpc_b64 s[30:31]
55 ; R600-LABEL: v_rcp_f32_ieee_unsafe:
59 %rcp = fdiv float 1.0, %x
; 1.0 / %x where the argument is annotated nofpclass(sub), i.e. declared never
; subnormal. The SI and VI checks still expect the full v_div_scale /
; v_div_fmas / v_div_fixup expansion, instruction-for-instruction the same as
; for v_rcp_f32_ieee, so the annotation alone does not enable a bare v_rcp_f32.
63 define float @v_rcp_f32_ieee_known_not_denormal(float nofpclass(sub) %x) #3 {
64 ; SI-LABEL: v_rcp_f32_ieee_known_not_denormal:
66 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67 ; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
68 ; SI-NEXT: v_rcp_f32_e32 v2, v1
69 ; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
70 ; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
71 ; SI-NEXT: v_fma_f32 v2, v4, v2, v2
72 ; SI-NEXT: v_mul_f32_e32 v4, v3, v2
73 ; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
74 ; SI-NEXT: v_fma_f32 v4, v5, v2, v4
75 ; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
76 ; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
77 ; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
78 ; SI-NEXT: s_setpc_b64 s[30:31]
80 ; VI-LABEL: v_rcp_f32_ieee_known_not_denormal:
82 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
84 ; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
85 ; VI-NEXT: v_rcp_f32_e32 v3, v1
86 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
87 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
88 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
89 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
90 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
91 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
92 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
93 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
94 ; VI-NEXT: s_setpc_b64 s[30:31]
96 ; R600-LABEL: v_rcp_f32_ieee_known_not_denormal:
100 %rcp = fdiv float 1.0, %x
; 1.0 / %x with the nnan and ninf fast-math flags on the fdiv (no !fpmath).
; The SI and VI checks still expect the full division expansion, matching the
; unflagged v_rcp_f32_ieee sequence, so these two flags by themselves do not
; change the expected lowering.
104 define float @v_rcp_f32_ieee_nnan_ninf(float %x) #3 {
105 ; SI-LABEL: v_rcp_f32_ieee_nnan_ninf:
107 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108 ; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
109 ; SI-NEXT: v_rcp_f32_e32 v2, v1
110 ; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
111 ; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
112 ; SI-NEXT: v_fma_f32 v2, v4, v2, v2
113 ; SI-NEXT: v_mul_f32_e32 v4, v3, v2
114 ; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
115 ; SI-NEXT: v_fma_f32 v4, v5, v2, v4
116 ; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
117 ; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
118 ; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
119 ; SI-NEXT: s_setpc_b64 s[30:31]
121 ; VI-LABEL: v_rcp_f32_ieee_nnan_ninf:
123 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
124 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
125 ; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
126 ; VI-NEXT: v_rcp_f32_e32 v3, v1
127 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
128 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
129 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
130 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
131 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
132 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
133 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
134 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
135 ; VI-NEXT: s_setpc_b64 s[30:31]
137 ; R600-LABEL: v_rcp_f32_ieee_nnan_ninf:
141 %rcp = fdiv nnan ninf float 1.0, %x
; Negated reciprocal: -1.0 / %x with no flags and no !fpmath. The expected
; output is the full division expansion with -1.0 used as the numerator
; operand of both v_div_scale instructions and of v_div_fixup; the refinement
; chain in between still uses the constant 1.0 in its first v_fma.
145 define float @v_neg_rcp_f32_ieee(float %x) #3 {
146 ; SI-LABEL: v_neg_rcp_f32_ieee:
148 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149 ; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
150 ; SI-NEXT: v_rcp_f32_e32 v2, v1
151 ; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
152 ; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
153 ; SI-NEXT: v_fma_f32 v2, v4, v2, v2
154 ; SI-NEXT: v_mul_f32_e32 v4, v3, v2
155 ; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
156 ; SI-NEXT: v_fma_f32 v4, v5, v2, v4
157 ; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
158 ; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
159 ; SI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
160 ; SI-NEXT: s_setpc_b64 s[30:31]
162 ; VI-LABEL: v_neg_rcp_f32_ieee:
164 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
166 ; VI-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
167 ; VI-NEXT: v_rcp_f32_e32 v3, v1
168 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
169 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
170 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
171 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
172 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
173 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
174 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
175 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
176 ; VI-NEXT: s_setpc_b64 s[30:31]
178 ; R600-LABEL: v_neg_rcp_f32_ieee:
182 %rcp = fdiv float -1.0, %x
; 1.0 / %x with no flags and no !fpmath, under attribute group #0. The
; expected output is the full division expansion with the v_fma/v_mul chain
; bracketed by two s_setreg_imm32_b32 writes to hwreg(HW_REG_MODE, 4, 2):
; value 3 before the chain and value 0 after it, ahead of v_div_fmas.
; Group #0 is defined outside this excerpt; the "_daz" suffix and the mode
; writes suggest FP32 denormals are flushed by default and temporarily enabled
; for the refinement steps (confirm).
186 define float @v_rcp_f32_daz(float %x) #0 {
187 ; SI-LABEL: v_rcp_f32_daz:
189 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190 ; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
191 ; SI-NEXT: v_rcp_f32_e32 v2, v1
192 ; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
193 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
194 ; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
195 ; SI-NEXT: v_fma_f32 v2, v4, v2, v2
196 ; SI-NEXT: v_mul_f32_e32 v4, v3, v2
197 ; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
198 ; SI-NEXT: v_fma_f32 v4, v5, v2, v4
199 ; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
200 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
201 ; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
202 ; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
203 ; SI-NEXT: s_setpc_b64 s[30:31]
205 ; VI-LABEL: v_rcp_f32_daz:
207 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
208 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
209 ; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
210 ; VI-NEXT: v_rcp_f32_e32 v3, v1
211 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
212 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
213 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
214 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
215 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
216 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
217 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
218 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
219 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
220 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
221 ; VI-NEXT: s_setpc_b64 s[30:31]
223 ; R600-LABEL: v_rcp_f32_daz:
227 %rcp = fdiv float 1.0, %x
; Negated reciprocal under attribute group #0: -1.0 / %x, no flags, no
; !fpmath. Expected output is the full division expansion with -1.0 as the
; numerator in v_div_scale and v_div_fixup, and with the v_fma/v_mul chain
; bracketed by s_setreg_imm32_b32 writes (3, then 0) to
; hwreg(HW_REG_MODE, 4, 2). Group #0 is defined outside this excerpt; the
; mode writes presumably toggle FP32 denormal handling (confirm).
231 define float @v_neg_rcp_f32_daz(float %x) #0 {
232 ; SI-LABEL: v_neg_rcp_f32_daz:
234 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235 ; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
236 ; SI-NEXT: v_rcp_f32_e32 v2, v1
237 ; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
238 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
239 ; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0
240 ; SI-NEXT: v_fma_f32 v2, v4, v2, v2
241 ; SI-NEXT: v_mul_f32_e32 v4, v3, v2
242 ; SI-NEXT: v_fma_f32 v5, -v1, v4, v3
243 ; SI-NEXT: v_fma_f32 v4, v5, v2, v4
244 ; SI-NEXT: v_fma_f32 v1, -v1, v4, v3
245 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
246 ; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4
247 ; SI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
248 ; SI-NEXT: s_setpc_b64 s[30:31]
250 ; VI-LABEL: v_neg_rcp_f32_daz:
252 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
253 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
254 ; VI-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
255 ; VI-NEXT: v_rcp_f32_e32 v3, v1
256 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
257 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
258 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
259 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
260 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
261 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
262 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
263 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
264 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
265 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
266 ; VI-NEXT: s_setpc_b64 s[30:31]
268 ; R600-LABEL: v_neg_rcp_f32_daz:
272 %rcp = fdiv float -1.0, %x
; 1.0 / %x carrying !fpmath !0 under attribute group #3. Instead of the full
; division expansion, the checks expect a range-reduced reciprocal: take the
; mantissa with v_frexp_mant, apply v_rcp_f32 to it, then rescale with
; v_ldexp by the negated v_frexp_exp exponent. The SI checks additionally
; compare |x| against 0x7f800000 and use v_cndmask to feed the original value
; to v_rcp_f32 when the compare fails; the VI checks have no such select.
; !0 is defined outside this excerpt; "ulp25" suggests a 2.5 ulp accuracy
; bound (confirm).
276 define float @v_rcp_f32_ieee_ulp25(float %x) #3 {
277 ; SI-LABEL: v_rcp_f32_ieee_ulp25:
279 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280 ; SI-NEXT: s_mov_b32 s4, 0x7f800000
281 ; SI-NEXT: v_frexp_mant_f32_e32 v1, v0
282 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
283 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
284 ; SI-NEXT: v_rcp_f32_e32 v1, v1
285 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
286 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
287 ; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
288 ; SI-NEXT: s_setpc_b64 s[30:31]
290 ; VI-LABEL: v_rcp_f32_ieee_ulp25:
292 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293 ; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
294 ; VI-NEXT: v_rcp_f32_e32 v1, v1
295 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
296 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
297 ; VI-NEXT: v_ldexp_f32 v0, v1, v0
298 ; VI-NEXT: s_setpc_b64 s[30:31]
300 ; R600-LABEL: v_rcp_f32_ieee_ulp25:
304 %rcp = fdiv float 1.0, %x, !fpmath !0
; 1.0 / %x with !fpmath !0 where the argument is annotated nofpclass(sub).
; The expected SI and VI sequences are the same frexp_mant / rcp / frexp_exp /
; ldexp pattern checked for v_rcp_f32_ieee_ulp25, so the no-subnormal
; annotation does not remove the range reduction in the expected output.
308 define float @v_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 {
309 ; SI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
311 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312 ; SI-NEXT: s_mov_b32 s4, 0x7f800000
313 ; SI-NEXT: v_frexp_mant_f32_e32 v1, v0
314 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
315 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
316 ; SI-NEXT: v_rcp_f32_e32 v1, v1
317 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
318 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
319 ; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
320 ; SI-NEXT: s_setpc_b64 s[30:31]
322 ; VI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
324 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325 ; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
326 ; VI-NEXT: v_rcp_f32_e32 v1, v1
327 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
328 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
329 ; VI-NEXT: v_ldexp_f32 v0, v1, v0
330 ; VI-NEXT: s_setpc_b64 s[30:31]
332 ; R600-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
336 %rcp = fdiv float 1.0, %x, !fpmath !0
; -1.0 / %x with !fpmath !0 and a nofpclass(sub) argument. The expected output
; is the frexp_mant / rcp / frexp_exp / ldexp pattern with the negation folded
; into a source modifier: v_frexp_mant reads -v0 (and so does the SI-only
; v_cndmask), while v_frexp_exp reads v0 unmodified.
340 define float @v_neg_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 {
341 ; SI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
343 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344 ; SI-NEXT: s_mov_b32 s4, 0x7f800000
345 ; SI-NEXT: v_frexp_mant_f32_e64 v1, -v0
346 ; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
347 ; SI-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
348 ; SI-NEXT: v_rcp_f32_e32 v1, v1
349 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
350 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
351 ; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
352 ; SI-NEXT: s_setpc_b64 s[30:31]
354 ; VI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
356 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
357 ; VI-NEXT: v_frexp_mant_f32_e64 v1, -v0
358 ; VI-NEXT: v_rcp_f32_e32 v1, v1
359 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
360 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
361 ; VI-NEXT: v_ldexp_f32 v0, v1, v0
362 ; VI-NEXT: s_setpc_b64 s[30:31]
364 ; R600-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
368 %rcp = fdiv float -1.0, %x, !fpmath !0
; 1.0 / %x with !fpmath !0 plus the ninf and nnan flags on the fdiv. The
; expected output is the same frexp_mant / rcp / frexp_exp / ldexp pattern as
; without the flags; in particular the SI checks still contain the compare
; against 0x7f800000 and the v_cndmask select.
372 define float @v_rcp_f32_ieee_ulp25_ninf_nnan(float %x) #3 {
373 ; SI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
375 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376 ; SI-NEXT: s_mov_b32 s4, 0x7f800000
377 ; SI-NEXT: v_frexp_mant_f32_e32 v1, v0
378 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
379 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
380 ; SI-NEXT: v_rcp_f32_e32 v1, v1
381 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
382 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
383 ; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
384 ; SI-NEXT: s_setpc_b64 s[30:31]
386 ; VI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
388 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
389 ; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
390 ; VI-NEXT: v_rcp_f32_e32 v1, v1
391 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
392 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
393 ; VI-NEXT: v_ldexp_f32 v0, v1, v0
394 ; VI-NEXT: s_setpc_b64 s[30:31]
396 ; R600-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
400 %rcp = fdiv ninf nnan float 1.0, %x, !fpmath !0
; 1.0 / %x with !fpmath !0 under attribute group #0. The shared GCN checks
; expect a single v_rcp_f32_e32 with no range reduction and no mode-register
; writes. Group #0 and !0 are defined outside this excerpt; the name suggests
; flushed denormals combined with a relaxed accuracy bound (confirm).
404 define float @v_rcp_f32_daz_ulp25(float %x) #0 {
405 ; GCN-LABEL: v_rcp_f32_daz_ulp25:
407 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
408 ; GCN-NEXT: v_rcp_f32_e32 v0, v0
409 ; GCN-NEXT: s_setpc_b64 s[30:31]
411 ; R600-LABEL: v_rcp_f32_daz_ulp25:
415 %rcp = fdiv float 1.0, %x, !fpmath !0
; -1.0 / %x with !fpmath !0 under attribute group #3. The expected output is
; the frexp_mant / rcp / frexp_exp / ldexp pattern with the negation applied
; as a -v0 source modifier on v_frexp_mant (and on the SI-only v_cndmask);
; v_frexp_exp reads v0 unmodified.
419 define float @v_neg_rcp_f32_ieee_ulp25(float %x) #3 {
420 ; SI-LABEL: v_neg_rcp_f32_ieee_ulp25:
422 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
423 ; SI-NEXT: s_mov_b32 s4, 0x7f800000
424 ; SI-NEXT: v_frexp_mant_f32_e64 v1, -v0
425 ; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
426 ; SI-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
427 ; SI-NEXT: v_rcp_f32_e32 v1, v1
428 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
429 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
430 ; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
431 ; SI-NEXT: s_setpc_b64 s[30:31]
433 ; VI-LABEL: v_neg_rcp_f32_ieee_ulp25:
435 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
436 ; VI-NEXT: v_frexp_mant_f32_e64 v1, -v0
437 ; VI-NEXT: v_rcp_f32_e32 v1, v1
438 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
439 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
440 ; VI-NEXT: v_ldexp_f32 v0, v1, v0
441 ; VI-NEXT: s_setpc_b64 s[30:31]
443 ; R600-LABEL: v_neg_rcp_f32_ieee_ulp25:
447 %rcp = fdiv float -1.0, %x, !fpmath !0
; -1.0 / %x with !fpmath !0 under attribute group #0. The shared GCN checks
; expect one v_rcp_f32_e64 whose source carries the negation as a modifier
; (-v0), i.e. the sign is folded into the reciprocal's operand rather than
; emitted as a separate instruction.
451 define float @v_neg_rcp_f32_daz_ulp25(float %x) #0 {
452 ; GCN-LABEL: v_neg_rcp_f32_daz_ulp25:
454 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
455 ; GCN-NEXT: v_rcp_f32_e64 v0, -v0
456 ; GCN-NEXT: s_setpc_b64 s[30:31]
458 ; R600-LABEL: v_neg_rcp_f32_daz_ulp25:
462 %rcp = fdiv float -1.0, %x, !fpmath !0
; 1.0 / fabs(%x) with no flags and no !fpmath, under attribute group #3.
; The expected output materialises |x| into v1 with v_and_b32 0x7fffffff for
; the two v_div_scale instructions, runs the usual v_fma/v_mul refinement
; chain, and finishes with v_div_fixup reading the original register through
; an |v0| source modifier.
466 define float @v_rcp_fabs_f32_ieee(float %x) #3 {
467 ; SI-LABEL: v_rcp_fabs_f32_ieee:
469 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470 ; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
471 ; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
472 ; SI-NEXT: v_rcp_f32_e32 v3, v2
473 ; SI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
474 ; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
475 ; SI-NEXT: v_fma_f32 v3, v4, v3, v3
476 ; SI-NEXT: v_mul_f32_e32 v4, v1, v3
477 ; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
478 ; SI-NEXT: v_fma_f32 v4, v5, v3, v4
479 ; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
480 ; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
481 ; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
482 ; SI-NEXT: s_setpc_b64 s[30:31]
484 ; VI-LABEL: v_rcp_fabs_f32_ieee:
486 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
487 ; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
488 ; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
489 ; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
490 ; VI-NEXT: v_rcp_f32_e32 v3, v2
491 ; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
492 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
493 ; VI-NEXT: v_mul_f32_e32 v4, v1, v3
494 ; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
495 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
496 ; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
497 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
498 ; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
499 ; VI-NEXT: s_setpc_b64 s[30:31]
501 ; R600-LABEL: v_rcp_fabs_f32_ieee:
505 %fabs.x = call float @llvm.fabs.f32(float %x)
506 %rcp = fdiv float 1.0, %fabs.x
; 1.0 / fabs(%x) with no flags and no !fpmath, under attribute group #0.
; Expected output is the full division expansion on |x| (v_and_b32 0x7fffffff
; for v_div_scale, |v0| modifier on v_div_fixup) with the v_fma/v_mul chain
; bracketed by s_setreg_imm32_b32 writes (3, then 0) to
; hwreg(HW_REG_MODE, 4, 2). Group #0 is defined outside this excerpt; the
; mode writes presumably toggle FP32 denormal handling (confirm).
510 define float @v_rcp_fabs_f32_daz(float %x) #0 {
511 ; SI-LABEL: v_rcp_fabs_f32_daz:
513 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
514 ; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
515 ; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
516 ; SI-NEXT: v_rcp_f32_e32 v3, v2
517 ; SI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
518 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
519 ; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
520 ; SI-NEXT: v_fma_f32 v3, v4, v3, v3
521 ; SI-NEXT: v_mul_f32_e32 v4, v1, v3
522 ; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
523 ; SI-NEXT: v_fma_f32 v4, v5, v3, v4
524 ; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
525 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
526 ; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
527 ; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
528 ; SI-NEXT: s_setpc_b64 s[30:31]
530 ; VI-LABEL: v_rcp_fabs_f32_daz:
532 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
533 ; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
534 ; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
535 ; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
536 ; VI-NEXT: v_rcp_f32_e32 v3, v2
537 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
538 ; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
539 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
540 ; VI-NEXT: v_mul_f32_e32 v4, v1, v3
541 ; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
542 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
543 ; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
544 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
545 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
546 ; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0
547 ; VI-NEXT: s_setpc_b64 s[30:31]
549 ; R600-LABEL: v_rcp_fabs_f32_daz:
553 %fabs.x = call float @llvm.fabs.f32(float %x)
554 %rcp = fdiv float 1.0, %fabs.x
; 1.0 / fabs(%x) with !fpmath !0 under attribute group #3. The expected
; output is the frexp_mant / rcp / frexp_exp / ldexp pattern with the fabs
; folded into an |v0| source modifier on v_frexp_mant (and on the SI-only
; v_cndmask); v_frexp_exp reads v0 without a modifier.
558 define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 {
559 ; SI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
561 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
562 ; SI-NEXT: s_mov_b32 s4, 0x7f800000
563 ; SI-NEXT: v_frexp_mant_f32_e64 v1, |v0|
564 ; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
565 ; SI-NEXT: v_cndmask_b32_e64 v1, |v0|, v1, s[4:5]
566 ; SI-NEXT: v_rcp_f32_e32 v1, v1
567 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
568 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
569 ; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
570 ; SI-NEXT: s_setpc_b64 s[30:31]
572 ; VI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
574 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
575 ; VI-NEXT: v_frexp_mant_f32_e64 v1, |v0|
576 ; VI-NEXT: v_rcp_f32_e32 v1, v1
577 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
578 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
579 ; VI-NEXT: v_ldexp_f32 v0, v1, v0
580 ; VI-NEXT: s_setpc_b64 s[30:31]
582 ; R600-LABEL: v_rcp_fabs_f32_ieee_ulp25:
586 %fabs.x = call float @llvm.fabs.f32(float %x)
587 %rcp = fdiv float 1.0, %fabs.x, !fpmath !0
; 1.0 / fabs(%x) with !fpmath !0 under attribute group #0. The shared GCN
; checks expect one v_rcp_f32_e64 with the fabs folded in as an |v0| source
; modifier, so no separate v_and_b32 is expected.
591 define float @v_rcp_fabs_f32_daz_ulp25(float %x) #0 {
592 ; GCN-LABEL: v_rcp_fabs_f32_daz_ulp25:
594 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
595 ; GCN-NEXT: v_rcp_f32_e64 v0, |v0|
596 ; GCN-NEXT: s_setpc_b64 s[30:31]
598 ; R600-LABEL: v_rcp_fabs_f32_daz_ulp25:
602 %fabs.x = call float @llvm.fabs.f32(float %x)
603 %rcp = fdiv float 1.0, %fabs.x, !fpmath !0
; -1.0 / fabs(%x) with no flags and no !fpmath, under attribute group #3.
; The expected output is the full division expansion where |x| is
; materialised with v_and_b32 0x7fffffff, -1.0 is the numerator operand of
; both v_div_scale instructions and of v_div_fixup, and v_div_fixup reads the
; denominator through an |v0| source modifier.
607 define float @v_rcp_neg_fabs_f32_ieee(float %x) #3 {
608 ; SI-LABEL: v_rcp_neg_fabs_f32_ieee:
610 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
611 ; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
612 ; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
613 ; SI-NEXT: v_rcp_f32_e32 v3, v2
614 ; SI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
615 ; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
616 ; SI-NEXT: v_fma_f32 v3, v4, v3, v3
617 ; SI-NEXT: v_mul_f32_e32 v4, v1, v3
618 ; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
619 ; SI-NEXT: v_fma_f32 v4, v5, v3, v4
620 ; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
621 ; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
622 ; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
623 ; SI-NEXT: s_setpc_b64 s[30:31]
625 ; VI-LABEL: v_rcp_neg_fabs_f32_ieee:
627 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628 ; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
629 ; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
630 ; VI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
631 ; VI-NEXT: v_rcp_f32_e32 v3, v2
632 ; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
633 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
634 ; VI-NEXT: v_mul_f32_e32 v4, v1, v3
635 ; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
636 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
637 ; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
638 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
639 ; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
640 ; VI-NEXT: s_setpc_b64 s[30:31]
642 ; R600-LABEL: v_rcp_neg_fabs_f32_ieee:
646 %fabs.x = call float @llvm.fabs.f32(float %x)
647 %rcp = fdiv float -1.0, %fabs.x
; -1.0 / fabs(%x) with no flags and no !fpmath, under attribute group #0.
; Expected output is the full division expansion on |x| with -1.0 as the
; numerator, and with the v_fma/v_mul chain bracketed by s_setreg_imm32_b32
; writes (3, then 0) to hwreg(HW_REG_MODE, 4, 2). Group #0 is defined outside
; this excerpt; the mode writes presumably toggle FP32 denormal handling
; (confirm).
651 define float @v_rcp_neg_fabs_f32_daz(float %x) #0 {
652 ; SI-LABEL: v_rcp_neg_fabs_f32_daz:
654 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
655 ; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
656 ; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
657 ; SI-NEXT: v_rcp_f32_e32 v3, v2
658 ; SI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
659 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
660 ; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
661 ; SI-NEXT: v_fma_f32 v3, v4, v3, v3
662 ; SI-NEXT: v_mul_f32_e32 v4, v1, v3
663 ; SI-NEXT: v_fma_f32 v5, -v2, v4, v1
664 ; SI-NEXT: v_fma_f32 v4, v5, v3, v4
665 ; SI-NEXT: v_fma_f32 v1, -v2, v4, v1
666 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
667 ; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
668 ; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
669 ; SI-NEXT: s_setpc_b64 s[30:31]
671 ; VI-LABEL: v_rcp_neg_fabs_f32_daz:
673 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
674 ; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
675 ; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
676 ; VI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
677 ; VI-NEXT: v_rcp_f32_e32 v3, v2
678 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
679 ; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0
680 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
681 ; VI-NEXT: v_mul_f32_e32 v4, v1, v3
682 ; VI-NEXT: v_fma_f32 v5, -v2, v4, v1
683 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
684 ; VI-NEXT: v_fma_f32 v1, -v2, v4, v1
685 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
686 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
687 ; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0
688 ; VI-NEXT: s_setpc_b64 s[30:31]
690 ; R600-LABEL: v_rcp_neg_fabs_f32_daz:
694 %fabs.x = call float @llvm.fabs.f32(float %x)
695 %rcp = fdiv float -1.0, %fabs.x
; -1.0 / fabs(%x) with !fpmath !0 under attribute group #3. The expected
; output is the frexp_mant / rcp / frexp_exp / ldexp pattern with both the
; negation and the fabs folded into a single -|v0| source modifier on
; v_frexp_mant (and on the SI-only v_cndmask); v_frexp_exp reads v0
; unmodified.
699 define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 {
700 ; SI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
702 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703 ; SI-NEXT: s_mov_b32 s4, 0x7f800000
704 ; SI-NEXT: v_frexp_mant_f32_e64 v1, -|v0|
705 ; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
706 ; SI-NEXT: v_cndmask_b32_e64 v1, -|v0|, v1, s[4:5]
707 ; SI-NEXT: v_rcp_f32_e32 v1, v1
708 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
709 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
710 ; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0
711 ; SI-NEXT: s_setpc_b64 s[30:31]
713 ; VI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
715 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
716 ; VI-NEXT: v_frexp_mant_f32_e64 v1, -|v0|
717 ; VI-NEXT: v_rcp_f32_e32 v1, v1
718 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
719 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0
720 ; VI-NEXT: v_ldexp_f32 v0, v1, v0
721 ; VI-NEXT: s_setpc_b64 s[30:31]
723 ; R600-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
727 %fabs.x = call float @llvm.fabs.f32(float %x)
728 %rcp = fdiv float -1.0, %fabs.x, !fpmath !0
; -1.0 / fabs(%x) with !fpmath !0 under attribute group #0. The shared GCN
; checks expect one v_rcp_f32_e64 with negation and fabs both folded into the
; -|v0| source modifier.
732 define float @v_rcp_neg_fabs_f32_daz_ulp25(float %x) #0 {
733 ; GCN-LABEL: v_rcp_neg_fabs_f32_daz_ulp25:
735 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
736 ; GCN-NEXT: v_rcp_f32_e64 v0, -|v0|
737 ; GCN-NEXT: s_setpc_b64 s[30:31]
739 ; R600-LABEL: v_rcp_neg_fabs_f32_daz_ulp25:
743 %fabs.x = call float @llvm.fabs.f32(float %x)
744 %rcp = fdiv float -1.0, %fabs.x, !fpmath !0
; Kernel form: computes 1.0 / %src (with !fpmath !0) and stores the result to
; %out. On SI and VI the source is loaded with s_load_dword and fed straight
; to a single v_rcp_f32_e32 from the scalar register. The EG checks expect a
; single RECIP_IEEE; the Cayman checks expect RECIP_IEEE in four slots with
; the Y, Z and W results marked MASKED.
; NOTE(review): the name has no "ulp25", yet the fdiv carries !fpmath !0, so
; the visible IR is the same as in s_rcp_ulp25_pat_f32_daz - confirm this
; duplication is intended.
748 define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
749 ; SI-LABEL: s_rcp_pat_f32_daz:
751 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
752 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
753 ; SI-NEXT: s_mov_b32 s3, 0xf000
754 ; SI-NEXT: s_waitcnt lgkmcnt(0)
755 ; SI-NEXT: v_rcp_f32_e32 v0, s2
756 ; SI-NEXT: s_mov_b32 s2, -1
757 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
760 ; VI-LABEL: s_rcp_pat_f32_daz:
762 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
763 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
764 ; VI-NEXT: s_waitcnt lgkmcnt(0)
765 ; VI-NEXT: v_rcp_f32_e32 v2, s2
766 ; VI-NEXT: v_mov_b32_e32 v0, s0
767 ; VI-NEXT: v_mov_b32_e32 v1, s1
768 ; VI-NEXT: flat_store_dword v[0:1], v2
771 ; EG-LABEL: s_rcp_pat_f32_daz:
773 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
774 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
777 ; EG-NEXT: ALU clause starting at 4:
778 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
779 ; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
780 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
782 ; CM-LABEL: s_rcp_pat_f32_daz:
784 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
785 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
788 ; CM-NEXT: ALU clause starting at 4:
789 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
790 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
791 ; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
792 ; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
793 ; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
794 ; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
795 %rcp = fdiv float 1.0, %src, !fpmath !0
796 store float %rcp, ptr addrspace(1) %out, align 4
; Kernel form: computes 1.0 / %src with !fpmath !0 and no fast-math flags,
; then stores to %out. All four check configurations expect a bare hardware
; reciprocal: one v_rcp_f32_e32 from the loaded scalar on SI and VI, one
; RECIP_IEEE on EG, and RECIP_IEEE in four slots (Y, Z, W marked MASKED) on
; Cayman.
800 define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
801 ; SI-LABEL: s_rcp_ulp25_pat_f32_daz:
803 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
804 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
805 ; SI-NEXT: s_mov_b32 s3, 0xf000
806 ; SI-NEXT: s_waitcnt lgkmcnt(0)
807 ; SI-NEXT: v_rcp_f32_e32 v0, s2
808 ; SI-NEXT: s_mov_b32 s2, -1
809 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
812 ; VI-LABEL: s_rcp_ulp25_pat_f32_daz:
814 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
815 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
816 ; VI-NEXT: s_waitcnt lgkmcnt(0)
817 ; VI-NEXT: v_rcp_f32_e32 v2, s2
818 ; VI-NEXT: v_mov_b32_e32 v0, s0
819 ; VI-NEXT: v_mov_b32_e32 v1, s1
820 ; VI-NEXT: flat_store_dword v[0:1], v2
823 ; EG-LABEL: s_rcp_ulp25_pat_f32_daz:
825 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
826 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
829 ; EG-NEXT: ALU clause starting at 4:
830 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
831 ; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
832 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
834 ; CM-LABEL: s_rcp_ulp25_pat_f32_daz:
836 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
837 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
840 ; CM-NEXT: ALU clause starting at 4:
841 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
842 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
843 ; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
844 ; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
845 ; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
846 ; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
847 %rcp = fdiv float 1.0, %src, !fpmath !0
848 store float %rcp, ptr addrspace(1) %out, align 4
; Kernel form: computes 1.0 / %src with the "fast" flag on the fdiv plus
; !fpmath !0, then stores to %out. The expected output on every configuration
; is the same bare hardware reciprocal as checked for the unflagged
; s_rcp_ulp25_pat_f32_daz (v_rcp_f32_e32 on SI and VI, RECIP_IEEE on EG and
; Cayman).
852 define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
853 ; SI-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
855 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
856 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
857 ; SI-NEXT: s_mov_b32 s3, 0xf000
858 ; SI-NEXT: s_waitcnt lgkmcnt(0)
859 ; SI-NEXT: v_rcp_f32_e32 v0, s2
860 ; SI-NEXT: s_mov_b32 s2, -1
861 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
864 ; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
866 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
867 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
868 ; VI-NEXT: s_waitcnt lgkmcnt(0)
869 ; VI-NEXT: v_rcp_f32_e32 v2, s2
870 ; VI-NEXT: v_mov_b32_e32 v0, s0
871 ; VI-NEXT: v_mov_b32_e32 v1, s1
872 ; VI-NEXT: flat_store_dword v[0:1], v2
875 ; EG-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
877 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
878 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
881 ; EG-NEXT: ALU clause starting at 4:
882 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
883 ; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
884 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
886 ; CM-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
888 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
889 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
892 ; CM-NEXT: ALU clause starting at 4:
893 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
894 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
895 ; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
896 ; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
897 ; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
898 ; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
899 %rcp = fdiv fast float 1.0, %src, !fpmath !0
900 store float %rcp, ptr addrspace(1) %out, align 4
; Kernel form: computes 1.0 / %src with the "arcp" flag on the fdiv plus
; !fpmath !0, then stores to %out. The expected output on every configuration
; is a bare hardware reciprocal (v_rcp_f32_e32 on SI and VI, RECIP_IEEE on EG
; and Cayman), the same as for the unflagged s_rcp_ulp25_pat_f32_daz.
904 define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
905 ; SI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
907 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
908 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
909 ; SI-NEXT: s_mov_b32 s3, 0xf000
910 ; SI-NEXT: s_waitcnt lgkmcnt(0)
911 ; SI-NEXT: v_rcp_f32_e32 v0, s2
912 ; SI-NEXT: s_mov_b32 s2, -1
913 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
916 ; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
918 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
919 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
920 ; VI-NEXT: s_waitcnt lgkmcnt(0)
921 ; VI-NEXT: v_rcp_f32_e32 v2, s2
922 ; VI-NEXT: v_mov_b32_e32 v0, s0
923 ; VI-NEXT: v_mov_b32_e32 v1, s1
924 ; VI-NEXT: flat_store_dword v[0:1], v2
927 ; EG-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
929 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
930 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
933 ; EG-NEXT: ALU clause starting at 4:
934 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
935 ; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
936 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
938 ; CM-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
940 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
941 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
944 ; CM-NEXT: ALU clause starting at 4:
945 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
946 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
947 ; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
948 ; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
949 ; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
950 ; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
951 %rcp = fdiv arcp float 1.0, %src, !fpmath !0
952 store float %rcp, ptr addrspace(1) %out, align 4
; Kernel form: computes 1.0 / %src with !fpmath !0 and no per-instruction
; fast-math flags, but under attribute group #2 instead of #0. The expected
; output is again a bare hardware reciprocal on every configuration.
; Group #2 is defined outside this excerpt; the "global_fast" part of the name
; suggests it enables fast math at function level (confirm).
956 define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #2 {
957 ; SI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
959 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
960 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
961 ; SI-NEXT: s_mov_b32 s3, 0xf000
962 ; SI-NEXT: s_waitcnt lgkmcnt(0)
963 ; SI-NEXT: v_rcp_f32_e32 v0, s2
964 ; SI-NEXT: s_mov_b32 s2, -1
965 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
968 ; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
970 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
971 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
972 ; VI-NEXT: s_waitcnt lgkmcnt(0)
973 ; VI-NEXT: v_rcp_f32_e32 v2, s2
974 ; VI-NEXT: v_mov_b32_e32 v0, s0
975 ; VI-NEXT: v_mov_b32_e32 v1, s1
976 ; VI-NEXT: flat_store_dword v[0:1], v2
979 ; EG-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
981 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
982 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
985 ; EG-NEXT: ALU clause starting at 4:
986 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
987 ; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z,
988 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
990 ; CM-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
992 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
993 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
996 ; CM-NEXT: ALU clause starting at 4:
997 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
998 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
999 ; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z,
1000 ; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
1001 ; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
1002 ; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
1003 %rcp = fdiv float 1.0, %src, !fpmath !0
1004 store float %rcp, ptr addrspace(1) %out, align 4
; 1.0 / fabs(%src) with !fpmath !0, under attribute group #0
; (unsafe-fp-math disabled, f32 denormal mode preserve-sign).
; The checks expect the fabs to show up as an |...| modifier on the source
; operand of the reciprocal (v_rcp_f32_e64 on SI and VI, RECIP_IEEE on EG and
; CM); no separate instruction for the fabs is listed.
1008 define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
1009 ; SI-LABEL: s_rcp_fabs_pat_f32_daz:
1011 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
1012 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1013 ; SI-NEXT: s_mov_b32 s3, 0xf000
1014 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1015 ; SI-NEXT: v_rcp_f32_e64 v0, |s2|
1016 ; SI-NEXT: s_mov_b32 s2, -1
1017 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1020 ; VI-LABEL: s_rcp_fabs_pat_f32_daz:
1022 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
1023 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1024 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1025 ; VI-NEXT: v_rcp_f32_e64 v2, |s2|
1026 ; VI-NEXT: v_mov_b32_e32 v0, s0
1027 ; VI-NEXT: v_mov_b32_e32 v1, s1
1028 ; VI-NEXT: flat_store_dword v[0:1], v2
1031 ; EG-LABEL: s_rcp_fabs_pat_f32_daz:
1033 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1034 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1037 ; EG-NEXT: ALU clause starting at 4:
1038 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1039 ; EG-NEXT: RECIP_IEEE * T1.X, |KC0[2].Z|,
1040 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1042 ; CM-LABEL: s_rcp_fabs_pat_f32_daz:
1044 ; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
1045 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1048 ; CM-NEXT: ALU clause starting at 4:
1049 ; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1050 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1051 ; CM-NEXT: RECIP_IEEE T1.X, |KC0[2].Z|,
1052 ; CM-NEXT: RECIP_IEEE T1.Y (MASKED), |KC0[2].Z|,
1053 ; CM-NEXT: RECIP_IEEE T1.Z (MASKED), |KC0[2].Z|,
1054 ; CM-NEXT: RECIP_IEEE * T1.W (MASKED), |KC0[2].Z|,
1055 %src.fabs = call float @llvm.fabs.f32(float %src)
1056 %rcp = fdiv float 1.0, %src.fabs, !fpmath !0
1057 store float %rcp, ptr addrspace(1) %out, align 4
; -1.0 / %src with !fpmath !0, under attribute group #0
; (unsafe-fp-math disabled, f32 denormal mode preserve-sign).
; The SI and VI checks expect the negation as a source modifier (-s2) on
; v_rcp_f32_e64. The EG and CM checks instead expect RECIP_IEEE of the
; unmodified input followed by a MUL_IEEE with the literal -1.0.
1061 define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
1062 ; SI-LABEL: s_neg_rcp_pat_f32_daz:
1064 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
1065 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1066 ; SI-NEXT: s_mov_b32 s3, 0xf000
1067 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1068 ; SI-NEXT: v_rcp_f32_e64 v0, -s2
1069 ; SI-NEXT: s_mov_b32 s2, -1
1070 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1073 ; VI-LABEL: s_neg_rcp_pat_f32_daz:
1075 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
1076 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1077 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1078 ; VI-NEXT: v_rcp_f32_e64 v2, -s2
1079 ; VI-NEXT: v_mov_b32_e32 v0, s0
1080 ; VI-NEXT: v_mov_b32_e32 v1, s1
1081 ; VI-NEXT: flat_store_dword v[0:1], v2
1084 ; EG-LABEL: s_neg_rcp_pat_f32_daz:
1086 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
1087 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1090 ; EG-NEXT: ALU clause starting at 4:
1091 ; EG-NEXT: RECIP_IEEE * T0.X, KC0[2].Z,
1092 ; EG-NEXT: MUL_IEEE T0.X, literal.x, PS,
1093 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1094 ; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45)
1096 ; CM-LABEL: s_neg_rcp_pat_f32_daz:
1098 ; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1099 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1102 ; CM-NEXT: ALU clause starting at 4:
1103 ; CM-NEXT: RECIP_IEEE T0.X, KC0[2].Z,
1104 ; CM-NEXT: RECIP_IEEE T0.Y (MASKED), KC0[2].Z,
1105 ; CM-NEXT: RECIP_IEEE T0.Z (MASKED), KC0[2].Z,
1106 ; CM-NEXT: RECIP_IEEE * T0.W (MASKED), KC0[2].Z,
1107 ; CM-NEXT: MUL_IEEE * T0.X, literal.x, PV.X,
1108 ; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00)
1109 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1110 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1111 %rcp = fdiv float -1.0, %src, !fpmath !0
1112 store float %rcp, ptr addrspace(1) %out, align 4
; 1.0 / -fabs(%src) with !fpmath !0, under attribute group #0
; (unsafe-fp-math disabled, f32 denormal mode preserve-sign).
; The SI and VI checks expect both modifiers combined as -|s2| on
; v_rcp_f32_e64. The EG and CM checks expect RECIP_IEEE of |KC0[2].Z| followed
; by a MUL_IEEE with the literal -1.0.
1116 define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
1117 ; SI-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
1119 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
1120 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1121 ; SI-NEXT: s_mov_b32 s3, 0xf000
1122 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1123 ; SI-NEXT: v_rcp_f32_e64 v0, -|s2|
1124 ; SI-NEXT: s_mov_b32 s2, -1
1125 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1128 ; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
1130 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
1131 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1132 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1133 ; VI-NEXT: v_rcp_f32_e64 v2, -|s2|
1134 ; VI-NEXT: v_mov_b32_e32 v0, s0
1135 ; VI-NEXT: v_mov_b32_e32 v1, s1
1136 ; VI-NEXT: flat_store_dword v[0:1], v2
1139 ; EG-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
1141 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
1142 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1145 ; EG-NEXT: ALU clause starting at 4:
1146 ; EG-NEXT: RECIP_IEEE * T0.X, |KC0[2].Z|,
1147 ; EG-NEXT: MUL_IEEE T0.X, literal.x, PS,
1148 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1149 ; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45)
1151 ; CM-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
1153 ; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1154 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1157 ; CM-NEXT: ALU clause starting at 4:
1158 ; CM-NEXT: RECIP_IEEE T0.X, |KC0[2].Z|,
1159 ; CM-NEXT: RECIP_IEEE T0.Y (MASKED), |KC0[2].Z|,
1160 ; CM-NEXT: RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|,
1161 ; CM-NEXT: RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|,
1162 ; CM-NEXT: MUL_IEEE * T0.X, literal.x, PV.X,
1163 ; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00)
1164 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1165 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1166 %src.fabs = call float @llvm.fabs.f32(float %src)
1167 %src.fabs.fneg = fneg float %src.fabs
1168 %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0
1169 store float %rcp, ptr addrspace(1) %out, align 4
; Same 1.0 / -fabs(%src) pattern with !fpmath !0 under attribute group #0,
; but here -fabs(%src) has a second use as an fmul operand. Both results are
; written to %out with volatile stores.
; The SI and VI checks expect the -|...| modifier on both v_rcp_f32_e64 and
; v_mul_f32_e64. The EG and CM checks expect the product as a MUL_IEEE with a
; -|KC0[2].Z| operand, and the reciprocal as RECIP_IEEE of |KC0[2].Z| followed
; by a MUL_IEEE with the literal -1.0.
1173 define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1) %out, float %src) #0 {
1174 ; SI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
1176 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
1177 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1178 ; SI-NEXT: s_mov_b32 s3, 0xf000
1179 ; SI-NEXT: s_mov_b32 s2, -1
1180 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1181 ; SI-NEXT: v_rcp_f32_e64 v0, -|s4|
1182 ; SI-NEXT: v_mul_f32_e64 v1, s4, -|s4|
1183 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1184 ; SI-NEXT: s_waitcnt vmcnt(0)
1185 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
1186 ; SI-NEXT: s_waitcnt vmcnt(0)
1189 ; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
1191 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
1192 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1193 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1194 ; VI-NEXT: v_rcp_f32_e64 v2, -|s2|
1195 ; VI-NEXT: v_mov_b32_e32 v0, s0
1196 ; VI-NEXT: v_mov_b32_e32 v1, s1
1197 ; VI-NEXT: v_mul_f32_e64 v3, s2, -|s2|
1198 ; VI-NEXT: flat_store_dword v[0:1], v2
1199 ; VI-NEXT: s_waitcnt vmcnt(0)
1200 ; VI-NEXT: flat_store_dword v[0:1], v3
1201 ; VI-NEXT: s_waitcnt vmcnt(0)
1204 ; EG-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
1206 ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
1207 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
1208 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
1210 ; EG-NEXT: ALU clause starting at 4:
1211 ; EG-NEXT: MUL_IEEE T0.X, KC0[2].Z, -|KC0[2].Z|,
1212 ; EG-NEXT: RECIP_IEEE * T0.Y, |KC0[2].Z|,
1213 ; EG-NEXT: MUL_IEEE T1.X, literal.x, PS,
1214 ; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.y,
1215 ; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45)
1217 ; CM-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
1219 ; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
1220 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
1221 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
1223 ; CM-NEXT: ALU clause starting at 4:
1224 ; CM-NEXT: MUL_IEEE * T0.X, KC0[2].Z, -|KC0[2].Z|,
1225 ; CM-NEXT: RECIP_IEEE T0.X (MASKED), |KC0[2].Z|,
1226 ; CM-NEXT: RECIP_IEEE T0.Y, |KC0[2].Z|,
1227 ; CM-NEXT: RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|,
1228 ; CM-NEXT: RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|,
1229 ; CM-NEXT: MUL_IEEE * T1.X, literal.x, PV.Y,
1230 ; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00)
1231 ; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
1232 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1233 %src.fabs = call float @llvm.fabs.f32(float %src)
1234 %src.fabs.fneg = fneg float %src.fabs
1235 %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0
1236 store volatile float %rcp, ptr addrspace(1) %out, align 4
; Second use of %src.fabs.fneg, stored to the same address as %rcp.
1238 %other = fmul float %src, %src.fabs.fneg
1239 store volatile float %other, ptr addrspace(1) %out, align 4
; %x / 2.0 with the arcp fast-math flag and no !fpmath attachment, under
; attribute group #0 (unsafe-fp-math disabled, f32 denormal mode
; preserve-sign). Note that the IR divides %x by 2.0, not 2.0 by %x.
; %x is loaded from an undef global pointer rather than passed as a kernel
; argument; %out is the only parameter.
; All four sets of checks expect the division as a single multiply by 0.5.
1243 define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 {
1244 ; SI-LABEL: s_div_arcp_2_x_pat_f32_daz:
1246 ; SI-NEXT: s_load_dword s4, s[0:1], 0x0
1247 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1248 ; SI-NEXT: s_mov_b32 s3, 0xf000
1249 ; SI-NEXT: s_mov_b32 s2, -1
1250 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1251 ; SI-NEXT: v_mul_f32_e64 v0, s4, 0.5
1252 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1255 ; VI-LABEL: s_div_arcp_2_x_pat_f32_daz:
1257 ; VI-NEXT: s_load_dword s2, s[0:1], 0x0
1258 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1259 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1260 ; VI-NEXT: v_mul_f32_e64 v2, s2, 0.5
1261 ; VI-NEXT: v_mov_b32_e32 v0, s0
1262 ; VI-NEXT: v_mov_b32_e32 v1, s1
1263 ; VI-NEXT: flat_store_dword v[0:1], v2
1266 ; EG-LABEL: s_div_arcp_2_x_pat_f32_daz:
1269 ; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[]
1270 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1272 ; EG-NEXT: Fetch clause starting at 4:
1273 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1274 ; EG-NEXT: ALU clause starting at 6:
1275 ; EG-NEXT: MUL_IEEE T0.X, T0.X, 0.5,
1276 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1277 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1279 ; CM-LABEL: s_div_arcp_2_x_pat_f32_daz:
1282 ; CM-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[]
1283 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1285 ; CM-NEXT: Fetch clause starting at 4:
1286 ; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1287 ; CM-NEXT: ALU clause starting at 6:
1288 ; CM-NEXT: MUL_IEEE * T0.X, T0.X, 0.5,
1289 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1290 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1291 %x = load float, ptr addrspace(1) undef
1292 %rcp = fdiv arcp float %x, 2.0
1293 store float %rcp, ptr addrspace(1) %out, align 4
; %x / 10.0 with the arcp fast-math flag and no !fpmath attachment, under
; attribute group #0 (unsafe-fp-math disabled, f32 denormal mode
; preserve-sign). %x is loaded from an undef global pointer.
; The checks expect the division as a multiply by a constant. SI and VI move
; 0x3dcccccd into a VGPR first; EG and CM use a literal operand printed as
; 1036831949(1.000000e-01), which is the same bit pattern.
1297 define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 {
1298 ; SI-LABEL: s_div_arcp_k_x_pat_f32_daz:
1300 ; SI-NEXT: s_load_dword s4, s[0:1], 0x0
1301 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1302 ; SI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd
1303 ; SI-NEXT: s_mov_b32 s3, 0xf000
1304 ; SI-NEXT: s_mov_b32 s2, -1
1305 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1306 ; SI-NEXT: v_mul_f32_e32 v0, s4, v0
1307 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1310 ; VI-LABEL: s_div_arcp_k_x_pat_f32_daz:
1312 ; VI-NEXT: s_load_dword s2, s[0:1], 0x0
1313 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1314 ; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd
1315 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1316 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0
1317 ; VI-NEXT: v_mov_b32_e32 v0, s0
1318 ; VI-NEXT: v_mov_b32_e32 v1, s1
1319 ; VI-NEXT: flat_store_dword v[0:1], v2
1322 ; EG-LABEL: s_div_arcp_k_x_pat_f32_daz:
1325 ; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[]
1326 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1328 ; EG-NEXT: Fetch clause starting at 4:
1329 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1330 ; EG-NEXT: ALU clause starting at 6:
1331 ; EG-NEXT: MUL_IEEE T0.X, T0.X, literal.x,
1332 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1333 ; EG-NEXT: 1036831949(1.000000e-01), 2(2.802597e-45)
1335 ; CM-LABEL: s_div_arcp_k_x_pat_f32_daz:
1338 ; CM-NEXT: ALU 3, @6, KC0[CB0:0-32], KC1[]
1339 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1341 ; CM-NEXT: Fetch clause starting at 4:
1342 ; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1343 ; CM-NEXT: ALU clause starting at 6:
1344 ; CM-NEXT: MUL_IEEE * T0.X, T0.X, literal.x,
1345 ; CM-NEXT: 1036831949(1.000000e-01), 0(0.000000e+00)
1346 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1347 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1348 %x = load float, ptr addrspace(1) undef
1349 %rcp = fdiv arcp float %x, 10.0
1350 store float %rcp, ptr addrspace(1) %out, align 4
; %x / -10.0 with the arcp fast-math flag and no !fpmath attachment, under
; attribute group #0 (unsafe-fp-math disabled, f32 denormal mode
; preserve-sign). %x is loaded from an undef global pointer.
; The checks expect the division as a multiply by a negative constant. SI and
; VI move 0xbdcccccd into a VGPR first; EG and CM use a literal operand
; printed as -1110651699(-1.000000e-01), which is the same bit pattern.
1354 define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 {
1355 ; SI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
1357 ; SI-NEXT: s_load_dword s4, s[0:1], 0x0
1358 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1359 ; SI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd
1360 ; SI-NEXT: s_mov_b32 s3, 0xf000
1361 ; SI-NEXT: s_mov_b32 s2, -1
1362 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1363 ; SI-NEXT: v_mul_f32_e32 v0, s4, v0
1364 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1367 ; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
1369 ; VI-NEXT: s_load_dword s2, s[0:1], 0x0
1370 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1371 ; VI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd
1372 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1373 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0
1374 ; VI-NEXT: v_mov_b32_e32 v0, s0
1375 ; VI-NEXT: v_mov_b32_e32 v1, s1
1376 ; VI-NEXT: flat_store_dword v[0:1], v2
1379 ; EG-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
1382 ; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[]
1383 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1385 ; EG-NEXT: Fetch clause starting at 4:
1386 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1387 ; EG-NEXT: ALU clause starting at 6:
1388 ; EG-NEXT: MUL_IEEE T0.X, T0.X, literal.x,
1389 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1390 ; EG-NEXT: -1110651699(-1.000000e-01), 2(2.802597e-45)
1392 ; CM-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
1395 ; CM-NEXT: ALU 3, @6, KC0[CB0:0-32], KC1[]
1396 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1398 ; CM-NEXT: Fetch clause starting at 4:
1399 ; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1400 ; CM-NEXT: ALU clause starting at 6:
1401 ; CM-NEXT: MUL_IEEE * T0.X, T0.X, literal.x,
1402 ; CM-NEXT: -1110651699(-1.000000e-01), 0(0.000000e+00)
1403 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1404 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1405 %x = load float, ptr addrspace(1) undef
1406 %rcp = fdiv arcp float %x, -10.0
1407 store float %rcp, ptr addrspace(1) %out, align 4
; Intrinsic declarations; both use attribute group #1.
1411 declare float @llvm.fabs.f32(float) #1
; NOTE(review): no call to llvm.sqrt.f32 appears in the part of the file
; covered by this review; confirm it is used elsewhere before removing it.
1412 declare float @llvm.sqrt.f32(float) #1
; Attribute groups.
; #0: unsafe-fp-math disabled, f32 denormal mode preserve-sign (the *_daz tests).
1414 attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; #1: attributes for the intrinsic declarations.
1415 attributes #1 = { nounwind readnone }
; #2: like #0 but with unsafe-fp-math enabled.
1416 attributes #2 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; #3: f32 denormal mode ieee; unsafe-fp-math is not specified.
1417 attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" }
; #4: f32 denormal mode ieee with unsafe-fp-math enabled.
1418 attributes #4 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="ieee,ieee" }
; Operand of the !fpmath attachments on the fdiv instructions: float 2.5.
1420 !0 = !{float 2.500000e+00}