1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,SI-DAZ-UNSAFE %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,SI-IEEE-UNSAFE %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,SI-DAZ-SAFE %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,SI-IEEE-SAFE %s
9 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,CI-DAZ-UNSAFE %s
10 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,CI-IEEE-UNSAFE %s
12 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,CI-DAZ-SAFE %s
13 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,CI-IEEE-SAFE %s
16 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
17 declare float @llvm.sqrt.f32(float) nounwind readnone
18 declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
; rsq_f32 loads one float from %in, computes 1.0 / sqrt(val) and stores the
; result to %out. The llvm.sqrt.f32 call and the fdiv both carry the 'contract'
; flag, and the fdiv is additionally tagged with !fpmath !0 (that metadata node
; is defined outside the part of the file reviewed here).
;
; The check lines below are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. What they pin down, per configuration
;   - unsafe-fp-math runs, either denormal mode, expect a single v_rsq_f32.
;   - safe runs with preserve-sign denormals expect a v_rsq_f32 based sqrt
;     expansion with FMA refinement, followed by v_rcp_f32.
;   - safe runs with IEEE denormals expect v_sqrt_f32 whose result is adjusted
;     by -1 / +1 (integer adds selected through FMA residual compares), then a
;     frexp / v_rcp_f32 / ldexp reciprocal. The tahiti checks additionally
;     guard the frexp mantissa with a |x| < 0x7f800000 compare and select that
;     the hawaii checks do not contain.
;
; NOTE(review) the checks under the GCN-UNSAFE prefix are not enabled by any
; RUN line visible at the top of this file, so they look stale. Confirm, then
; let the update script drop them.
20 define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
21 ; GCN-DAZ-UNSAFE-LABEL: rsq_f32:
22 ; GCN-DAZ-UNSAFE: ; %bb.0:
23 ; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
24 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
25 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1
26 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6
27 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7
28 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
29 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2
30 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3
31 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
32 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0
33 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1
34 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
35 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
36 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
37 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm
39 ; GCN-IEEE-UNSAFE-LABEL: rsq_f32:
40 ; GCN-IEEE-UNSAFE: ; %bb.0:
41 ; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
42 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
43 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1
44 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6
45 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7
46 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
47 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2
48 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3
49 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
50 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0
51 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1
52 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
53 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
54 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
55 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm
57 ; GCN-DAZ-SAFE-LABEL: rsq_f32:
58 ; GCN-DAZ-SAFE: ; %bb.0:
59 ; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
60 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000
61 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1
62 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6
63 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7
64 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
65 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2
66 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3
67 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
68 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000
69 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
70 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0
71 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1
72 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
73 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
74 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0
75 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
76 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
77 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1
78 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
79 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5
80 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
81 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1
82 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0
83 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3
84 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1
85 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
86 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
87 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
88 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
89 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
90 ; GCN-DAZ-SAFE-NEXT: s_endpgm
92 ; SI-IEEE-SAFE-LABEL: rsq_f32:
93 ; SI-IEEE-SAFE: ; %bb.0:
94 ; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
95 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
96 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
97 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
98 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
99 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
100 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
101 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
102 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
103 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
104 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
105 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000
106 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
107 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
108 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
109 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
110 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
111 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
112 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
113 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
114 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
115 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
116 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
117 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
118 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
119 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
120 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
121 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
122 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
123 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
124 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
125 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
126 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
127 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
128 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
129 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
130 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
131 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
132 ; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
133 ; SI-IEEE-SAFE-NEXT: s_endpgm
135 ; CI-IEEE-SAFE-LABEL: rsq_f32:
136 ; CI-IEEE-SAFE: ; %bb.0:
137 ; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
138 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
139 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
140 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
141 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
142 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
143 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
144 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
145 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
146 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
147 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
148 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
149 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
150 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
151 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
152 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
153 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
154 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
155 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
156 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
157 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
158 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
159 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
160 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
161 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
162 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
163 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
164 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
165 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
166 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
167 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
168 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
169 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
170 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
171 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
172 ; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
173 ; CI-IEEE-SAFE-NEXT: s_endpgm
174 ; GCN-UNSAFE-LABEL: rsq_f32:
175 ; GCN-UNSAFE: ; %bb.0:
176 ; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
177 ; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
178 ; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1
179 ; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6
180 ; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7
181 ; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
182 ; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2
183 ; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3
184 ; GCN-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
185 ; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0
186 ; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1
187 ; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0)
188 ; GCN-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
189 ; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
190 ; GCN-UNSAFE-NEXT: s_endpgm
191 %val = load float, ptr addrspace(1) %in, align 4
192 %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone
193 %div = fdiv contract float 1.0, %sqrt, !fpmath !0
194 store float %div, ptr addrspace(1) %out, align 4
; rsq_f32_sgpr computes 1.0 / sqrt(%val) and stores the result to %out. Here
; %val is a float kernel argument rather than a value loaded from memory, so
; the checks expect it to arrive through s_load_dword into a scalar register.
; The llvm.sqrt.f32 call and the fdiv both carry the 'contract' flag, and the
; fdiv is additionally tagged with !fpmath !0 (that metadata node is defined
; outside the part of the file reviewed here).
;
; The check lines below are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. What they pin down, per configuration
;   - unsafe-fp-math runs, either denormal mode, expect one v_rsq_f32 that
;     reads the scalar register operand directly.
;   - safe runs with preserve-sign denormals expect a v_rsq_f32 based sqrt
;     expansion with FMA refinement, followed by v_rcp_f32.
;   - safe runs with IEEE denormals expect v_sqrt_f32 with a -1 / +1 integer
;     adjustment of the result, then a frexp / v_rcp_f32 / ldexp reciprocal.
;     The tahiti checks additionally guard the frexp mantissa with a
;     |x| < 0x7f800000 compare and select that the hawaii checks do not contain.
;
; NOTE(review) the checks under the GCN-UNSAFE prefix are not enabled by any
; RUN line visible at the top of this file, so they look stale. Confirm, then
; let the update script drop them.
198 define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) {
199 ; GCN-DAZ-UNSAFE-LABEL: rsq_f32_sgpr:
200 ; GCN-DAZ-UNSAFE: ; %bb.0:
201 ; GCN-DAZ-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb
202 ; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
203 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s3, 0xf000
204 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
205 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2
206 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s2, -1
207 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0
208 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm
210 ; GCN-IEEE-UNSAFE-LABEL: rsq_f32_sgpr:
211 ; GCN-IEEE-UNSAFE: ; %bb.0:
212 ; GCN-IEEE-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb
213 ; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
214 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s3, 0xf000
215 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
216 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2
217 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s2, -1
218 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0
219 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm
221 ; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr:
222 ; GCN-DAZ-SAFE: ; %bb.0:
223 ; GCN-DAZ-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb
224 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000
225 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000
226 ; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
227 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000
228 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
229 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1
230 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s2
231 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
232 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
233 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
234 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1
235 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1
236 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
237 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5
238 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2
239 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1
240 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v2, v2, v0
241 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v3, v1, v2
242 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
243 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
244 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
245 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
246 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
247 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
248 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0
249 ; GCN-DAZ-SAFE-NEXT: s_endpgm
251 ; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
252 ; SI-IEEE-SAFE: ; %bb.0:
253 ; SI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb
254 ; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
255 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000
256 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000
257 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
258 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
259 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1
260 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s2
261 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0
262 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
263 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
264 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
265 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1
266 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
267 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3
268 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
269 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1
270 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
271 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
272 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
273 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
274 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
275 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
276 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
277 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
278 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x7f800000
279 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
280 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s0
281 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
282 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
283 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
284 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
285 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
286 ; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
287 ; SI-IEEE-SAFE-NEXT: s_endpgm
289 ; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
290 ; CI-IEEE-SAFE: ; %bb.0:
291 ; CI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb
292 ; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
293 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000
294 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000
295 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
296 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
297 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1
298 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s2
299 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0
300 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
301 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
302 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
303 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1
304 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
305 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3
306 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
307 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1
308 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
309 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
310 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
311 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
312 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
313 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
314 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
315 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
316 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
317 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
318 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
319 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
320 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
321 ; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
322 ; CI-IEEE-SAFE-NEXT: s_endpgm
323 ; GCN-UNSAFE-LABEL: rsq_f32_sgpr:
324 ; GCN-UNSAFE: ; %bb.0:
325 ; GCN-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb
326 ; GCN-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
327 ; GCN-UNSAFE-NEXT: s_mov_b32 s3, 0xf000
328 ; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
329 ; GCN-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2
330 ; GCN-UNSAFE-NEXT: s_mov_b32 s2, -1
331 ; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0
332 ; GCN-UNSAFE-NEXT: s_endpgm
333 %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone
334 %div = fdiv contract float 1.0, %sqrt, !fpmath !0
335 store float %div, ptr addrspace(1) %out, align 4
339 ; Recognize that this is rsqrt(a) * rcp(b) * c,
340 ; not 1 / ( 1 / sqrt(a)) * rcp(b) * c.
342 ; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMDGPUCodeGenPrepare.
; rsqrt_fmul indexes %in and %out by llvm.amdgcn.workitem.id.x. Each work item
; does three volatile loads, a, b and c, from elements tid, tid + 1 and tid + 2
; of %in, computes c / (sqrt(a) * b) with the 'contract' flag on the sqrt call,
; the fmul and the fdiv, and stores the result to element tid of %out. Unlike
; the rsq_f32 tests in this file, the fdiv here has no !fpmath metadata.
;
; The check lines below are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. What they pin down, per configuration
;   - unsafe-fp-math runs, either denormal mode, expect v_rsq_f32 on a,
;     v_rcp_f32 on b, and two v_mul_f32 to combine them with c.
;   - safe runs expect a full sqrt expansion, a multiply by b, and then a
;     v_div_scale / v_div_fmas / v_div_fixup division of c by that product.
;     The preserve-sign runs additionally bracket the FMA chain of the division
;     with s_setreg_imm32_b32 writes to HW_REG_MODE (presumably switching FP32
;     denormal handling for the division -- TODO confirm).
;
; NOTE(review) the checks under the GCN-UNSAFE prefix (sqrt, multiply, rcp,
; multiply, i.e. the c * rcp(sqrt(a) * b) form) are not enabled by any RUN line
; visible at the top of this file, so they look stale. Confirm, then let the
; update script drop them.
343 define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %in) {
344 ; GCN-UNSAFE-LABEL: rsqrt_fmul:
345 ; GCN-UNSAFE: ; %bb.0:
346 ; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
347 ; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
348 ; GCN-UNSAFE-NEXT: s_mov_b32 s6, 0
349 ; GCN-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
350 ; GCN-UNSAFE-NEXT: v_mov_b32_e32 v1, 0
351 ; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
352 ; GCN-UNSAFE-NEXT: s_mov_b64 s[8:9], s[2:3]
353 ; GCN-UNSAFE-NEXT: s_mov_b64 s[10:11], s[6:7]
354 ; GCN-UNSAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
355 ; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0)
356 ; GCN-UNSAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
357 ; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0)
358 ; GCN-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
359 ; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0)
360 ; GCN-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1]
361 ; GCN-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v2
362 ; GCN-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3
363 ; GCN-UNSAFE-NEXT: v_rcp_f32_e32 v2, v2
364 ; GCN-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2
365 ; GCN-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
366 ; GCN-UNSAFE-NEXT: s_endpgm
367 ; GCN-DAZ-UNSAFE-LABEL: rsqrt_fmul:
368 ; GCN-DAZ-UNSAFE: ; %bb.0:
369 ; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
370 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
371 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, 0
372 ; GCN-DAZ-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
373 ; GCN-DAZ-UNSAFE-NEXT: v_mov_b32_e32 v1, 0
374 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
375 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[8:9], s[2:3]
376 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[10:11], s[6:7]
377 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
378 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
379 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
380 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
381 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
382 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
383 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1]
384 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v2, v2
385 ; GCN-DAZ-UNSAFE-NEXT: v_rcp_f32_e32 v3, v3
386 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3
387 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2
388 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
389 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm
391 ; GCN-IEEE-UNSAFE-LABEL: rsqrt_fmul:
392 ; GCN-IEEE-UNSAFE: ; %bb.0:
393 ; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
394 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
395 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, 0
396 ; GCN-IEEE-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
397 ; GCN-IEEE-UNSAFE-NEXT: v_mov_b32_e32 v1, 0
398 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
399 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[8:9], s[2:3]
400 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[10:11], s[6:7]
401 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
402 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
403 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
404 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
405 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
406 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
407 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1]
408 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v2, v2
409 ; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e32 v3, v3
410 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3
411 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2
412 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
413 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm
415 ; GCN-DAZ-SAFE-LABEL: rsqrt_fmul:
416 ; GCN-DAZ-SAFE: ; %bb.0:
417 ; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
418 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000
419 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0
420 ; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
421 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0
422 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
423 ; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7]
424 ; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3]
425 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
426 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
427 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
428 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
429 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
430 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
431 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s0, 0xf800000
432 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
433 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2
434 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2
435 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
436 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v2
437 ; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5]
438 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v2, v5
439 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5
440 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v5, v7, 0.5
441 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v7, v8, v7
442 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v8, v5
443 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v7, v7, v2
444 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v8, v5, v7
445 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5
446 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
447 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6
448 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
449 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3
450 ; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v4
451 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v5, v3
452 ; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4
453 ; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
454 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
455 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5
456 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5
457 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6
458 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7
459 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6
460 ; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
461 ; GCN-DAZ-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7
462 ; GCN-DAZ-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4
463 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
464 ; GCN-DAZ-SAFE-NEXT: s_endpgm
466 ; GCN-IEEE-SAFE-LABEL: rsqrt_fmul:
467 ; GCN-IEEE-SAFE: ; %bb.0:
468 ; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
469 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000
470 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0
471 ; GCN-IEEE-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
472 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0
473 ; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
474 ; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7]
475 ; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3]
476 ; GCN-IEEE-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
477 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
478 ; GCN-IEEE-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
479 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
480 ; GCN-IEEE-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
481 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
482 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
483 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
484 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2
485 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2
486 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
487 ; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v5, v2
488 ; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[0:1], -1, v5
489 ; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v8, s[0:1], 1, v5
490 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v9, -v7, v5, v2
491 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v10, -v8, v5, v2
492 ; GCN-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v9
493 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1]
494 ; GCN-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v10
495 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[0:1]
496 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5
497 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
498 ; GCN-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6
499 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
500 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3
501 ; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v4
502 ; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v5, v3
503 ; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4
504 ; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5]
505 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
506 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5
507 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5
508 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6
509 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7
510 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6
511 ; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7
512 ; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4
513 ; GCN-IEEE-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
514 ; GCN-IEEE-SAFE-NEXT: s_endpgm
515 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
516 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
517 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
518 %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
519 %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
521 %a = load volatile float, ptr addrspace(1) %gep.0
522 %b = load volatile float, ptr addrspace(1) %gep.1
523 %c = load volatile float, ptr addrspace(1) %gep.2
525 %x = call contract float @llvm.sqrt.f32(float %a)
526 %y = fmul contract float %x, %b
527 %z = fdiv contract float %c, %y
528 store float %z, ptr addrspace(1) %out.gep
532 define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
533 ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_f32:
534 ; GCN-DAZ-UNSAFE: ; %bb.0:
535 ; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
536 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
537 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1
538 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6
539 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7
540 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
541 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2
542 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3
543 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
544 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0
545 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1
546 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
547 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
548 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
549 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
550 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm
552 ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_f32:
553 ; GCN-IEEE-UNSAFE: ; %bb.0:
554 ; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
555 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
556 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1
557 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6
558 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7
559 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
560 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2
561 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3
562 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
563 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0
564 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1
565 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
566 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
567 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
568 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
569 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm
571 ; GCN-DAZ-SAFE-LABEL: neg_rsq_f32:
572 ; GCN-DAZ-SAFE: ; %bb.0:
573 ; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
574 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000
575 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1
576 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6
577 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7
578 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
579 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2
580 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3
581 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
582 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000
583 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
584 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0
585 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1
586 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
587 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
588 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0
589 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
590 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
591 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1
592 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
593 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5
594 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
595 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1
596 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0
597 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3
598 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1
599 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
600 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
601 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
602 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
603 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
604 ; GCN-DAZ-SAFE-NEXT: s_endpgm
606 ; SI-IEEE-SAFE-LABEL: neg_rsq_f32:
607 ; SI-IEEE-SAFE: ; %bb.0:
608 ; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
609 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
610 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
611 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
612 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
613 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
614 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
615 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
616 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
617 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
618 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
619 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000
620 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
621 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
622 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
623 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
624 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
625 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
626 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
627 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
628 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
629 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
630 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
631 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
632 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
633 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
634 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
635 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
636 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
637 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
638 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
639 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
640 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2
641 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
642 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
643 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
644 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
645 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
646 ; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
647 ; SI-IEEE-SAFE-NEXT: s_endpgm
649 ; CI-IEEE-SAFE-LABEL: neg_rsq_f32:
650 ; CI-IEEE-SAFE: ; %bb.0:
651 ; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
652 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
653 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
654 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
655 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
656 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
657 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
658 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
659 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
660 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
661 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
662 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
663 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
664 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
665 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
666 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
667 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
668 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
669 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
670 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
671 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
672 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
673 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
674 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
675 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
676 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
677 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
678 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
679 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
680 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
681 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
682 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
683 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
684 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
685 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
686 ; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
687 ; CI-IEEE-SAFE-NEXT: s_endpgm
688 ; GCN-UNSAFE-LABEL: neg_rsq_f32:
689 ; GCN-UNSAFE: ; %bb.0:
690 ; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
691 ; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
692 ; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1
693 ; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6
694 ; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7
695 ; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
696 ; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2
697 ; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3
698 ; GCN-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
699 ; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0
700 ; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1
701 ; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0)
702 ; GCN-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0
703 ; GCN-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0
704 ; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
705 ; GCN-UNSAFE-NEXT: s_endpgm
706 %val = load float, ptr addrspace(1) %in, align 4
707 %sqrt = call contract float @llvm.sqrt.f32(float %val)
708 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
709 store float %div, ptr addrspace(1) %out, align 4
; neg_rsq_neg_f32: kernel that loads a float from %in, negates it, takes a
; contract-flagged llvm.sqrt.f32 of the negated value, divides -1.0 by the
; result (the fdiv carries !fpmath !0; that metadata node is defined elsewhere
; in the file) and stores the quotient to %out.
; Expected code per check prefix, as recorded below:
;   * both -UNSAFE variants expect a single v_rsq_f32 with a negated source,
;     followed by a v_xor_b32 with 0x80000000 (flips the sign bit).
;   * the DAZ -SAFE variant expects v_rsq_f32 plus a chain of v_fma_f32, a
;     v_cmp_class / v_cndmask select, then v_rcp_f32 with a negated source.
;   * the SI and CI IEEE -SAFE variants expect v_sqrt_f32, v_add_i32 -1 / +1
;     candidates selected by v_cndmask, then v_frexp_mant / v_rcp /
;     v_frexp_exp / v_ldexp; only the SI variant additionally compares |v0|
;     against 0x7f800000 before the v_rcp_f32.
713 define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
714 ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_neg_f32:
715 ; GCN-DAZ-UNSAFE: ; %bb.0:
716 ; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
717 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
718 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1
719 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6
720 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7
721 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
722 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2
723 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3
724 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
725 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0
726 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1
727 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
728 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
729 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
730 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
731 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm
733 ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_neg_f32:
734 ; GCN-IEEE-UNSAFE: ; %bb.0:
735 ; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
736 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
737 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1
738 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6
739 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7
740 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
741 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2
742 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3
743 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
744 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0
745 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1
746 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
747 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
748 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
749 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
750 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm
752 ; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32:
753 ; GCN-DAZ-SAFE: ; %bb.0:
754 ; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
755 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000
756 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1
757 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6
758 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7
759 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
760 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2
761 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3
762 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
763 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0x8f800000
764 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
765 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0
766 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1
767 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
768 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
769 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
770 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
771 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
772 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1
773 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
774 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5
775 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
776 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1
777 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0
778 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3
779 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1
780 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
781 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
782 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
783 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
784 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
785 ; GCN-DAZ-SAFE-NEXT: s_endpgm
787 ; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
788 ; SI-IEEE-SAFE: ; %bb.0:
789 ; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
790 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
791 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
792 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
793 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
794 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
795 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
796 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
797 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
798 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000
799 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
800 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000
801 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
802 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
803 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
804 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
805 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
806 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
807 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
808 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
809 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
810 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
811 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
812 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
813 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
814 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
815 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
816 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
817 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
818 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
819 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
820 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
821 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2
822 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
823 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
824 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
825 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
826 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
827 ; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
828 ; SI-IEEE-SAFE-NEXT: s_endpgm
830 ; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
831 ; CI-IEEE-SAFE: ; %bb.0:
832 ; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
833 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
834 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
835 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
836 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
837 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
838 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
839 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
840 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
841 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000
842 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
843 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
844 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
845 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
846 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
847 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
848 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
849 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
850 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
851 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
852 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
853 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
854 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
855 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
856 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
857 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
858 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
859 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
860 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
861 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
862 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
863 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
864 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
865 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
866 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
867 ; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
868 ; CI-IEEE-SAFE-NEXT: s_endpgm
; NOTE(review): the lines below use the GCN-UNSAFE prefix, which none of the
; RUN lines at the top of this file pass to FileCheck, so they appear to be
; unchecked leftovers. They also expect v_sqrt_f32 + v_rcp_f32, which differs
; from the v_rsq_f32 + v_xor_b32 sequence the active -UNSAFE prefixes expect.
; Confirm, then remove them or regenerate with update_llc_test_checks.py.
869 ; GCN-UNSAFE-LABEL: neg_rsq_neg_f32:
870 ; GCN-UNSAFE: ; %bb.0:
871 ; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
872 ; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
873 ; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1
874 ; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6
875 ; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7
876 ; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
877 ; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2
878 ; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3
879 ; GCN-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
880 ; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0
881 ; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1
882 ; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0)
883 ; GCN-UNSAFE-NEXT: v_sqrt_f32_e64 v0, -v0
884 ; GCN-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0
885 ; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
886 ; GCN-UNSAFE-NEXT: s_endpgm
887 %val = load float, ptr addrspace(1) %in, align 4
888 %val.fneg = fneg float %val
889 %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
890 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
891 store float %div, ptr addrspace(1) %out, align 4
; v_neg_rsq_neg_f32: non-kernel variant that takes the operand as an argument
; and computes -1.0 / sqrt(-%val), with contract on both the sqrt call and the
; fdiv (the fdiv also carries !fpmath !0, defined elsewhere in the file).
; The -UNSAFE variants expect v_rsq_f32 with a negated source followed by a
; v_xor_b32 with 0x80000000. The -SAFE variants expect a longer sqrt sequence
; ending in v_rcp_f32: for DAZ the negation is a source modifier on the
; v_rcp_f32 itself, for IEEE it is on v_frexp_mant_f32 and the v_rcp_f32 is
; followed by v_ldexp_f32.
895 define float @v_neg_rsq_neg_f32(float %val) {
896 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32:
897 ; GCN-DAZ-UNSAFE: ; %bb.0:
898 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
899 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
900 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
901 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
903 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32:
904 ; GCN-IEEE-UNSAFE: ; %bb.0:
905 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
906 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
907 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
908 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
910 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32:
911 ; GCN-DAZ-SAFE: ; %bb.0:
912 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
913 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
914 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
915 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
916 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
917 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
918 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1
919 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
920 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5
921 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2
922 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0
923 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1
924 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2
925 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
926 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
927 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
928 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
929 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
930 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
931 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
933 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
934 ; SI-IEEE-SAFE: ; %bb.0:
935 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
936 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
937 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
938 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
939 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
940 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
941 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
942 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
943 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
944 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
945 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
946 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
947 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
948 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
949 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
950 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
951 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
952 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
953 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
954 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
955 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
956 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
957 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
958 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
959 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
960 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
961 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
962 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
964 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
965 ; CI-IEEE-SAFE: ; %bb.0:
966 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
967 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
968 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
969 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
970 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
971 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
972 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
973 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
974 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
975 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
976 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
977 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
978 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
979 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
980 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
981 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
982 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
983 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
984 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
985 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
986 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
987 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
988 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
989 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
990 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
991 %val.fneg = fneg float %val
992 %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
993 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
; v_neg_rsq_neg_v2f32: <2 x float> form of -1.0 / sqrt(-%val); the quotient
; vector is returned. Uses llvm.sqrt.v2f32 with contract, and the fdiv carries
; !fpmath !0 (defined elsewhere in the file).
; The expected output handles the two elements in v0 and v1 with separate
; f32 instructions: the -UNSAFE variants expect two v_rsq_f32 (negated source)
; and two v_xor_b32 with 0x80000000; the -SAFE variants expect the expanded
; sqrt sequence once per element, ending in v_rcp_f32 (DAZ) or in
; v_frexp_mant / v_rcp / v_frexp_exp / v_ldexp (IEEE).
997 define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
998 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32:
999 ; GCN-DAZ-UNSAFE: ; %bb.0:
1000 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1001 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1002 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1
1003 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1004 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
1005 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1007 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32:
1008 ; GCN-IEEE-UNSAFE: ; %bb.0:
1009 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1010 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1011 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1
1012 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1013 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
1014 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1016 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32:
1017 ; GCN-DAZ-SAFE: ; %bb.0:
1018 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1019 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
1020 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000
1021 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s5
1022 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
1023 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc
1024 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1
1025 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2
1026 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
1027 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
1028 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
1029 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1
1030 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
1031 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
1032 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1033 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1034 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v3, -v0, s5
1035 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
1036 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v3, vcc
1037 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0
1038 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260
1039 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4
1040 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1041 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3
1042 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3
1043 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5
1044 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2
1045 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0
1046 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3
1047 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2
1048 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1049 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1050 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4
1051 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1052 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1053 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
1054 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1056 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
1057 ; SI-IEEE-SAFE: ; %bb.0:
1058 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1059 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
1060 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
1061 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7
1062 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
1063 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc
1064 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
1065 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1066 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
1067 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1068 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1069 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1070 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
1071 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1072 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1073 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1074 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1075 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7
1076 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
1077 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc
1078 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
1079 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1080 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
1081 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1082 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
1083 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
1084 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
1085 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
1086 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
1087 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
1088 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1089 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
1090 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
1091 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1092 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1093 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1094 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
1095 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1096 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
1097 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
1098 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1099 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1100 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1101 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1102 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
1103 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
1104 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
1105 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1106 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1107 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1108 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
1109 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1111 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
1112 ; CI-IEEE-SAFE: ; %bb.0:
1113 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1114 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
1115 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
1116 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7
1117 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
1118 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc
1119 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
1120 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1121 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
1122 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1123 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1124 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1125 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
1126 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1127 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1128 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1129 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1130 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7
1131 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
1132 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc
1133 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
1134 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1135 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
1136 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1137 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
1138 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
1139 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
1140 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
1141 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
1142 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
1143 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1144 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
1145 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
1146 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1147 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1148 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1149 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1150 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1151 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1152 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1153 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1154 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
1155 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1156 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1157 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1158 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
1159 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1160 %val.fneg = fneg <2 x float> %val
1161 %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg)
1162 %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
1163 ret <2 x float> %div
; v_neg_rsq_neg_f32_foldable_user: computes -1.0 / sqrt(-%val0) (contract on
; the sqrt call and the fdiv; the fdiv carries !fpmath !0, defined elsewhere
; in the file) and multiplies the quotient by %val1 with a contract fmul.
; In the -UNSAFE variants the expected output has no v_xor_b32: v_rsq_f32 is
; followed directly by a v_mul_f32 whose first source is negated (-v0). In
; the -SAFE variants the negation stays on v_rcp_f32 (DAZ) or on
; v_frexp_mant_f32 (IEEE) and the final v_mul_f32 uses unmodified operands.
1166 define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
1167 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
1168 ; GCN-DAZ-UNSAFE: ; %bb.0:
1169 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1170 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1171 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1
1172 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1174 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
1175 ; GCN-IEEE-UNSAFE: ; %bb.0:
1176 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1177 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1178 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1
1179 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1181 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
1182 ; GCN-DAZ-SAFE: ; %bb.0:
1183 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1184 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
1185 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
1186 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
1187 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
1188 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0
1189 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2
1190 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
1191 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
1192 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
1193 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0
1194 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
1195 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
1196 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1197 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1198 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1199 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1200 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1201 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1202 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1203 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1205 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
1206 ; SI-IEEE-SAFE: ; %bb.0:
1207 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1208 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
1209 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
1210 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
1211 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
1212 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
1213 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1214 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
1215 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1216 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1217 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1218 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
1219 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1220 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1221 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1222 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1223 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1224 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1225 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1226 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
1227 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1228 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
1229 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
1230 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1231 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1232 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1233 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1234 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1235 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1237 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
1238 ; CI-IEEE-SAFE: ; %bb.0:
1239 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1240 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
1241 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
1242 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
1243 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
1244 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
1245 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1246 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
1247 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1248 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1249 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1250 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
1251 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1252 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1253 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1254 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1255 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1256 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1257 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1258 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1259 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1260 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1261 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1262 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1263 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1264 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1265 %val0.neg = fneg float %val0
1266 %sqrt = call contract float @llvm.sqrt.f32(float %val0.neg)
1267 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
1268 %user = fmul contract float %div, %val1
; Vector (<2 x float>) test of -1.0 / sqrt(-%val0) whose quotient is then
; multiplied by %val1. The sqrt call, the fdiv and the fmul all carry the
; 'contract' flag, and the fdiv is tagged with !fpmath !0 (the metadata node is
; defined outside this part of the file).
; Per the expectations below, the unsafe-math runs select one v_rsq_f32 per
; element with the source negation folded into the operand, and fold the
; negation of the result into the v_mul_f32 user. The remaining runs emit a
; longer multi-instruction sqrt sequence and apply the negation on the source
; of the reciprocal step instead (v_rcp_f32 or v_frexp_mant_f32 with a negated
; operand).
1272 define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
1273 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
1274 ; GCN-DAZ-UNSAFE: ; %bb.0:
1275 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1276 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1277 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1
1278 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2
1279 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3
1280 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1282 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
1283 ; GCN-IEEE-UNSAFE: ; %bb.0:
1284 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1285 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1286 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1
1287 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2
1288 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3
1289 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1291 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
1292 ; GCN-DAZ-SAFE: ; %bb.0:
1293 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1294 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
1295 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000
1296 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s5
1297 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
1298 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc
1299 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v4, v1
1300 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, v1, v4
1301 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0.5, v4
1302 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v4, v5, 0.5
1303 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v6, v5
1304 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v5, v1
1305 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v6, v4
1306 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v7, v4, v5
1307 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1308 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1309 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v5, -v0, s5
1310 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
1311 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v5, vcc
1312 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v0
1313 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
1314 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v6
1315 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1316 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, v0, v5
1317 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5
1318 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v4, 0.5
1319 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v7, v4
1320 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v4, v4, v0
1321 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v7, v5
1322 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v8, v5, v4
1323 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1324 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1325 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v6
1326 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1327 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1328 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
1329 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1330 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1331 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1333 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
1334 ; SI-IEEE-SAFE: ; %bb.0:
1335 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1336 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
1337 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
1338 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7
1339 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
1340 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc
1341 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
1342 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
1343 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
1344 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
1345 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
1346 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
1347 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
1348 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1349 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
1350 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1351 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1352 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7
1353 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
1354 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc
1355 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
1356 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
1357 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
1358 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1359 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
1360 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
1361 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
1362 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
1363 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
1364 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
1365 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
1366 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
1367 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
1368 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
1369 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
1370 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1371 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
1372 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
1373 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
1374 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5]
1375 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1376 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1377 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1378 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
1379 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
1380 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
1381 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v1, v4, s[4:5]
1382 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1383 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1384 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1385 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
1386 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1387 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1388 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1390 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
1391 ; CI-IEEE-SAFE: ; %bb.0:
1392 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1393 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
1394 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
1395 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7
1396 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
1397 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc
1398 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
1399 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
1400 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
1401 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
1402 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
1403 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
1404 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
1405 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1406 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
1407 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1408 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1409 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7
1410 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
1411 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc
1412 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
1413 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
1414 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
1415 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1416 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
1417 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
1418 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
1419 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
1420 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
1421 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
1422 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
1423 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
1424 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
1425 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
1426 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
1427 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1428 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
1429 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1430 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1431 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1432 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
1433 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
1434 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1435 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1436 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1437 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
1438 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1439 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1440 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1441 %val0.fneg = fneg <2 x float> %val0
1442 %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg)
1443 %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
1444 %user = fmul contract <2 x float> %div, %val1
1445 ret <2 x float> %user
; Scalar test of -1.0 / sqrt(%val) with no further user of the quotient in the
; visible IR. The sqrt call and the fdiv carry the 'contract' flag, and the fdiv
; is tagged with !fpmath !0 (the metadata node is defined outside this part of
; the file).
; Per the expectations below, the unsafe-math runs select a single v_rsq_f32
; and then flip the sign bit with v_xor_b32 against 0x80000000. The remaining
; runs emit a longer multi-instruction sqrt sequence and apply the negation on
; the source of the reciprocal step (v_rcp_f32 or v_frexp_mant_f32 with a
; negated operand).
1448 define float @v_neg_rsq_f32(float %val) {
1449 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32:
1450 ; GCN-DAZ-UNSAFE: ; %bb.0:
1451 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1452 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1453 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1454 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1456 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32:
1457 ; GCN-IEEE-UNSAFE: ; %bb.0:
1458 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1459 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1460 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1461 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1463 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32:
1464 ; GCN-DAZ-SAFE: ; %bb.0:
1465 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1466 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1467 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
1468 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1469 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1470 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
1471 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1
1472 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
1473 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5
1474 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2
1475 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0
1476 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1
1477 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2
1478 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
1479 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1480 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
1481 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
1482 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
1483 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1484 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1486 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
1487 ; SI-IEEE-SAFE: ; %bb.0:
1488 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1489 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1490 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
1491 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1492 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1493 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
1494 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
1495 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
1496 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
1497 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
1498 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
1499 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
1500 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
1501 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
1502 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
1503 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1504 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
1505 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
1506 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
1507 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
1508 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
1509 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
1510 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
1511 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
1512 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1513 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1514 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
1515 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1517 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
1518 ; CI-IEEE-SAFE: ; %bb.0:
1519 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1520 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1521 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
1522 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1523 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1524 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
1525 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
1526 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
1527 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
1528 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
1529 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
1530 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
1531 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
1532 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
1533 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
1534 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1535 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
1536 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
1537 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
1538 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
1539 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
1540 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1541 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1542 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
1543 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1544 %sqrt = call contract float @llvm.sqrt.f32(float %val)
1545 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
; Vector (<2 x float>) test of -1.0 / sqrt(%val), returning the quotient
; directly. The sqrt call and the fdiv carry the 'contract' flag, and the fdiv
; is tagged with !fpmath !0 (the metadata node is defined outside this part of
; the file).
; Per the expectations below, the unsafe-math runs select one v_rsq_f32 per
; element and flip each sign bit with v_xor_b32 against 0x80000000. The
; remaining runs emit a longer multi-instruction sqrt sequence per element and
; apply the negation on the source of the reciprocal step (v_rcp_f32 or
; v_frexp_mant_f32 with a negated operand).
1549 define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
1550 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32:
1551 ; GCN-DAZ-UNSAFE: ; %bb.0:
1552 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1553 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1554 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1
1555 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1556 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
1557 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1559 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32:
1560 ; GCN-IEEE-UNSAFE: ; %bb.0:
1561 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1562 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1563 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1
1564 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1565 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
1566 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1568 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32:
1569 ; GCN-DAZ-SAFE: ; %bb.0:
1570 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1571 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1572 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1
1573 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
1574 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1575 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1
1576 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2
1577 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
1578 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
1579 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
1580 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1
1581 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
1582 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
1583 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1584 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1585 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x4f800000, v0
1586 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1587 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
1588 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0
1589 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260
1590 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4
1591 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1592 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3
1593 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3
1594 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5
1595 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2
1596 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0
1597 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3
1598 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2
1599 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1600 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1601 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4
1602 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1603 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1604 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
1605 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1607 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
1608 ; SI-IEEE-SAFE: ; %bb.0:
1609 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1610 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
1611 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1
1612 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
1613 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1614 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
1615 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1616 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
1617 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1618 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1619 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1620 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
1621 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1622 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1623 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1624 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1625 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0
1626 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
1627 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
1628 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
1629 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1630 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
1631 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1632 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
1633 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
1634 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
1635 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
1636 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
1637 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
1638 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1639 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
1640 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
1641 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1642 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1643 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1644 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
1645 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1646 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
1647 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
1648 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1649 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1650 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1651 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1652 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
1653 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
1654 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
1655 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1656 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1657 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1658 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
1659 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1661 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
1662 ; CI-IEEE-SAFE: ; %bb.0:
1663 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1664 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
1665 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1
1666 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
1667 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1668 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
1669 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1670 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
1671 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1672 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1673 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1674 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
1675 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1676 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1677 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1678 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1679 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0
1680 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
1681 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
1682 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
1683 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1684 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
1685 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1686 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
1687 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
1688 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
1689 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
1690 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
1691 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
1692 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1693 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
1694 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
1695 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1696 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1697 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1698 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1699 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1700 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1701 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1702 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1703 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
1704 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1705 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1706 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1707 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
1708 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1709 %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val)
1710 %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
1711 ret <2 x float> %div
; Scalar test of -1.0 / sqrt(%val0) whose quotient is then multiplied by %val1.
; The sqrt call, the fdiv and the fmul all carry the 'contract' flag, and the
; fdiv is tagged with !fpmath !0 (the metadata node is defined outside this
; part of the file).
; Per the expectations below, the unsafe-math runs select a single v_rsq_f32
; and fold the negation into the first source operand of the v_mul_f32 user,
; so no separate sign-flip instruction appears. The remaining runs emit a
; longer multi-instruction sqrt sequence and apply the negation on the source
; of the reciprocal step (v_rcp_f32 or v_frexp_mant_f32 with a negated
; operand) before the final multiply.
1714 define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
1715 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user:
1716 ; GCN-DAZ-UNSAFE: ; %bb.0:
1717 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1718 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1719 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1
1720 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1722 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user:
1723 ; GCN-IEEE-UNSAFE: ; %bb.0:
1724 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1725 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1726 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1
1727 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1729 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
1730 ; GCN-DAZ-SAFE: ; %bb.0:
1731 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1732 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1733 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
1734 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1735 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
1736 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0
1737 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2
1738 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
1739 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
1740 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
1741 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0
1742 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
1743 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
1744 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1745 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1746 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1747 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1748 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1749 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1750 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1751 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1753 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
1754 ; SI-IEEE-SAFE: ; %bb.0:
1755 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1756 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1757 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
1758 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1759 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
1760 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
1761 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1762 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
1763 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1764 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1765 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1766 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
1767 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1768 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1769 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1770 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1771 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1772 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1773 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1774 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
1775 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1776 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
1777 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
1778 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1779 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1780 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1781 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1782 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1783 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1785 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
1786 ; CI-IEEE-SAFE: ; %bb.0:
1787 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1788 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1789 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
1790 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1791 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
1792 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
1793 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1794 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
1795 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1796 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1797 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1798 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
1799 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1800 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1801 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1802 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1803 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1804 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1805 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1806 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1807 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1808 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1809 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1810 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1811 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1812 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1813 %sqrt = call contract float @llvm.sqrt.f32(float %val0)
1814 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
1815 %user = fmul contract float %div, %val1
1819 define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
1820 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
1821 ; GCN-DAZ-UNSAFE: ; %bb.0:
1822 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1823 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1824 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1
1825 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2
1826 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3
1827 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1829 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
1830 ; GCN-IEEE-UNSAFE: ; %bb.0:
1831 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1832 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1833 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1
1834 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2
1835 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3
1836 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1838 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
1839 ; GCN-DAZ-SAFE: ; %bb.0:
1840 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1841 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1842 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1
1843 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
1844 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1845 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v4, v1
1846 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, v1, v4
1847 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0.5, v4
1848 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v4, v5, 0.5
1849 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v6, v5
1850 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v5, v1
1851 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v6, v4
1852 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v7, v4, v5
1853 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1854 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1855 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v0
1856 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1857 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
1858 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v0
1859 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
1860 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v6
1861 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1862 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, v0, v5
1863 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5
1864 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v4, 0.5
1865 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v7, v4
1866 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v4, v4, v0
1867 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v7, v5
1868 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v8, v5, v4
1869 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1870 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1871 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v6
1872 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1873 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1874 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
1875 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1876 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1877 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1879 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
1880 ; SI-IEEE-SAFE: ; %bb.0:
1881 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1882 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
1883 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1
1884 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
1885 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1886 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
1887 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
1888 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
1889 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
1890 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
1891 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
1892 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
1893 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1894 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
1895 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1896 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1897 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v0
1898 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
1899 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
1900 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
1901 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
1902 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
1903 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1904 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
1905 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
1906 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
1907 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
1908 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
1909 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
1910 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
1911 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
1912 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
1913 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
1914 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
1915 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1916 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
1917 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
1918 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
1919 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5]
1920 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1921 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1922 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1923 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
1924 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
1925 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
1926 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v1, v4, s[4:5]
1927 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1928 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1929 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1930 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
1931 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1932 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1933 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1935 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
1936 ; CI-IEEE-SAFE: ; %bb.0:
1937 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1938 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
1939 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1
1940 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
1941 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1942 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
1943 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
1944 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
1945 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
1946 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
1947 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
1948 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
1949 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1950 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
1951 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1952 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1953 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v0
1954 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
1955 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
1956 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
1957 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
1958 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
1959 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1960 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
1961 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
1962 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
1963 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
1964 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
1965 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
1966 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
1967 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
1968 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
1969 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
1970 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
1971 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1972 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
1973 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1974 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1975 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1976 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
1977 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
1978 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1979 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1980 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1981 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
1982 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1983 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1984 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1985 %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0)
1986 %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
1987 %user = fmul contract <2 x float> %div, %val1
1988 ret <2 x float> %user
; Basic reciprocal square root: 1.0 / sqrt(%val), where both the sqrt call and
; the fdiv carry the contract flag and !fpmath !1.
; Expected code per the checks below:
;  - preserve-sign (DAZ) runs and IEEE unsafe-math runs: a single v_rsq_f32.
;  - IEEE safe-math runs: the input is multiplied by 0x4b800000 when it
;    compares below 0x800000 (by 1.0 otherwise), v_rsq_f32 is applied, and the
;    result is multiplied by 0x45800000 under the same condition.
1991 define float @v_rsq_f32(float %val) {
1992 ; GCN-DAZ-LABEL: v_rsq_f32:
1994 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1995 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
1996 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
1998 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32:
1999 ; GCN-IEEE-UNSAFE: ; %bb.0:
2000 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2001 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2002 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2004 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32:
2005 ; GCN-IEEE-SAFE: ; %bb.0:
2006 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2007 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
2008 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4b800000
2009 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2010 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
2011 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
2012 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
2013 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x45800000
2014 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
2015 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
2016 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2017 %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
2018 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
; Same sqrt + fdiv pattern as v_rsq_f32, but the sqrt result has a second use:
; it is stored in field 0 of the returned struct while the reciprocal goes in
; field 1.
; Expected code per the checks below:
;  - unsafe-math runs: v_sqrt_f32 and v_rsq_f32 are both computed from the
;    original input.
;  - preserve-sign safe-math runs: v_sqrt_f32 followed by v_rcp_f32 of that
;    result.
;  - IEEE safe-math runs: a longer sequence around v_sqrt_f32, followed by a
;    v_frexp_mant / v_rcp / v_frexp_exp / v_ldexp sequence for the reciprocal.
;    The tahiti (SI) output additionally selects between the input and its
;    frexp mantissa using a |v0| < 0x7f800000 compare; the hawaii (CI) output
;    has no such compare.
2022 define { float, float } @v_rsq_f32_multi_use(float %val) {
2023 ; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_multi_use:
2024 ; GCN-DAZ-UNSAFE: ; %bb.0:
2025 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2026 ; GCN-DAZ-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v0
2027 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v0
2028 ; GCN-DAZ-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
2029 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2031 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_multi_use:
2032 ; GCN-IEEE-UNSAFE: ; %bb.0:
2033 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2034 ; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v0
2035 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v1, v0
2036 ; GCN-IEEE-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
2037 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2039 ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_multi_use:
2040 ; GCN-DAZ-SAFE: ; %bb.0:
2041 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2042 ; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
2043 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v1, v0
2044 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
2046 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
2047 ; SI-IEEE-SAFE: ; %bb.0:
2048 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2049 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2050 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2051 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2052 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2053 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2054 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2055 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2056 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2057 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2058 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2059 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2060 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2061 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2062 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2063 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2064 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2065 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2066 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2067 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
2068 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2069 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
2070 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
2071 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2072 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
2073 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v2, vcc, 0, v2
2074 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v1, v2
2075 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2077 ; CI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
2078 ; CI-IEEE-SAFE: ; %bb.0:
2079 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2080 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2081 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2082 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2083 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2084 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2085 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2086 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2087 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2088 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2089 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2090 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2091 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2092 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2093 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2094 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2095 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2096 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2097 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2098 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2099 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2100 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
2101 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v2, vcc, 0, v2
2102 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v1, v2
2103 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2104 %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
2105 %insert.0 = insertvalue { float, float } poison, float %sqrt, 0
2106 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
2107 %insert.1 = insertvalue { float, float } %insert.0, float %div, 1
2108 ret { float, float } %insert.1
; Variant of v_rsq_f32 in which the sqrt call has no contract flag; only the
; fdiv carries it.
; Expected code per the checks below:
;  - unsafe-math runs: still a single v_rsq_f32.
;  - preserve-sign safe-math runs: v_sqrt_f32 followed by v_rcp_f32.
;  - IEEE safe-math runs: a longer sequence around v_sqrt_f32, followed by a
;    v_frexp_mant / v_rcp / v_frexp_exp / v_ldexp sequence; no v_rsq_f32 is
;    emitted.
2111 define float @v_rsq_f32_missing_contract0(float %val) {
2112 ; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract0:
2113 ; GCN-DAZ-UNSAFE: ; %bb.0:
2114 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2115 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2116 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2118 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract0:
2119 ; GCN-IEEE-UNSAFE: ; %bb.0:
2120 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2121 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2122 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2124 ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract0:
2125 ; GCN-DAZ-SAFE: ; %bb.0:
2126 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2127 ; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
2128 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
2129 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
2131 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
2132 ; SI-IEEE-SAFE: ; %bb.0:
2133 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2134 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2135 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2136 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2137 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2138 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2139 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2140 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2141 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2142 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2143 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2144 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2145 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2146 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2147 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2148 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2149 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2150 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2151 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2152 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
2153 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2154 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
2155 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
2156 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2157 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
2158 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
2159 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
2160 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2162 ; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
2163 ; CI-IEEE-SAFE: ; %bb.0:
2164 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2165 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2166 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2167 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2168 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2169 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2170 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2171 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2172 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2173 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2174 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2175 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2176 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2177 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2178 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2179 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2180 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2181 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2182 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2183 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2184 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2185 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
2186 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
2187 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
2188 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2189 %sqrt = call float @llvm.sqrt.f32(float %val), !fpmath !1
2190 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
; Variant of v_rsq_f32 in which the fdiv has no contract flag; only the sqrt
; call carries it.
; Expected code per the checks below:
;  - unsafe-math runs: still a single v_rsq_f32.
;  - preserve-sign safe-math runs: v_sqrt_f32 followed by v_rcp_f32.
;  - IEEE safe-math runs: a longer sequence around v_sqrt_f32, followed by a
;    v_frexp_mant / v_rcp / v_frexp_exp / v_ldexp sequence; no v_rsq_f32 is
;    emitted.
2194 define float @v_rsq_f32_missing_contract1(float %val) {
2195 ; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract1:
2196 ; GCN-DAZ-UNSAFE: ; %bb.0:
2197 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2198 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2199 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2201 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract1:
2202 ; GCN-IEEE-UNSAFE: ; %bb.0:
2203 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2204 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2205 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2207 ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract1:
2208 ; GCN-DAZ-SAFE: ; %bb.0:
2209 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2210 ; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
2211 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
2212 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
2214 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
2215 ; SI-IEEE-SAFE: ; %bb.0:
2216 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2217 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2218 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2219 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2220 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2221 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2222 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2223 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2224 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2225 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2226 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2227 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2228 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2229 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2230 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2231 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2232 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2233 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2234 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2235 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
2236 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2237 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
2238 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
2239 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2240 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
2241 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
2242 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
2243 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2245 ; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
2246 ; CI-IEEE-SAFE: ; %bb.0:
2247 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2248 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2249 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2250 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2251 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2252 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2253 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2254 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2255 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2256 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2257 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2258 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2259 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2260 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2261 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2262 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2263 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2264 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2265 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2266 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2267 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2268 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
2269 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
2270 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
2271 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2272 %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
2273 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
2277 ; Test that we contract into FMA for an fadd user after introducing
; the multiply that rescales the v_rsq_f32 result.
; The IR is 1.0 / sqrt(%val0) + %val1 with the contract flag on the sqrt call,
; the fdiv and the fadd. In the IEEE safe-math checks the final rescale and the
; add appear as one v_fma_f32; the other runs expect v_rsq_f32 followed by
; v_add_f32.
2279 define float @v_rsq_f32_contractable_user(float %val0, float %val1) {
2280 ; GCN-DAZ-LABEL: v_rsq_f32_contractable_user:
2282 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2283 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
2284 ; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1
2285 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
2287 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user:
2288 ; GCN-IEEE-UNSAFE: ; %bb.0:
2289 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2290 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2291 ; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1
2292 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2294 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user:
2295 ; GCN-IEEE-SAFE: ; %bb.0:
2296 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2297 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
2298 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000
2299 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2300 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2301 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
2302 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
2303 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000
2304 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2305 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1
2306 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2307 %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
2308 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
2309 %add = fadd contract float %div, %val1
2313 ; Missing contract on the fdiv
; NOTE(review): the fdiv below still carries the contract flag, so this IR is
; currently identical to v_rsq_f32_contractable_user and the expected output
; (including the v_fma_f32 in the IEEE safe-math checks) is the same. The case
; named in the comment above is therefore not actually exercised. To cover it,
; drop the contract flag from the fdiv and regenerate the assertions with
; utils/update_llc_test_checks.py.
2314 define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float %val1) {
2315 ; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract0:
2317 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2318 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
2319 ; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1
2320 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
2322 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
2323 ; GCN-IEEE-UNSAFE: ; %bb.0:
2324 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2325 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2326 ; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1
2327 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2329 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
2330 ; GCN-IEEE-SAFE: ; %bb.0:
2331 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2332 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
2333 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000
2334 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2335 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2336 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
2337 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
2338 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000
2339 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2340 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1
2341 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2342 %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
2343 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
2344 %add = fadd contract float %div, %val1
2348 ; Missing contract on the fadd
; The sqrt call and the fdiv carry the contract flag but the fadd user does
; not. In the IEEE safe-math checks the multiply that rescales the v_rsq_f32
; result stays a separate v_mul_f32 followed by v_add_f32, with no v_fma_f32.
; The other runs expect v_rsq_f32 followed by v_add_f32.
2349 define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float %val1) {
2350 ; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract1:
2352 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2353 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
2354 ; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1
2355 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
2357 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
2358 ; GCN-IEEE-UNSAFE: ; %bb.0:
2359 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2360 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2361 ; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1
2362 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2364 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
2365 ; GCN-IEEE-SAFE: ; %bb.0:
2366 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2367 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
2368 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000
2369 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2370 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2371 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
2372 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
2373 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000
2374 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2375 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
2376 ; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1
2377 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2378 %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
2379 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
2380 %add = fadd float %div, %val1
; Same sqrt + fdiv pattern as v_rsq_f32, but the argument is annotated
; nofpclass(sub). Every run configuration, including the IEEE safe-math runs,
; expects a bare v_rsq_f32 with no input or result rescaling around it.
2384 define float @v_rsq_f32_known_never_denormal(float nofpclass(sub) %val) {
2385 ; GCN-DAZ-LABEL: v_rsq_f32_known_never_denormal:
2387 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2388 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
2389 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
2391 ; GCN-IEEE-LABEL: v_rsq_f32_known_never_denormal:
2392 ; GCN-IEEE: ; %bb.0:
2393 ; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2394 ; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
2395 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
2396 %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
2397 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
; Same sqrt + fdiv pattern as v_rsq_f32, but the argument is annotated
; nofpclass(psub) only. Unlike the nofpclass(sub) variant, the IEEE safe-math
; checks still expect the input/result rescaling sequence around v_rsq_f32;
; the other runs expect a bare v_rsq_f32.
2401 define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) {
2402 ; GCN-DAZ-LABEL: v_rsq_f32_known_never_posdenormal:
2404 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2405 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
2406 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
2408 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_known_never_posdenormal:
2409 ; GCN-IEEE-UNSAFE: ; %bb.0:
2410 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2411 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2412 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2414 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_known_never_posdenormal:
2415 ; GCN-IEEE-SAFE: ; %bb.0:
2416 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2417 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
2418 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4b800000
2419 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2420 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
2421 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
2422 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
2423 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x45800000
2424 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
2425 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
2426 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2427 %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
2428 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
; Metadata nodes used as !fpmath operands by the tests above. Per the LLVM
; language reference, !fpmath gives the maximum acceptable error of the
; annotated operation in ULPs.
;   !0 (2.5 ULP) is attached to the fdiv in v_neg_rsq_v2f32_foldable_user.
;   !1 (1.0 ULP) is attached to the sqrt calls and fdivs in the v_rsq_f32*
;      tests.
; Attribute group #0 sets "denormal-fp-math-f32" to preserve-sign. None of the
; define lines visible in this part of the file reference #0.
; NOTE(review): confirm it is still used earlier in the file.
2432 !0 = !{float 2.500000e+00}
2433 !1 = !{float 1.000000e+00}
2435 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
2436 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2437 ; CI-DAZ-SAFE: {{.*}}
2438 ; CI-DAZ-UNSAFE: {{.*}}
2439 ; CI-IEEE-UNSAFE: {{.*}}
2440 ; SI-DAZ-SAFE: {{.*}}
2441 ; SI-DAZ-UNSAFE: {{.*}}
2442 ; SI-IEEE-UNSAFE: {{.*}}