1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,SI-DAZ-UNSAFE %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,SI-IEEE-UNSAFE %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,SI-DAZ-SAFE %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,SI-IEEE-SAFE %s
9 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,CI-DAZ-UNSAFE %s
10 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,CI-IEEE-UNSAFE %s
12 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,CI-DAZ-SAFE %s
13 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,CI-IEEE-SAFE %s
; Intrinsics called by the kernels in this file - the work-item id query and
; the f32 square root.
16 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
17 declare float @llvm.sqrt.f32(float) nounwind readnone
; NOTE(review): no call to the v2f32 variant is visible in this part of the
; file - presumably it is used by a vector test further down; confirm.
18 declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
; rsq_f32 - loads one float from %in, computes 1.0 / sqrt(x) (both operations
; carry the contract flag, the fdiv also carries !fpmath metadata) and stores
; the result to %out.
;
; What the checks below record per configuration -
;   * unsafe-math runs (DAZ and IEEE) - the whole expression folds to a single
;     v_rsq_f32.
;   * DAZ safe runs - input is pre-scaled by 0x4f800000 when below 0xf800000,
;     sqrt is built from v_rsq_f32 plus fma refinement, rescaled by
;     0x37800000, and the reciprocal is a plain v_rcp_f32.
;   * IEEE safe runs - same pre/post scaling, but sqrt starts from v_sqrt_f32
;     and picks between the result and its integer -1/+1 neighbours using fma
;     residuals; the reciprocal is done as frexp_mant / v_rcp_f32 / ldexp.
;     The SI variant additionally selects on |x| < 0x7f800000 before the rcp,
;     the CI variant does not.
;
; Check lines for a prefix that no RUN line enables were dropped; FileCheck
; never evaluated them and they had drifted from the current output.
20 define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
21 ; GCN-DAZ-UNSAFE-LABEL: rsq_f32:
22 ; GCN-DAZ-UNSAFE: ; %bb.0:
23 ; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
24 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
25 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1
26 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6
27 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7
28 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
29 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2
30 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3
31 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
32 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0
33 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1
34 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
35 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
36 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
37 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm
39 ; GCN-IEEE-UNSAFE-LABEL: rsq_f32:
40 ; GCN-IEEE-UNSAFE: ; %bb.0:
41 ; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
42 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
43 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1
44 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6
45 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7
46 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
47 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2
48 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3
49 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
50 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0
51 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1
52 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
53 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
54 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
55 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm
57 ; GCN-DAZ-SAFE-LABEL: rsq_f32:
58 ; GCN-DAZ-SAFE: ; %bb.0:
59 ; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
60 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000
61 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1
62 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6
63 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7
64 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
65 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2
66 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3
67 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
68 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000
69 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
70 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0
71 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1
72 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
73 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
74 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0
75 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
76 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
77 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1
78 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
79 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5
80 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
81 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1
82 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0
83 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3
84 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1
85 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
86 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
87 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
88 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
89 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
90 ; GCN-DAZ-SAFE-NEXT: s_endpgm
92 ; SI-IEEE-SAFE-LABEL: rsq_f32:
93 ; SI-IEEE-SAFE: ; %bb.0:
94 ; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
95 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
96 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
97 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
98 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
99 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
100 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
101 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
102 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
103 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
104 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
105 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000
106 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
107 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
108 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
109 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
110 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
111 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
112 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
113 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
114 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
115 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
116 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
117 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
118 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
119 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
120 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
121 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
122 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
123 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
124 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
125 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
126 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
127 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
128 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
129 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
130 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
131 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
132 ; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
133 ; SI-IEEE-SAFE-NEXT: s_endpgm
135 ; CI-IEEE-SAFE-LABEL: rsq_f32:
136 ; CI-IEEE-SAFE: ; %bb.0:
137 ; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
138 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
139 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
140 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
141 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
142 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
143 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
144 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
145 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
146 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
147 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
148 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
149 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
150 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
151 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
152 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
153 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
154 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
155 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
156 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
157 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
158 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
159 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
160 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
161 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
162 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
163 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
164 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
165 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
166 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
167 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
168 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
169 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
170 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
171 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
172 ; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
173 ; CI-IEEE-SAFE-NEXT: s_endpgm
191 %val = load float, ptr addrspace(1) %in, align 4
192 %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone
193 %div = fdiv contract float 1.0, %sqrt, !fpmath !0
194 store float %div, ptr addrspace(1) %out, align 4
; rsq_f32_sgpr - same 1.0 / sqrt(x) pattern as the global-load variant, but
; the input float is a kernel argument, so the checks read it with
; s_load_dword into a scalar register instead of a buffer load.
;
; What the checks below record per configuration -
;   * unsafe-math runs (DAZ and IEEE) - a single v_rsq_f32 fed directly from
;     the scalar register.
;   * DAZ safe runs - scaled sqrt built from v_rsq_f32 plus fma refinement,
;     followed by a plain v_rcp_f32.
;   * IEEE safe runs - scaled sqrt built from v_sqrt_f32 with the -1/+1
;     neighbour selection, followed by frexp_mant / v_rcp_f32 / ldexp; only
;     the SI variant has the |x| < 0x7f800000 select in front of the rcp.
;
; Check lines for a prefix that no RUN line enables were dropped; FileCheck
; never evaluated them and they had drifted from the current output.
198 define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) {
199 ; GCN-DAZ-UNSAFE-LABEL: rsq_f32_sgpr:
200 ; GCN-DAZ-UNSAFE: ; %bb.0:
201 ; GCN-DAZ-UNSAFE-NEXT: s_load_dword s2, s[4:5], 0xb
202 ; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
203 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s3, 0xf000
204 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
205 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2
206 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s2, -1
207 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0
208 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm
210 ; GCN-IEEE-UNSAFE-LABEL: rsq_f32_sgpr:
211 ; GCN-IEEE-UNSAFE: ; %bb.0:
212 ; GCN-IEEE-UNSAFE-NEXT: s_load_dword s2, s[4:5], 0xb
213 ; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
214 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s3, 0xf000
215 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
216 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2
217 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s2, -1
218 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0
219 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm
221 ; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr:
222 ; GCN-DAZ-SAFE: ; %bb.0:
223 ; GCN-DAZ-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb
224 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000
225 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000
226 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000
227 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1
228 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
229 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1
230 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s0
231 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
232 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
233 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
234 ; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
235 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1
236 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
237 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5
238 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2
239 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1
240 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v2, v2, v0
241 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v3, v1, v2
242 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
243 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
244 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
245 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
246 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
247 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
248 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
249 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0
250 ; GCN-DAZ-SAFE-NEXT: s_endpgm
252 ; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
253 ; SI-IEEE-SAFE: ; %bb.0:
254 ; SI-IEEE-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb
255 ; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
256 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000
257 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000
258 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
259 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
260 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1
261 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0
262 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
263 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
264 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
265 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
266 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1
267 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
268 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3
269 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
270 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1
271 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
272 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
273 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
274 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
275 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
276 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
277 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
278 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
279 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x7f800000
280 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
281 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s0
282 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
283 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
284 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
285 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
286 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
287 ; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
288 ; SI-IEEE-SAFE-NEXT: s_endpgm
290 ; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
291 ; CI-IEEE-SAFE: ; %bb.0:
292 ; CI-IEEE-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb
293 ; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
294 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000
295 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000
296 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
297 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
298 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1
299 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0
300 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
301 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
302 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
303 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
304 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1
305 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
306 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3
307 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
308 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1
309 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
310 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
311 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
312 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
313 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
314 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
315 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
316 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
317 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
318 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
319 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
320 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
321 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
322 ; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
323 ; CI-IEEE-SAFE-NEXT: s_endpgm
334 %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone
335 %div = fdiv contract float 1.0, %sqrt, !fpmath !0
336 store float %div, ptr addrspace(1) %out, align 4
340 ; Recognize that this is rsqrt(a) * rcp(b) * c,
341 ; not 1 / ( 1 / sqrt(a)) * rcp(b) * c.
343 ; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMDGPUCodeGenPrepare.
; rsqrt_fmul - per work-item, volatile-loads a, b and c from three consecutive
; floats starting at %in[tid], computes c / (sqrt(a) * b) with contract on
; every operation, and stores the result to %out[tid].
;
; What the checks below record per configuration -
;   * unsafe-math runs (DAZ and IEEE) - v_rsq_f32 of a, v_rcp_f32 of b, then
;     two multiplies, i.e. rsq(a) * rcp(b) * c.
;   * DAZ safe runs - scaled sqrt built from v_rsq_f32 plus fma refinement,
;     a multiply by b, then the div_scale / div_fmas / div_fixup division
;     sequence with the fma core bracketed by two s_setreg_imm32_b32 writes
;     to HW_REG_MODE.
;   * IEEE safe runs - scaled sqrt built from v_sqrt_f32 with the -1/+1
;     neighbour selection, a multiply by b, then the same division sequence
;     without the mode-register writes.
;
; Check lines for a prefix that no RUN line enables were dropped; FileCheck
; never evaluated them, and they described a sqrt + rcp sequence that
; disagrees with the live unsafe-math expectations.
344 define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %in) {
368 ; GCN-DAZ-UNSAFE-LABEL: rsqrt_fmul:
369 ; GCN-DAZ-UNSAFE: ; %bb.0:
370 ; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
371 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
372 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, 0
373 ; GCN-DAZ-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
374 ; GCN-DAZ-UNSAFE-NEXT: v_mov_b32_e32 v1, 0
375 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
376 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[8:9], s[2:3]
377 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[10:11], s[6:7]
378 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
379 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
380 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
381 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
382 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
383 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
384 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1]
385 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v2, v2
386 ; GCN-DAZ-UNSAFE-NEXT: v_rcp_f32_e32 v3, v3
387 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3
388 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2
389 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
390 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm
392 ; GCN-IEEE-UNSAFE-LABEL: rsqrt_fmul:
393 ; GCN-IEEE-UNSAFE: ; %bb.0:
394 ; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
395 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
396 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, 0
397 ; GCN-IEEE-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
398 ; GCN-IEEE-UNSAFE-NEXT: v_mov_b32_e32 v1, 0
399 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
400 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[8:9], s[2:3]
401 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[10:11], s[6:7]
402 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
403 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
404 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
405 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
406 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
407 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
408 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b64 s[4:5], s[0:1]
409 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v2, v2
410 ; GCN-IEEE-UNSAFE-NEXT: v_rcp_f32_e32 v3, v3
411 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v2, v2, v3
412 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e32 v2, v4, v2
413 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
414 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm
416 ; GCN-DAZ-SAFE-LABEL: rsqrt_fmul:
417 ; GCN-DAZ-SAFE: ; %bb.0:
418 ; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
419 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000
420 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0
421 ; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
422 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0
423 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
424 ; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7]
425 ; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3]
426 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
427 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
428 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
429 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
430 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
431 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
432 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s0, 0xf800000
433 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
434 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2
435 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2
436 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
437 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v2
438 ; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5]
439 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v2, v5
440 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5
441 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v5, v7, 0.5
442 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v7, v8, v7
443 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v8, v5
444 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v7, v7, v2
445 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v8, v5, v7
446 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5
447 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
448 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6
449 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
450 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3
451 ; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v4
452 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v5, v3
453 ; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4
454 ; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
455 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
456 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5
457 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5
458 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6
459 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7
460 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6
461 ; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
462 ; GCN-DAZ-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7
463 ; GCN-DAZ-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4
464 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
465 ; GCN-DAZ-SAFE-NEXT: s_endpgm
467 ; GCN-IEEE-SAFE-LABEL: rsqrt_fmul:
468 ; GCN-IEEE-SAFE: ; %bb.0:
469 ; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
470 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000
471 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0
472 ; GCN-IEEE-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
473 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0
474 ; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
475 ; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7]
476 ; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3]
477 ; GCN-IEEE-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
478 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
479 ; GCN-IEEE-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
480 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
481 ; GCN-IEEE-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
482 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
483 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
484 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
485 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2
486 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2
487 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
488 ; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v5, v2
489 ; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[0:1], -1, v5
490 ; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v8, s[0:1], 1, v5
491 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v9, -v7, v5, v2
492 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v10, -v8, v5, v2
493 ; GCN-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v9
494 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1]
495 ; GCN-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v10
496 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[0:1]
497 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5
498 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
499 ; GCN-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6
500 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
501 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3
502 ; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v4
503 ; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v5, v3
504 ; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4
505 ; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5]
506 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
507 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5
508 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5
509 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6
510 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7
511 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6
512 ; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7
513 ; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4
514 ; GCN-IEEE-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
515 ; GCN-IEEE-SAFE-NEXT: s_endpgm
516 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
517 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
518 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
519 %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
520 %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
522 %a = load volatile float, ptr addrspace(1) %gep.0
523 %b = load volatile float, ptr addrspace(1) %gep.1
524 %c = load volatile float, ptr addrspace(1) %gep.2
526 %x = call contract float @llvm.sqrt.f32(float %a)
527 %y = fmul contract float %x, %b
528 %z = fdiv contract float %c, %y
529 store float %z, ptr addrspace(1) %out.gep
533 define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
534 ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_f32:
535 ; GCN-DAZ-UNSAFE: ; %bb.0:
536 ; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
537 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
538 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1
539 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6
540 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7
541 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
542 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2
543 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3
544 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
545 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0
546 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1
547 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
548 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
549 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
550 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
551 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm
553 ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_f32:
554 ; GCN-IEEE-UNSAFE: ; %bb.0:
555 ; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
556 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
557 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1
558 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6
559 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7
560 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
561 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2
562 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3
563 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
564 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0
565 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1
566 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
567 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
568 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
569 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
570 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm
572 ; GCN-DAZ-SAFE-LABEL: neg_rsq_f32:
573 ; GCN-DAZ-SAFE: ; %bb.0:
574 ; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
575 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000
576 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1
577 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6
578 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7
579 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
580 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2
581 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3
582 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
583 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000
584 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
585 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0
586 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1
587 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
588 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
589 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0
590 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
591 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
592 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1
593 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
594 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5
595 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
596 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1
597 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0
598 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3
599 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1
600 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
601 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
602 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
603 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
604 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
605 ; GCN-DAZ-SAFE-NEXT: s_endpgm
607 ; SI-IEEE-SAFE-LABEL: neg_rsq_f32:
608 ; SI-IEEE-SAFE: ; %bb.0:
609 ; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
610 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
611 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
612 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
613 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
614 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
615 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
616 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
617 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
618 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
619 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
620 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000
621 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
622 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
623 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
624 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
625 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
626 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
627 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
628 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
629 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
630 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
631 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
632 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
633 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
634 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
635 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
636 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
637 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
638 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
639 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
640 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
641 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2
642 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
643 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
644 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
645 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
646 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
647 ; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
648 ; SI-IEEE-SAFE-NEXT: s_endpgm
650 ; CI-IEEE-SAFE-LABEL: neg_rsq_f32:
651 ; CI-IEEE-SAFE: ; %bb.0:
652 ; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
653 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
654 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
655 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
656 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
657 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
658 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
659 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
660 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
661 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
662 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
663 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
664 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
665 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
666 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
667 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
668 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
669 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
670 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
671 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
672 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
673 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
674 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
675 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
676 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
677 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
678 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
679 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
680 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
681 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
682 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
683 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
684 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
685 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
686 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
687 ; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
688 ; CI-IEEE-SAFE-NEXT: s_endpgm
689 ; GCN-UNSAFE-LABEL: neg_rsq_f32:
690 ; GCN-UNSAFE: ; %bb.0:
691 ; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
692 ; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
693 ; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1
694 ; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6
695 ; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7
696 ; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
697 ; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2
698 ; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3
699 ; GCN-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
700 ; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0
701 ; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1
702 ; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0)
703 ; GCN-UNSAFE-NEXT: v_sqrt_f32_e32 v0, v0
704 ; GCN-UNSAFE-NEXT: v_rcp_f32_e64 v0, -v0
705 ; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
706 ; GCN-UNSAFE-NEXT: s_endpgm
707 %val = load float, ptr addrspace(1) %in, align 4
708 %sqrt = call contract float @llvm.sqrt.f32(float %val)
709 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
710 store float %div, ptr addrspace(1) %out, align 4
; neg_rsq_neg_f32: kernel that loads a float from %in, negates it, takes
; llvm.sqrt.f32 of the negated value (contract), divides -1.0 by that result
; (contract, with !fpmath !0 attached) and stores the quotient to %out.
;
; Expected code per check prefix (prefixes are selected by the RUN lines at
; the top of the file):
;   GCN-DAZ-UNSAFE / GCN-IEEE-UNSAFE: a single v_rsq_f32 with a negated
;     source operand, followed by v_xor_b32 with 0x80000000.
;   GCN-DAZ-SAFE: a v_rsq_f32-based sequence refined with v_fma_f32, then
;     v_rcp_f32 with a negated source operand.
;   SI-IEEE-SAFE / CI-IEEE-SAFE: v_sqrt_f32 followed by an integer +/-1
;     adjustment selected via v_fma_f32 / v_cmp / v_cndmask, then a
;     v_frexp_mant / v_rcp_f32 / v_frexp_exp / v_ldexp sequence. The SI
;     variant additionally compares |v0| against 0x7f800000 and selects
;     between -v0 and the frexp mantissa before v_rcp_f32.
;
; Only prefixes that some RUN line passes to FileCheck are kept here; check
; lines under any other prefix are never evaluated and should not be added.
714 define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
715 ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_neg_f32:
716 ; GCN-DAZ-UNSAFE: ; %bb.0:
717 ; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
718 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
719 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1
720 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6
721 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s11, s7
722 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
723 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s8, s2
724 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s9, s3
725 ; GCN-DAZ-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
726 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s4, s0
727 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1
728 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
729 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
730 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
731 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
732 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm
734 ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_neg_f32:
735 ; GCN-IEEE-UNSAFE: ; %bb.0:
736 ; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
737 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000
738 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1
739 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6
740 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s11, s7
741 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0)
742 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s8, s2
743 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s9, s3
744 ; GCN-IEEE-UNSAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
745 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s4, s0
746 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s5, s1
747 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0)
748 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
749 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
750 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
751 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm
753 ; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32:
754 ; GCN-DAZ-SAFE: ; %bb.0:
755 ; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
756 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000
757 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1
758 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6
759 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7
760 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
761 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2
762 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3
763 ; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
764 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0x8f800000
765 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
766 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0
767 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1
768 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
769 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
770 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
771 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
772 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
773 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1
774 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
775 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5
776 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
777 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1
778 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0
779 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3
780 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1
781 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
782 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
783 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
784 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
785 ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
786 ; GCN-DAZ-SAFE-NEXT: s_endpgm
788 ; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
789 ; SI-IEEE-SAFE: ; %bb.0:
790 ; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
791 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
792 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
793 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
794 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
795 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
796 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
797 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
798 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
799 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000
800 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
801 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000
802 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
803 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
804 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
805 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
806 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
807 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
808 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
809 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
810 ; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
811 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
812 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
813 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
814 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
815 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
816 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
817 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
818 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
819 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
820 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
821 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
822 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2
823 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
824 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
825 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
826 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
827 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
828 ; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
829 ; SI-IEEE-SAFE-NEXT: s_endpgm
831 ; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
832 ; CI-IEEE-SAFE: ; %bb.0:
833 ; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
834 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
835 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
836 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
837 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
838 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
839 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
840 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
841 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
842 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000
843 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
844 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
845 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
846 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
847 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
848 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
849 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
850 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
851 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
852 ; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
853 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
854 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
855 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
856 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
857 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
858 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
859 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
860 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
861 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
862 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
863 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
864 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
865 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
866 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
867 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
868 ; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
869 ; CI-IEEE-SAFE-NEXT: s_endpgm
888 %val = load float, ptr addrspace(1) %in, align 4
889 %val.fneg = fneg float %val
890 %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
891 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
892 store float %div, ptr addrspace(1) %out, align 4
; v_neg_rsq_neg_f32: non-kernel function taking a float %val. It negates
; %val, takes llvm.sqrt.f32 of the negated value (contract) and divides -1.0
; by that result (contract, with !fpmath !0 attached).
;
; Expected code per check prefix (prefixes are selected by the RUN lines at
; the top of the file):
;   GCN-DAZ-UNSAFE / GCN-IEEE-UNSAFE: v_rsq_f32 with a negated source
;     operand, followed by v_xor_b32 with 0x80000000.
;   GCN-DAZ-SAFE: a v_rsq_f32-based sequence refined with v_fma_f32, then
;     v_rcp_f32 with a negated source operand.
;   SI-IEEE-SAFE / CI-IEEE-SAFE: v_sqrt_f32 followed by an integer +/-1
;     adjustment selected via v_fma_f32 / v_cmp / v_cndmask, then a
;     v_frexp_mant / v_rcp_f32 / v_frexp_exp / v_ldexp sequence. The SI
;     variant additionally compares |v0| against 0x7f800000 and selects
;     between -v0 and the frexp mantissa before v_rcp_f32.
; Every check block ends with s_setpc_b64 s[30:31].
896 define float @v_neg_rsq_neg_f32(float %val) {
897 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32:
898 ; GCN-DAZ-UNSAFE: ; %bb.0:
899 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
900 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
901 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
902 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
904 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32:
905 ; GCN-IEEE-UNSAFE: ; %bb.0:
906 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
907 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
908 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
909 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
911 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32:
912 ; GCN-DAZ-SAFE: ; %bb.0:
913 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
914 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
915 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
916 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
917 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
918 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
919 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1
920 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
921 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5
922 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2
923 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0
924 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1
925 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2
926 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
927 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
928 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
929 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
930 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
931 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
932 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
934 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
935 ; SI-IEEE-SAFE: ; %bb.0:
936 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
937 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
938 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
939 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
940 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
941 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
942 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
943 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
944 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
945 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
946 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
947 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
948 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
949 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
950 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
951 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
952 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
953 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
954 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
955 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
956 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
957 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
958 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
959 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
960 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
961 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
962 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
963 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
965 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
966 ; CI-IEEE-SAFE: ; %bb.0:
967 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
968 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
969 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
970 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
971 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
972 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
973 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
974 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
975 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
976 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
977 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
978 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
979 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
980 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
981 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
982 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
983 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
984 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
985 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
986 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
987 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
988 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
989 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
990 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
991 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
992 %val.fneg = fneg float %val
993 %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
994 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
; v_neg_rsq_neg_v2f32: <2 x float> form of the -1.0 / sqrt(-x) pattern. It
; negates %val, calls llvm.sqrt.v2f32 on the negated vector (contract),
; divides <-1.0, -1.0> by the result (contract, with !fpmath !0 attached) and
; returns the quotient.
;
; Expected code per check prefix (prefixes are selected by the RUN lines at
; the top of the file); every block contains one instruction sequence for v0
; and one for v1:
;   GCN-DAZ-UNSAFE / GCN-IEEE-UNSAFE: two v_rsq_f32 with negated source
;     operands, followed by two v_xor_b32 with 0x80000000.
;   GCN-DAZ-SAFE: per element, a v_rsq_f32-based sequence refined with
;     v_fma_f32, then v_rcp_f32 with a negated source operand.
;   SI-IEEE-SAFE / CI-IEEE-SAFE: per element, v_sqrt_f32 followed by an
;     integer +/-1 adjustment selected via v_fma_f32 / v_cmp / v_cndmask,
;     then a v_frexp_mant / v_rcp_f32 / v_frexp_exp / v_ldexp sequence. The
;     SI variant additionally compares the absolute value against 0x7f800000
;     and selects between the negated value and the frexp mantissa before
;     v_rcp_f32.
998 define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
999 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32:
1000 ; GCN-DAZ-UNSAFE: ; %bb.0:
1001 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1002 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1003 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1
1004 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1005 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
1006 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1008 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32:
1009 ; GCN-IEEE-UNSAFE: ; %bb.0:
1010 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1011 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1012 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1
1013 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1014 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
1015 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1017 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32:
1018 ; GCN-DAZ-SAFE: ; %bb.0:
1019 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1020 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
1021 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000
1022 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s5
1023 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
1024 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc
1025 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1
1026 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2
1027 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
1028 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
1029 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
1030 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1
1031 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
1032 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
1033 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1034 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1035 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v3, -v0, s5
1036 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
1037 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v3, vcc
1038 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0
1039 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260
1040 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4
1041 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1042 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3
1043 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3
1044 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5
1045 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2
1046 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0
1047 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3
1048 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2
1049 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1050 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1051 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4
1052 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1053 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1054 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
1055 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1057 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
1058 ; SI-IEEE-SAFE: ; %bb.0:
1059 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1060 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
1061 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
1062 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7
1063 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
1064 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc
1065 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
1066 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1067 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
1068 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1069 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1070 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1071 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
1072 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1073 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1074 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1075 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1076 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7
1077 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
1078 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc
1079 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
1080 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1081 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
1082 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1083 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
1084 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
1085 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
1086 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
1087 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
1088 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
1089 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1090 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
1091 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
1092 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1093 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1094 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1095 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
1096 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1097 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
1098 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
1099 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1100 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1101 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1102 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1103 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
1104 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
1105 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
1106 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1107 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1108 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1109 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
1110 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1112 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
1113 ; CI-IEEE-SAFE: ; %bb.0:
1114 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1115 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
1116 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
1117 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7
1118 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
1119 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc
1120 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
1121 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1122 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
1123 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1124 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1125 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1126 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
1127 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1128 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1129 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1130 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1131 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7
1132 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
1133 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc
1134 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
1135 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1136 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
1137 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1138 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
1139 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
1140 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
1141 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
1142 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
1143 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
1144 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1145 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
1146 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
1147 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1148 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1149 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1150 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1151 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1152 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1153 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1154 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1155 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
1156 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1157 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1158 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1159 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
1160 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1161 %val.fneg = fneg <2 x float> %val
1162 %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg)
1163 %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
1164 ret <2 x float> %div
; v_neg_rsq_neg_f32_foldable_user: same -1.0 / sqrt(-%val0) pattern as the
; scalar test, but the quotient is then used by an fmul contract with %val1.
;
; Expected code per check prefix (prefixes are selected by the RUN lines at
; the top of the file):
;   GCN-DAZ-UNSAFE / GCN-IEEE-UNSAFE: v_rsq_f32 with a negated source
;     operand, followed by v_mul_f32_e64 whose first source operand is
;     negated (-v0); no separate v_xor_b32 appears in these blocks.
;   GCN-DAZ-SAFE: a v_rsq_f32-based sequence refined with v_fma_f32, then
;     v_rcp_f32 with a negated source operand, then a plain v_mul_f32 by v1.
;   SI-IEEE-SAFE / CI-IEEE-SAFE: v_sqrt_f32 followed by an integer +/-1
;     adjustment selected via v_fma_f32 / v_cmp / v_cndmask, then a
;     v_frexp_mant / v_rcp_f32 / v_frexp_exp / v_ldexp sequence and a plain
;     v_mul_f32 by v1. The SI variant additionally compares |v0| against
;     0x7f800000 and selects between -v0 and the frexp mantissa before
;     v_rcp_f32.
1167 define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
1168 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
1169 ; GCN-DAZ-UNSAFE: ; %bb.0:
1170 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1171 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1172 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1
1173 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1175 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
1176 ; GCN-IEEE-UNSAFE: ; %bb.0:
1177 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1178 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1179 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1
1180 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1182 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
1183 ; GCN-DAZ-SAFE: ; %bb.0:
1184 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1185 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
1186 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
1187 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
1188 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
1189 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0
1190 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2
1191 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
1192 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
1193 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
1194 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0
1195 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
1196 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
1197 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1198 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1199 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1200 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1201 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1202 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1203 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1204 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1206 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
1207 ; SI-IEEE-SAFE: ; %bb.0:
1208 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1209 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
1210 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
1211 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
1212 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
1213 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
1214 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1215 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
1216 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1217 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1218 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1219 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
1220 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1221 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1222 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1223 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1224 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1225 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1226 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1227 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
1228 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1229 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
1230 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
1231 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1232 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1233 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1234 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1235 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1236 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1238 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
1239 ; CI-IEEE-SAFE: ; %bb.0:
1240 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1241 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
1242 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
1243 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
1244 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
1245 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
1246 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1247 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
1248 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1249 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1250 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1251 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
1252 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1253 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1254 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1255 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1256 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1257 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1258 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1259 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1260 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1261 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1262 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1263 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1264 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1265 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1266 %val0.neg = fneg float %val0
1267 %sqrt = call contract float @llvm.sqrt.f32(float %val0.neg)
1268 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
1269 %user = fmul contract float %div, %val1
; Vector form of the negated-rsq pattern with a negated input and a multiply
; user: per lane the IR computes -1.0 / sqrt(-%val0) and multiplies it by %val1.
; The sqrt, fdiv and fmul carry the `contract` flag; the fdiv also carries
; !fpmath !0 (NOTE(review): !0 is defined outside this part of the file -
; confirm the accuracy it grants).
; What the assertions below expect:
;  - both unsafe-math configurations: one v_rsq_f32 per lane with the input
;    negation applied as a source modifier, and the negation from the -1.0
;    numerator applied as a source modifier on the user's v_mul_f32;
;  - safe math with preserve-sign denormals: sqrt expanded around v_rsq_f32
;    plus v_fma_f32 steps, then v_rcp_f32 with a negated source, then the
;    plain v_mul_f32 user;
;  - safe math with IEEE denormals: sqrt expanded around v_sqrt_f32 with
;    +1/-1 integer adjustments of its result, then a frexp/rcp/ldexp
;    sequence. The tahiti checks additionally compare |x| against 0x7f800000
;    and select around v_frexp_mant_f32; the hawaii checks have no such
;    compare/select.
1273 define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
1274 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
1275 ; GCN-DAZ-UNSAFE: ; %bb.0:
1276 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1277 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1278 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1
1279 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2
1280 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3
1281 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1283 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
1284 ; GCN-IEEE-UNSAFE: ; %bb.0:
1285 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1286 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
1287 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1
1288 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2
1289 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3
1290 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1292 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
1293 ; GCN-DAZ-SAFE: ; %bb.0:
1294 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1295 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
1296 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000
1297 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s5
1298 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
1299 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc
1300 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v4, v1
1301 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, v1, v4
1302 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0.5, v4
1303 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v4, v5, 0.5
1304 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v6, v5
1305 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v5, v1
1306 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v6, v4
1307 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v7, v4, v5
1308 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1309 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1310 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v5, -v0, s5
1311 ; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
1312 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v5, vcc
1313 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v0
1314 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
1315 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v6
1316 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1317 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, v0, v5
1318 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5
1319 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v4, 0.5
1320 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v7, v4
1321 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v4, v4, v0
1322 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v7, v5
1323 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v8, v5, v4
1324 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1325 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1326 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v6
1327 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1328 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1329 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
1330 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1331 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1332 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1334 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
1335 ; SI-IEEE-SAFE: ; %bb.0:
1336 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1337 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
1338 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
1339 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7
1340 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
1341 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc
1342 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
1343 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
1344 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
1345 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
1346 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
1347 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
1348 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
1349 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1350 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
1351 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1352 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1353 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7
1354 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
1355 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc
1356 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
1357 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
1358 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
1359 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1360 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
1361 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
1362 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
1363 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
1364 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
1365 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
1366 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
1367 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
1368 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
1369 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
1370 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
1371 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1372 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
1373 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
1374 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
1375 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5]
1376 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1377 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1378 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1379 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
1380 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
1381 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
1382 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v1, v4, s[4:5]
1383 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1384 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1385 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1386 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
1387 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1388 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1389 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1391 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
1392 ; CI-IEEE-SAFE: ; %bb.0:
1393 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1394 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
1395 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
1396 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7
1397 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
1398 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc
1399 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
1400 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
1401 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
1402 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
1403 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
1404 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
1405 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
1406 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1407 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
1408 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1409 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1410 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7
1411 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
1412 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc
1413 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
1414 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
1415 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
1416 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1417 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
1418 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
1419 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
1420 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
1421 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
1422 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
1423 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
1424 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
1425 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
1426 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
1427 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
1428 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1429 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
1430 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1431 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1432 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1433 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
1434 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
1435 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1436 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1437 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1438 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
1439 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1440 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1441 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1442 %val0.fneg = fneg <2 x float> %val0
1443 %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg)
1444 %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
1445 %user = fmul contract <2 x float> %div, %val1
1446 ret <2 x float> %user
; Scalar negated reciprocal square root: the IR computes -1.0 / sqrt(%val).
; The sqrt and fdiv carry the `contract` flag; the fdiv also carries
; !fpmath !0 (NOTE(review): !0 is defined outside this part of the file -
; confirm the accuracy it grants).
; What the assertions below expect:
;  - both unsafe-math configurations: a single v_rsq_f32 followed by a
;    v_xor_b32 with 0x80000000 on the result;
;  - safe math with preserve-sign denormals: sqrt expanded around v_rsq_f32
;    plus v_fma_f32 steps, then v_rcp_f32 with a negated source;
;  - safe math with IEEE denormals: sqrt expanded around v_sqrt_f32 with
;    +1/-1 integer adjustments of its result, then a frexp/rcp/ldexp
;    sequence. The tahiti checks additionally compare |x| against 0x7f800000
;    and select around v_frexp_mant_f32; the hawaii checks have no such
;    compare/select.
1449 define float @v_neg_rsq_f32(float %val) {
1450 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32:
1451 ; GCN-DAZ-UNSAFE: ; %bb.0:
1452 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1453 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1454 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1455 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1457 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32:
1458 ; GCN-IEEE-UNSAFE: ; %bb.0:
1459 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1460 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1461 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1462 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1464 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32:
1465 ; GCN-DAZ-SAFE: ; %bb.0:
1466 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1467 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1468 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
1469 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1470 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1471 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
1472 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1
1473 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
1474 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5
1475 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2
1476 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0
1477 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1
1478 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2
1479 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
1480 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1481 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
1482 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
1483 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
1484 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1485 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1487 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
1488 ; SI-IEEE-SAFE: ; %bb.0:
1489 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1490 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1491 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
1492 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1493 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1494 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
1495 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
1496 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
1497 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
1498 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
1499 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
1500 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
1501 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
1502 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
1503 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
1504 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1505 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
1506 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
1507 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
1508 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
1509 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
1510 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
1511 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
1512 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
1513 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1514 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1515 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
1516 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1518 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
1519 ; CI-IEEE-SAFE: ; %bb.0:
1520 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1521 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1522 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
1523 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1524 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1525 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
1526 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
1527 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
1528 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
1529 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
1530 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
1531 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
1532 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
1533 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
1534 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
1535 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1536 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
1537 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
1538 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
1539 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
1540 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
1541 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1542 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1543 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
1544 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1545 %sqrt = call contract float @llvm.sqrt.f32(float %val)
1546 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
; Two-lane vector negated reciprocal square root: per lane the IR computes
; -1.0 / sqrt(%val) and returns the quotient.
; The sqrt and fdiv carry the `contract` flag; the fdiv also carries
; !fpmath !0 (NOTE(review): !0 is defined outside this part of the file -
; confirm the accuracy it grants).
; What the assertions below expect:
;  - both unsafe-math configurations: one v_rsq_f32 per lane, each followed
;    by a v_xor_b32 with 0x80000000 on the result;
;  - safe math with preserve-sign denormals: per-lane sqrt expanded around
;    v_rsq_f32 plus v_fma_f32 steps, then v_rcp_f32 with a negated source;
;  - safe math with IEEE denormals: per-lane sqrt expanded around v_sqrt_f32
;    with +1/-1 integer adjustments of its result, then a frexp/rcp/ldexp
;    sequence. The tahiti checks additionally compare |x| against 0x7f800000
;    and select around v_frexp_mant_f32; the hawaii checks have no such
;    compare/select.
1550 define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
1551 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32:
1552 ; GCN-DAZ-UNSAFE: ; %bb.0:
1553 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1554 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1555 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1
1556 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1557 ; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
1558 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1560 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32:
1561 ; GCN-IEEE-UNSAFE: ; %bb.0:
1562 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1563 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1564 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1
1565 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1566 ; GCN-IEEE-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
1567 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1569 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32:
1570 ; GCN-DAZ-SAFE: ; %bb.0:
1571 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1572 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1573 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1
1574 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
1575 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1576 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1
1577 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2
1578 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
1579 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
1580 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
1581 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1
1582 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
1583 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
1584 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1585 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1586 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x4f800000, v0
1587 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1588 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
1589 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0
1590 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260
1591 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4
1592 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1593 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3
1594 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3
1595 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5
1596 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2
1597 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0
1598 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3
1599 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2
1600 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1601 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1602 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4
1603 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1604 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1605 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
1606 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1608 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
1609 ; SI-IEEE-SAFE: ; %bb.0:
1610 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1611 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
1612 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1
1613 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
1614 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1615 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
1616 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1617 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
1618 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1619 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1620 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1621 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
1622 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1623 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1624 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1625 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1626 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0
1627 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
1628 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
1629 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
1630 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1631 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
1632 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1633 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
1634 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
1635 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
1636 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
1637 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
1638 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
1639 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1640 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
1641 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
1642 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1643 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1644 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1645 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
1646 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1647 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
1648 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
1649 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1650 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1651 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1652 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1653 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
1654 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
1655 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
1656 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1657 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1658 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1659 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
1660 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1662 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
1663 ; CI-IEEE-SAFE: ; %bb.0:
1664 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1665 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
1666 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1
1667 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
1668 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1669 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
1670 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1671 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
1672 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1673 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1674 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1675 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
1676 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1677 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1678 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1679 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1680 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0
1681 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
1682 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
1683 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
1684 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1685 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
1686 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
1687 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
1688 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
1689 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
1690 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
1691 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
1692 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
1693 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1694 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
1695 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
1696 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1697 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1698 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1699 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1700 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1701 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1702 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1703 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1704 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
1705 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1706 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1707 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1708 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
1709 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1710 %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val)
1711 %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
1712 ret <2 x float> %div
; Scalar negated reciprocal square root with a multiply user: the IR computes
; -1.0 / sqrt(%val0) and multiplies the quotient by %val1.
; The sqrt, fdiv and fmul carry the `contract` flag; the fdiv also carries
; !fpmath !0 (NOTE(review): !0 is defined outside this part of the file -
; confirm the accuracy it grants).
; What the assertions below expect:
;  - both unsafe-math configurations: a single v_rsq_f32, with the negation
;    applied as a source modifier on the user's v_mul_f32 (no separate
;    sign-flip instruction);
;  - safe math with preserve-sign denormals: sqrt expanded around v_rsq_f32
;    plus v_fma_f32 steps, then v_rcp_f32 with a negated source, then a plain
;    v_mul_f32;
;  - safe math with IEEE denormals: sqrt expanded around v_sqrt_f32 with
;    +1/-1 integer adjustments of its result, then a frexp/rcp/ldexp
;    sequence and a plain v_mul_f32. The tahiti checks additionally compare
;    |x| against 0x7f800000 and select around v_frexp_mant_f32; the hawaii
;    checks have no such compare/select.
1715 define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
1716 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user:
1717 ; GCN-DAZ-UNSAFE: ; %bb.0:
1718 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1719 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1720 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1
1721 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1723 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user:
1724 ; GCN-IEEE-UNSAFE: ; %bb.0:
1725 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1726 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1727 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v1
1728 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1730 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
1731 ; GCN-DAZ-SAFE: ; %bb.0:
1732 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1733 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1734 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
1735 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1736 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
1737 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0
1738 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2
1739 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
1740 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
1741 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
1742 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0
1743 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
1744 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
1745 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1746 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1747 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1748 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1749 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1750 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1751 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1752 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1754 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
1755 ; SI-IEEE-SAFE: ; %bb.0:
1756 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1757 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1758 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
1759 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1760 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
1761 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
1762 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1763 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
1764 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1765 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1766 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1767 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
1768 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1769 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1770 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1771 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1772 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1773 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1774 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1775 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
1776 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1777 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
1778 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
1779 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1780 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1781 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1782 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1783 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1784 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1786 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
1787 ; CI-IEEE-SAFE: ; %bb.0:
1788 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1789 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1790 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
1791 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1792 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
1793 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
1794 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
1795 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
1796 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
1797 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
1798 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
1799 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
1800 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
1801 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
1802 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
1803 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1804 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
1805 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
1806 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
1807 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
1808 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
1809 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1810 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1811 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
1812 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
1813 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1814 %sqrt = call contract float @llvm.sqrt.f32(float %val0)
1815 %div = fdiv contract float -1.0, %sqrt, !fpmath !0
1816 %user = fmul contract float %div, %val1
1820 define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
1821 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
1822 ; GCN-DAZ-UNSAFE: ; %bb.0:
1823 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1824 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1825 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1
1826 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2
1827 ; GCN-DAZ-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3
1828 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1830 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
1831 ; GCN-IEEE-UNSAFE: ; %bb.0:
1832 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1833 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
1834 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1
1835 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v0, -v0, v2
1836 ; GCN-IEEE-UNSAFE-NEXT: v_mul_f32_e64 v1, -v1, v3
1837 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
1839 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
1840 ; GCN-DAZ-SAFE: ; %bb.0:
1841 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1842 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
1843 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1
1844 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
1845 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1846 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v4, v1
1847 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, v1, v4
1848 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0.5, v4
1849 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v4, v5, 0.5
1850 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v6, v5
1851 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v5, v1
1852 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v6, v4
1853 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v7, v4, v5
1854 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1855 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1856 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v0
1857 ; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1858 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
1859 ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v0
1860 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
1861 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v6
1862 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1863 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, v0, v5
1864 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5
1865 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v4, 0.5
1866 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v7, v4
1867 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v4, v4, v0
1868 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v7, v5
1869 ; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v8, v5, v4
1870 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1871 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1872 ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v6
1873 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1874 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
1875 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
1876 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1877 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1878 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
1880 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
1881 ; SI-IEEE-SAFE: ; %bb.0:
1882 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1883 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
1884 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1
1885 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
1886 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1887 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
1888 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
1889 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
1890 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
1891 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
1892 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
1893 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
1894 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1895 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
1896 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1897 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1898 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v0
1899 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
1900 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
1901 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
1902 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
1903 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
1904 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1905 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
1906 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
1907 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
1908 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
1909 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
1910 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
1911 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
1912 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
1913 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
1914 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
1915 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
1916 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1917 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
1918 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
1919 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
1920 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5]
1921 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1922 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1923 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1924 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
1925 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
1926 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
1927 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v1, v4, s[4:5]
1928 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1929 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1930 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1931 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
1932 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1933 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1934 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1936 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
1937 ; CI-IEEE-SAFE: ; %bb.0:
1938 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1939 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
1940 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1
1941 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
1942 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1943 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
1944 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
1945 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
1946 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
1947 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
1948 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
1949 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
1950 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
1951 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
1952 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
1953 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1954 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v0
1955 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
1956 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
1957 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
1958 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
1959 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
1960 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1961 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
1962 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
1963 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
1964 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
1965 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
1966 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
1967 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
1968 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
1969 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
1970 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
1971 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
1972 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
1973 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
1974 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1975 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
1976 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
1977 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
1978 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
1979 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
1980 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
1981 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
1982 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
1983 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
1984 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
1985 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
1986 %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0)
1987 %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
1988 %user = fmul contract <2 x float> %div, %val1
1989 ret <2 x float> %user
; Basic scalar 1.0 / sqrt(x): both the sqrt call and the fdiv carry `contract`
; and !fpmath !1. The DAZ runs and the IEEE unsafe-math runs expect a bare
; v_rsq_f32. The IEEE safe-math runs expect inputs that compare below 0x800000
; (the smallest normal f32) to be multiplied by 0x4b800000 (2^24) before
; v_rsq_f32, and the result to be multiplied by 0x45800000 (2^12) to compensate.
1992 define float @v_rsq_f32(float %val) {
1993 ; GCN-DAZ-LABEL: v_rsq_f32:
1995 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1996 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
1997 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
1999 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32:
2000 ; GCN-IEEE-UNSAFE: ; %bb.0:
2001 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2002 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2003 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2005 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32:
2006 ; GCN-IEEE-SAFE: ; %bb.0:
2007 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2008 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
2009 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4b800000
2010 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2011 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
2012 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
2013 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
2014 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x45800000
2015 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
2016 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
2017 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2018 %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
2019 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
; The sqrt result has a second use: it is returned in field 0 of the struct
; while 1.0 / sqrt(x) is returned in field 1. The unsafe-math runs expect
; v_sqrt_f32 and v_rsq_f32 that both read the original input. The DAZ
; safe-math runs expect v_rcp_f32 applied to the v_sqrt_f32 result. The IEEE
; safe-math runs expect the expanded sqrt followed by a frexp/rcp/ldexp
; reciprocal; the tahiti checks additionally compare |v0| against 0x7f800000
; and select around the frexp mantissa, which the hawaii checks do not.
2023 define { float, float } @v_rsq_f32_multi_use(float %val) {
2024 ; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_multi_use:
2025 ; GCN-DAZ-UNSAFE: ; %bb.0:
2026 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2027 ; GCN-DAZ-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v0
2028 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v0
2029 ; GCN-DAZ-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
2030 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2032 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_multi_use:
2033 ; GCN-IEEE-UNSAFE: ; %bb.0:
2034 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2035 ; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v0
2036 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v1, v0
2037 ; GCN-IEEE-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
2038 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2040 ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_multi_use:
2041 ; GCN-DAZ-SAFE: ; %bb.0:
2042 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2043 ; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
2044 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v1, v0
2045 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
2047 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
2048 ; SI-IEEE-SAFE: ; %bb.0:
2049 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2050 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2051 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2052 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2053 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2054 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2055 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2056 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2057 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2058 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2059 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2060 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2061 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2062 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2063 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2064 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2065 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2066 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2067 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2068 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
2069 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2070 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
2071 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
2072 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2073 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
2074 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v2, vcc, 0, v2
2075 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v1, v2
2076 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2078 ; CI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
2079 ; CI-IEEE-SAFE: ; %bb.0:
2080 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2081 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2082 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2083 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2084 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2085 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2086 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2087 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2088 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2089 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2090 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2091 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2092 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2093 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2094 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2095 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2096 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2097 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2098 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2099 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2100 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2101 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
2102 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v2, vcc, 0, v2
2103 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v1, v2
2104 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2105 %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
2106 %insert.0 = insertvalue { float, float } poison, float %sqrt, 0
2107 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
2108 %insert.1 = insertvalue { float, float } %insert.0, float %div, 1
2109 ret { float, float } %insert.1
; Here the sqrt call has no `contract` flag; only the fdiv carries it. The
; unsafe-math runs still expect a single v_rsq_f32. The DAZ safe-math runs
; expect v_sqrt_f32 followed by v_rcp_f32, and the IEEE safe-math runs expect
; the expanded sqrt followed by a frexp/rcp/ldexp reciprocal, i.e. no
; v_rsq_f32 is expected in either safe-math configuration.
2112 define float @v_rsq_f32_missing_contract0(float %val) {
2113 ; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract0:
2114 ; GCN-DAZ-UNSAFE: ; %bb.0:
2115 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2116 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2117 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2119 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract0:
2120 ; GCN-IEEE-UNSAFE: ; %bb.0:
2121 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2122 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2123 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2125 ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract0:
2126 ; GCN-DAZ-SAFE: ; %bb.0:
2127 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2128 ; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
2129 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
2130 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
2132 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
2133 ; SI-IEEE-SAFE: ; %bb.0:
2134 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2135 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2136 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2137 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2138 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2139 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2140 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2141 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2142 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2143 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2144 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2145 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2146 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2147 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2148 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2149 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2150 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2151 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2152 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2153 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
2154 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2155 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
2156 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
2157 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2158 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
2159 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
2160 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
2161 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2163 ; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
2164 ; CI-IEEE-SAFE: ; %bb.0:
2165 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2166 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2167 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2168 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2169 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2170 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2171 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2172 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2173 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2174 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2175 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2176 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2177 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2178 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2179 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2180 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2181 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2182 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2183 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2184 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2185 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2186 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
2187 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
2188 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
2189 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2190 %sqrt = call float @llvm.sqrt.f32(float %val), !fpmath !1
2191 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
; Here the fdiv has no `contract` flag; only the sqrt call carries it. The
; expectations mirror the case where the sqrt call lacks the flag: the
; unsafe-math runs expect a single v_rsq_f32, the DAZ safe-math runs expect
; v_sqrt_f32 followed by v_rcp_f32, and the IEEE safe-math runs expect the
; expanded sqrt followed by a frexp/rcp/ldexp reciprocal.
2195 define float @v_rsq_f32_missing_contract1(float %val) {
2196 ; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract1:
2197 ; GCN-DAZ-UNSAFE: ; %bb.0:
2198 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2199 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2200 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2202 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract1:
2203 ; GCN-IEEE-UNSAFE: ; %bb.0:
2204 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2205 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2206 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2208 ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract1:
2209 ; GCN-DAZ-SAFE: ; %bb.0:
2210 ; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2211 ; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
2212 ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
2213 ; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
2215 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
2216 ; SI-IEEE-SAFE: ; %bb.0:
2217 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2218 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2219 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2220 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2221 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2222 ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2223 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2224 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2225 ; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2226 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2227 ; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2228 ; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2229 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2230 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2231 ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2232 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2233 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2234 ; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2235 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2236 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
2237 ; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2238 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
2239 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
2240 ; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2241 ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
2242 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
2243 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
2244 ; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2246 ; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
2247 ; CI-IEEE-SAFE: ; %bb.0:
2248 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2249 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
2250 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
2251 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2252 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2253 ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
2254 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
2255 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
2256 ; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
2257 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
2258 ; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
2259 ; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
2260 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
2261 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
2262 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
2263 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2264 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
2265 ; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
2266 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
2267 ; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
2268 ; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
2269 ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
2270 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
2271 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
2272 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2273 %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
2274 %div = fdiv float 1.0, %sqrt, !fpmath !1
2278 ; Test that we contract into FMA for an fadd user after introducing the
; result-scaling multiply of the IEEE safe-math rsq expansion: with `contract`
; on the sqrt call, the fdiv and the fadd, those runs expect the scale and the
; add to be emitted as a single v_fma_f32. The DAZ runs and the IEEE
; unsafe-math runs have no scaling multiply and expect v_rsq_f32 + v_add_f32.
2280 define float @v_rsq_f32_contractable_user(float %val0, float %val1) {
2281 ; GCN-DAZ-LABEL: v_rsq_f32_contractable_user:
2283 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2284 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
2285 ; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1
2286 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
2288 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user:
2289 ; GCN-IEEE-UNSAFE: ; %bb.0:
2290 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2291 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2292 ; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1
2293 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2295 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user:
2296 ; GCN-IEEE-SAFE: ; %bb.0:
2297 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2298 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
2299 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000
2300 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2301 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2302 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
2303 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
2304 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000
2305 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2306 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1
2307 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2308 %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
2309 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
2310 %add = fadd contract float %div, %val1
2314 ; Missing contract on the fdiv
; NOTE(review): the fdiv in the body below is written `fdiv contract`, so as
; written this IR is the same as v_rsq_f32_contractable_user and the IEEE
; safe-math checks still expect the fused v_fma_f32. Confirm whether the flag
; was meant to be dropped from the fdiv; doing so requires regenerating the
; assertions with utils/update_llc_test_checks.py.
2315 define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float %val1) {
2316 ; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract0:
2318 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2319 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
2320 ; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1
2321 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
2323 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
2324 ; GCN-IEEE-UNSAFE: ; %bb.0:
2325 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2326 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2327 ; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1
2328 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2330 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
2331 ; GCN-IEEE-SAFE: ; %bb.0:
2332 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2333 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
2334 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000
2335 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2336 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2337 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
2338 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
2339 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000
2340 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2341 ; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1
2342 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2343 %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
2344 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
2345 %add = fadd contract float %div, %val1
2349 ; Missing contract on the fadd
; The sqrt call and the fdiv keep `contract`, so every run still expects
; v_rsq_f32, but the IEEE safe-math runs expect the result-scaling v_mul_f32
; and the user's v_add_f32 to remain separate instructions rather than a
; single v_fma_f32.
2350 define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float %val1) {
2351 ; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract1:
2353 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2354 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
2355 ; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1
2356 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
2358 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
2359 ; GCN-IEEE-UNSAFE: ; %bb.0:
2360 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2361 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2362 ; GCN-IEEE-UNSAFE-NEXT: v_add_f32_e32 v0, v0, v1
2363 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2365 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
2366 ; GCN-IEEE-SAFE: ; %bb.0:
2367 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2368 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
2369 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x4b800000
2370 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2371 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2372 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
2373 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
2374 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000
2375 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
2376 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
2377 ; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1
2378 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2379 %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
2380 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
2381 %add = fadd float %div, %val1
; The argument is marked nofpclass(sub), i.e. it is never a subnormal of
; either sign. Every run, including the IEEE safe-math ones, expects a bare
; v_rsq_f32 with no input or result scaling around it.
2385 define float @v_rsq_f32_known_never_denormal(float nofpclass(sub) %val) {
2386 ; GCN-DAZ-LABEL: v_rsq_f32_known_never_denormal:
2388 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2389 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
2390 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
2392 ; GCN-IEEE-LABEL: v_rsq_f32_known_never_denormal:
2393 ; GCN-IEEE: ; %bb.0:
2394 ; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2395 ; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
2396 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
2397 %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
2398 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
; The argument is marked nofpclass(psub), which excludes only positive
; subnormals. Unlike the nofpclass(sub) variant, the IEEE safe-math runs here
; still expect the scaled sequence: compare against 0x800000, multiply by
; 0x4b800000 before v_rsq_f32, and multiply the result by 0x45800000.
2402 define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) {
2403 ; GCN-DAZ-LABEL: v_rsq_f32_known_never_posdenormal:
2405 ; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2406 ; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
2407 ; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
2409 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_known_never_posdenormal:
2410 ; GCN-IEEE-UNSAFE: ; %bb.0:
2411 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2412 ; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
2413 ; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
2415 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_known_never_posdenormal:
2416 ; GCN-IEEE-SAFE: ; %bb.0:
2417 ; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2418 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
2419 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4b800000
2420 ; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
2421 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
2422 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
2423 ; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
2424 ; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x45800000
2425 ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
2426 ; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
2427 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
2428 %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
2429 %div = fdiv contract float 1.0, %sqrt, !fpmath !1
2433 !0 = !{float 2.500000e+00}
2434 !1 = !{float 1.000000e+00}
2436 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
2437 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2438 ; CI-DAZ-SAFE: {{.*}}
2439 ; CI-DAZ-UNSAFE: {{.*}}
2440 ; CI-IEEE-UNSAFE: {{.*}}
2441 ; SI-DAZ-SAFE: {{.*}}
2442 ; SI-DAZ-UNSAFE: {{.*}}
2443 ; SI-IEEE-UNSAFE: {{.*}}