; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink | FileCheck %s

declare hidden float @_Z3powff(float, float)
declare hidden double @_Z3powdd(double, double)
declare hidden half @_Z3powDhDh(half, half)

declare hidden float @_Z4powrff(float, float)
declare hidden double @_Z4powrdd(double, double)
declare hidden half @_Z4powrDhDh(half, half)

declare hidden float @_Z4pownfi(float, i32)
declare hidden double @_Z4powndi(double, i32)
declare hidden half @_Z4pownDhi(half, i32)
; --------------------------------------------------------------------
; pow
; --------------------------------------------------------------------
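
; With fast math flags, pow with a runtime floating-point exponent is left as a
; tail call to the library pow function. When the exponent comes from a sitofp,
; the call is treated like pown and expanded to exp2(y * log2(|x|)), with the
; sign of x reapplied for odd exponents; the f64 expansion still calls the
; library log2/exp2 helpers.
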
define half @test_pow_fast_f16(half %x, half %y) {
; CHECK-LABEL: test_pow_fast_f16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powDhDh@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powDhDh@rel32@hi+12
; CHECK-NEXT:    s_setpc_b64 s[16:17]
  %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
  ret half %pow
}

define float @test_pow_fast_f32(float %x, float %y) {
; CHECK-LABEL: test_pow_fast_f32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powff@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powff@rel32@hi+12
; CHECK-NEXT:    s_setpc_b64 s[16:17]
  %pow = tail call fast float @_Z3powff(float %x, float %y)
  ret float %pow
}

define double @test_pow_fast_f64(double %x, double %y) {
; CHECK-LABEL: test_pow_fast_f64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powdd@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powdd@rel32@hi+12
; CHECK-NEXT:    s_setpc_b64 s[16:17]
  %pow = tail call fast double @_Z3powdd(double %x, double %y)
  ret double %pow
}

define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f16__integral_y:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    v_log_f16_e64 v3, |v0|
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CHECK-NEXT:    v_cvt_i32_f32_e32 v1, v1
; CHECK-NEXT:    v_cvt_f32_i32_e32 v2, v1
; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
; CHECK-NEXT:    v_and_b32_e32 v0, v1, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
; CHECK-NEXT:    v_mul_f16_e32 v2, v3, v2
; CHECK-NEXT:    v_exp_f16_e32 v2, v2
; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = sitofp i32 %y.i to half
  %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
  ret half %pow
}

define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f32__integral_y:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    s_mov_b32 s4, 0x800000
; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
; CHECK-NEXT:    v_cvt_i32_f32_e32 v1, v1
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
; CHECK-NEXT:    v_log_f32_e32 v3, v3
; CHECK-NEXT:    v_cvt_f32_i32_e32 v4, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v4
; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT:    v_mov_b32_e32 v5, 0x42800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
; CHECK-NEXT:    v_fma_f32 v2, v2, v4, v3
; CHECK-NEXT:    v_exp_f32_e32 v2, v2
; CHECK-NEXT:    v_mov_b32_e32 v3, 0x1f800000
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v3
; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = sitofp i32 %y.i to float
  %pow = tail call fast float @_Z3powff(float %x, float %y)
  ret float %pow
}

define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f64__integral_y:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s16, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
; CHECK-NEXT:    s_addk_i32 s32, 0x800
; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
; CHECK-NEXT:    v_mov_b32_e32 v43, v1
; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v43
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
; CHECK-NEXT:    v_mov_b32_e32 v41, v31
; CHECK-NEXT:    s_mov_b32 s42, s15
; CHECK-NEXT:    s_mov_b32 s43, s14
; CHECK-NEXT:    s_mov_b32 s44, s13
; CHECK-NEXT:    s_mov_b32 s45, s12
; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT:    v_mov_b32_e32 v42, v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT:    s_mov_b32 s12, s45
; CHECK-NEXT:    s_mov_b32 s13, s44
; CHECK-NEXT:    s_mov_b32 s14, s43
; CHECK-NEXT:    s_mov_b32 s15, s42
; CHECK-NEXT:    v_mov_b32_e32 v31, v41
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v42
; CHECK-NEXT:    v_and_b32_e32 v2, v2, v43
; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
; CHECK-NEXT:    s_addk_i32 s32, 0xf800
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = sitofp i32 %y.i to double
  %pow = tail call fast double @_Z3powdd(double %x, double %y)
  ret double %pow
}

; --------------------------------------------------------------------
; powr
; --------------------------------------------------------------------
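
; With fast math flags, powr is expanded directly to exp2(y * log2(x)) with no
; sign handling; the f16/f32 cases use the hardware log/exp instructions while
; the f64 case still calls the library log2/exp2.
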
define half @test_powr_fast_f16(half %x, half %y) {
; CHECK-LABEL: test_powr_fast_f16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_log_f16_e32 v0, v0
; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
; CHECK-NEXT:    v_exp_f16_e32 v0, v0
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %powr = tail call fast half @_Z4powrDhDh(half %x, half %y)
  ret half %powr
}

define float @test_powr_fast_f32(float %x, float %y) {
; CHECK-LABEL: test_powr_fast_f32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s4, 0x800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v3
; CHECK-NEXT:    v_log_f32_e32 v0, v0
; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT:    v_sub_f32_e32 v0, v0, v2
; CHECK-NEXT:    v_mul_f32_e32 v2, v0, v1
; CHECK-NEXT:    v_mov_b32_e32 v3, 0x42800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v2
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
; CHECK-NEXT:    v_fma_f32 v0, v0, v1, v2
; CHECK-NEXT:    v_exp_f32_e32 v0, v0
; CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f800000
; CHECK-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %powr = tail call fast float @_Z4powrff(float %x, float %y)
  ret float %powr
}

define double @test_powr_fast_f64(double %x, double %y) {
; CHECK-LABEL: test_powr_fast_f64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s16, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
; CHECK-NEXT:    s_addk_i32 s32, 0x800
; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
; CHECK-NEXT:    v_mov_b32_e32 v43, v31
; CHECK-NEXT:    s_mov_b32 s42, s15
; CHECK-NEXT:    s_mov_b32 s43, s14
; CHECK-NEXT:    s_mov_b32 s44, s13
; CHECK-NEXT:    s_mov_b32 s45, s12
; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT:    v_mov_b32_e32 v42, v3
; CHECK-NEXT:    v_mov_b32_e32 v41, v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[41:42]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT:    s_mov_b32 s12, s45
; CHECK-NEXT:    s_mov_b32 s13, s44
; CHECK-NEXT:    s_mov_b32 s14, s43
; CHECK-NEXT:    s_mov_b32 s15, s42
; CHECK-NEXT:    v_mov_b32_e32 v31, v43
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
; CHECK-NEXT:    s_addk_i32 s32, 0xf800
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %powr = tail call fast double @_Z4powrdd(double %x, double %y)
  ret double %powr
}

; --------------------------------------------------------------------
; pown
; --------------------------------------------------------------------
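
; With fast math flags, pown is expanded to exp2(n * log2(|x|)) and the sign of
; x is reapplied when n is odd; as with pow, the f64 expansion calls the
; library log2/exp2.
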
define half @test_pown_fast_f16(half %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_cvt_f32_i32_e32 v2, v1
; CHECK-NEXT:    v_log_f16_e64 v3, |v0|
; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
; CHECK-NEXT:    v_and_b32_e32 v0, v1, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
; CHECK-NEXT:    v_mul_f16_e32 v2, v3, v2
; CHECK-NEXT:    v_exp_f16_e32 v2, v2
; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
  ret half %call
}

define float @test_pown_fast_f32(float %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s4, 0x800000
; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
; CHECK-NEXT:    v_log_f32_e32 v3, v3
; CHECK-NEXT:    v_cvt_f32_i32_e32 v4, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v4
; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT:    v_mov_b32_e32 v5, 0x42800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
; CHECK-NEXT:    v_fma_f32 v2, v2, v4, v3
; CHECK-NEXT:    v_exp_f32_e32 v2, v2
; CHECK-NEXT:    v_mov_b32_e32 v3, 0x1f800000
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v3
; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
  ret float %call
}

define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s16, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
; CHECK-NEXT:    s_addk_i32 s32, 0x800
; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
; CHECK-NEXT:    v_mov_b32_e32 v43, v1
; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v43
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
; CHECK-NEXT:    v_mov_b32_e32 v41, v31
; CHECK-NEXT:    s_mov_b32 s42, s15
; CHECK-NEXT:    s_mov_b32 s43, s14
; CHECK-NEXT:    s_mov_b32 s44, s13
; CHECK-NEXT:    s_mov_b32 s45, s12
; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT:    v_mov_b32_e32 v42, v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT:    s_mov_b32 s12, s45
; CHECK-NEXT:    s_mov_b32 s13, s44
; CHECK-NEXT:    s_mov_b32 s14, s43
; CHECK-NEXT:    s_mov_b32 s15, s42
; CHECK-NEXT:    v_mov_b32_e32 v31, v41
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v42
; CHECK-NEXT:    v_and_b32_e32 v2, v2, v43
; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
; CHECK-NEXT:    s_addk_i32 s32, 0xf800
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
  ret double %call
}

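; When the integer exponent is known to be even, the sign handling is dropped
; and only |x| feeds the expansion.
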
define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f16_known_even:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    v_log_f16_e64 v0, |v0|
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
; CHECK-NEXT:    v_exp_f16_e32 v0, v0
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = shl i32 %y.arg, 1
  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
  ret half %call
}

define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f32_known_even:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s4, 0x800000
; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
; CHECK-NEXT:    v_mul_f32_e64 v0, |v0|, v3
; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
; CHECK-NEXT:    v_log_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT:    v_sub_f32_e32 v0, v0, v2
; CHECK-NEXT:    v_mul_f32_e32 v2, v0, v1
; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT:    v_mov_b32_e32 v3, 0x42800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v2
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
; CHECK-NEXT:    v_fma_f32 v0, v0, v1, v2
; CHECK-NEXT:    v_exp_f32_e32 v0, v0
; CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f800000
; CHECK-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = shl i32 %y.arg, 1
  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
  ret float %call
}

define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f64_known_even:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s16, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
; CHECK-NEXT:    s_addk_i32 s32, 0x400
; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
; CHECK-NEXT:    v_mov_b32_e32 v41, v31
; CHECK-NEXT:    s_mov_b32 s42, s15
; CHECK-NEXT:    s_mov_b32 s43, s14
; CHECK-NEXT:    s_mov_b32 s44, s13
; CHECK-NEXT:    s_mov_b32 s45, s12
; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT:    v_lshlrev_b32_e32 v42, 1, v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT:    s_mov_b32 s12, s45
; CHECK-NEXT:    s_mov_b32 s13, s44
; CHECK-NEXT:    s_mov_b32 s14, s43
; CHECK-NEXT:    s_mov_b32 s15, s42
; CHECK-NEXT:    v_mov_b32_e32 v31, v41
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
; CHECK-NEXT:    s_addk_i32 s32, 0xfc00
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = shl i32 %y.arg, 1
  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
  ret double %call
}

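; When the integer exponent is known to be odd, the result unconditionally
; takes the sign of x.
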
define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f16_known_odd:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    v_log_f16_e64 v2, |v0|
; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    v_mul_f16_e32 v1, v2, v1
; CHECK-NEXT:    v_exp_f16_e32 v1, v1
; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = or i32 %y.arg, 1
  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
  ret half %call
}

define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f32_known_odd:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s4, 0x800000
; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
; CHECK-NEXT:    v_log_f32_e32 v3, v3
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v1
; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT:    v_mov_b32_e32 v4, 0x42800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; CHECK-NEXT:    v_fma_f32 v1, v2, v1, v3
; CHECK-NEXT:    v_exp_f32_e32 v1, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1f800000
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
; CHECK-NEXT:    s_brev_b32 s4, 1
; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
; CHECK-NEXT:    v_and_or_b32 v0, v0, s4, v1
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = or i32 %y.arg, 1
  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
  ret float %call
}

define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f64_known_odd:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s16, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
; CHECK-NEXT:    v_writelane_b32 v40, s16, 14
; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
; CHECK-NEXT:    v_writelane_b32 v40, s36, 4
; CHECK-NEXT:    v_writelane_b32 v40, s37, 5
; CHECK-NEXT:    v_writelane_b32 v40, s38, 6
; CHECK-NEXT:    v_writelane_b32 v40, s39, 7
; CHECK-NEXT:    s_addk_i32 s32, 0x800
; CHECK-NEXT:    v_writelane_b32 v40, s40, 8
; CHECK-NEXT:    v_writelane_b32 v40, s41, 9
; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    v_writelane_b32 v40, s42, 10
; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v40, s43, 11
; CHECK-NEXT:    v_mov_b32_e32 v42, v1
; CHECK-NEXT:    v_writelane_b32 v40, s44, 12
; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
; CHECK-NEXT:    v_mov_b32_e32 v41, v31
; CHECK-NEXT:    s_mov_b32 s42, s15
; CHECK-NEXT:    s_mov_b32 s43, s14
; CHECK-NEXT:    s_mov_b32 s44, s13
; CHECK-NEXT:    s_mov_b32 s45, s12
; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT:    v_or_b32_e32 v43, 1, v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v43
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT:    s_mov_b32 s12, s45
; CHECK-NEXT:    s_mov_b32 s13, s44
; CHECK-NEXT:    s_mov_b32 s14, s43
; CHECK-NEXT:    s_mov_b32 s15, s42
; CHECK-NEXT:    v_mov_b32_e32 v31, v41
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_and_b32_e32 v2, 0x80000000, v42
; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
; CHECK-NEXT:    v_readlane_b32 s45, v40, 13
; CHECK-NEXT:    v_readlane_b32 s44, v40, 12
; CHECK-NEXT:    v_readlane_b32 s43, v40, 11
; CHECK-NEXT:    v_readlane_b32 s42, v40, 10
; CHECK-NEXT:    v_readlane_b32 s41, v40, 9
; CHECK-NEXT:    v_readlane_b32 s40, v40, 8
; CHECK-NEXT:    v_readlane_b32 s39, v40, 7
; CHECK-NEXT:    v_readlane_b32 s38, v40, 6
; CHECK-NEXT:    v_readlane_b32 s37, v40, 5
; CHECK-NEXT:    v_readlane_b32 s36, v40, 4
; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
; CHECK-NEXT:    v_readlane_b32 s4, v40, 14
; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
; CHECK-NEXT:    s_addk_i32 s32, 0xf800
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = or i32 %y.arg, 1
  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
  ret double %call
}