1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
6 declare i16 @llvm.umax.i16(i16, i16)
7 declare i64 @llvm.umin.i64(i64, i64)
9 declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
; Vector f32 multiply case: a splat of 9.0 is multiplied by uitofp(1 << %i).
; The expected output for all three checked targets keeps the work as separate
; per-lane steps (shift of the constant 1, unsigned-int-to-float convert, then a
; multiply by 0x41100000, the f32 encoding of 9.0), i.e. the multiply is not
; turned into an exponent adjustment for this input.
11 define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) {
12 ; VI-LABEL: fmul_pow2_4xfloat:
14 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15 ; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1
16 ; VI-NEXT: v_lshlrev_b32_e64 v1, v1, 1
17 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, 1
18 ; VI-NEXT: v_lshlrev_b32_e64 v3, v3, 1
19 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
20 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1
21 ; VI-NEXT: v_cvt_f32_u32_e32 v2, v2
22 ; VI-NEXT: v_cvt_f32_u32_e32 v3, v3
23 ; VI-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
24 ; VI-NEXT: v_mul_f32_e32 v1, 0x41100000, v1
25 ; VI-NEXT: v_mul_f32_e32 v2, 0x41100000, v2
26 ; VI-NEXT: v_mul_f32_e32 v3, 0x41100000, v3
27 ; VI-NEXT: s_setpc_b64 s[30:31]
29 ; GFX10-LABEL: fmul_pow2_4xfloat:
31 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1
33 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, 1
34 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, 1
35 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, 1
36 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
37 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
38 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
39 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3
40 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
41 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x41100000, v1
42 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x41100000, v2
43 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x41100000, v3
44 ; GFX10-NEXT: s_setpc_b64 s[30:31]
46 ; GFX11-LABEL: fmul_pow2_4xfloat:
48 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1
50 ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 1
51 ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, 1
52 ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v3, 1
53 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
54 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
55 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
56 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
57 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
58 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
59 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
60 ; GFX11-NEXT: v_dual_mul_f32 v0, 0x41100000, v0 :: v_dual_mul_f32 v1, 0x41100000, v1
61 ; GFX11-NEXT: v_dual_mul_f32 v2, 0x41100000, v2 :: v_dual_mul_f32 v3, 0x41100000, v3
62 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: per-lane power of two (1 << %i), converted unsigned to float,
; then scaled by the constant 9.0.
63 %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
64 %p2_f = uitofp <4 x i32> %p2 to <4 x float>
65 %r = fmul <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f
; Reference form of the same scaling written directly with the ldexp intrinsic:
; ldexp(splat 9.0, %i) on <4 x float>. The expected output is one v_ldexp_f32
; per lane on every checked target; on VI the constant 0x41100000 (9.0) is
; first materialized in s4, while GFX10 and GFX11 use it as a literal operand.
69 define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
70 ; VI-LABEL: fmul_pow2_ldexp_4xfloat:
72 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73 ; VI-NEXT: s_mov_b32 s4, 0x41100000
74 ; VI-NEXT: v_ldexp_f32 v0, s4, v0
75 ; VI-NEXT: v_ldexp_f32 v1, s4, v1
76 ; VI-NEXT: v_ldexp_f32 v2, s4, v2
77 ; VI-NEXT: v_ldexp_f32 v3, s4, v3
78 ; VI-NEXT: s_setpc_b64 s[30:31]
80 ; GFX10-LABEL: fmul_pow2_ldexp_4xfloat:
82 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83 ; GFX10-NEXT: v_ldexp_f32 v0, 0x41100000, v0
84 ; GFX10-NEXT: v_ldexp_f32 v1, 0x41100000, v1
85 ; GFX10-NEXT: v_ldexp_f32 v2, 0x41100000, v2
86 ; GFX10-NEXT: v_ldexp_f32 v3, 0x41100000, v3
87 ; GFX10-NEXT: s_setpc_b64 s[30:31]
89 ; GFX11-LABEL: fmul_pow2_ldexp_4xfloat:
91 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92 ; GFX11-NEXT: v_ldexp_f32 v0, 0x41100000, v0
93 ; GFX11-NEXT: v_ldexp_f32 v1, 0x41100000, v1
94 ; GFX11-NEXT: v_ldexp_f32 v2, 0x41100000, v2
95 ; GFX11-NEXT: v_ldexp_f32 v3, 0x41100000, v3
96 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: a single vector ldexp call with a constant splat mantissa.
97 %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
; Vector f32 divide case: a splat of 9.0 is divided by uitofp(1 << %i).
; The expected output contains no floating-point instruction at all: each lane
; shifts %i left by 23 (placing it at the f32 exponent field) and subtracts the
; result from 0x41100000, the bit pattern of 9.0, using integer subtraction.
101 define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) {
102 ; VI-LABEL: fdiv_pow2_4xfloat:
104 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105 ; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
106 ; VI-NEXT: v_lshlrev_b32_e32 v1, 23, v1
107 ; VI-NEXT: v_lshlrev_b32_e32 v2, 23, v2
108 ; VI-NEXT: v_lshlrev_b32_e32 v3, 23, v3
109 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41100000, v0
110 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 0x41100000, v1
111 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41100000, v2
112 ; VI-NEXT: v_sub_u32_e32 v3, vcc, 0x41100000, v3
113 ; VI-NEXT: s_setpc_b64 s[30:31]
115 ; GFX10-LABEL: fdiv_pow2_4xfloat:
117 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
119 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 23, v1
120 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 23, v2
121 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 23, v3
122 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x41100000, v0
123 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41100000, v1
124 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x41100000, v2
125 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 0x41100000, v3
126 ; GFX10-NEXT: s_setpc_b64 s[30:31]
128 ; GFX11-LABEL: fdiv_pow2_4xfloat:
130 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
131 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
132 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 23, v1
133 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 23, v2
134 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 23, v3
135 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
136 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x41100000, v0
137 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x41100000, v1
138 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
139 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x41100000, v2
140 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 0x41100000, v3
141 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: per-lane power of two (1 << %i), converted unsigned to float,
; used as the divisor of the constant 9.0.
142 %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
143 %p2_f = uitofp <4 x i32> %p2 to <4 x float>
144 %r = fdiv <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f
148 declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
; Vector f16 multiply case: a splat of half 0xH7000 is multiplied by
; uitofp(1 << %i) on <8 x half>. The expected output keeps shift, convert and
; multiply as separate steps. On VI each 16-bit half is processed on its own
; (SDWA forms for the high halves, recombined with v_or_b32); GFX10 and GFX11
; use packed v_pk_lshlrev_b16 and v_pk_mul_f16 with unpacked conversions that
; are re-packed by v_pack_b32_f16.
150 define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
151 ; VI-LABEL: fmul_pow2_8xhalf:
153 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154 ; VI-NEXT: v_mov_b32_e32 v5, 1
155 ; VI-NEXT: v_lshlrev_b16_e64 v4, v3, 1
156 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
157 ; VI-NEXT: v_lshlrev_b16_e64 v6, v2, 1
158 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
159 ; VI-NEXT: v_lshlrev_b16_e64 v7, v1, 1
160 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
161 ; VI-NEXT: v_lshlrev_b16_e64 v8, v0, 1
162 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
163 ; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
164 ; VI-NEXT: v_cvt_f16_u16_e32 v5, v8
165 ; VI-NEXT: v_cvt_f16_u16_e32 v1, v1
166 ; VI-NEXT: v_cvt_f16_u16_e32 v7, v7
167 ; VI-NEXT: v_cvt_f16_u16_e32 v2, v2
168 ; VI-NEXT: v_cvt_f16_u16_e32 v6, v6
169 ; VI-NEXT: v_cvt_f16_u16_e32 v3, v3
170 ; VI-NEXT: v_cvt_f16_u16_e32 v4, v4
171 ; VI-NEXT: v_mov_b32_e32 v8, 0x7000
172 ; VI-NEXT: v_mul_f16_e32 v4, 0x7000, v4
173 ; VI-NEXT: v_mul_f16_sdwa v3, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
174 ; VI-NEXT: v_mul_f16_e32 v6, 0x7000, v6
175 ; VI-NEXT: v_mul_f16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
176 ; VI-NEXT: v_mul_f16_e32 v7, 0x7000, v7
177 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
178 ; VI-NEXT: v_mul_f16_e32 v5, 0x7000, v5
179 ; VI-NEXT: v_mul_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
180 ; VI-NEXT: v_or_b32_e32 v0, v5, v0
181 ; VI-NEXT: v_or_b32_e32 v1, v7, v1
182 ; VI-NEXT: v_or_b32_e32 v2, v6, v2
183 ; VI-NEXT: v_or_b32_e32 v3, v4, v3
184 ; VI-NEXT: s_setpc_b64 s[30:31]
186 ; GFX10-LABEL: fmul_pow2_8xhalf:
188 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
190 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
191 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
192 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
193 ; GFX10-NEXT: v_cvt_f16_u16_e32 v4, v3
194 ; GFX10-NEXT: v_cvt_f16_u16_e32 v5, v2
195 ; GFX10-NEXT: v_cvt_f16_u16_e32 v6, v1
196 ; GFX10-NEXT: v_cvt_f16_u16_e32 v7, v0
197 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
198 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
199 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
200 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
201 ; GFX10-NEXT: v_pack_b32_f16 v0, v7, v0
202 ; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
203 ; GFX10-NEXT: v_pack_b32_f16 v2, v5, v2
204 ; GFX10-NEXT: v_pack_b32_f16 v3, v4, v3
205 ; GFX10-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
206 ; GFX10-NEXT: v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
207 ; GFX10-NEXT: v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
208 ; GFX10-NEXT: v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
209 ; GFX10-NEXT: s_setpc_b64 s[30:31]
211 ; GFX11-LABEL: fmul_pow2_8xhalf:
213 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
214 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
215 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
216 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
217 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
218 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
219 ; GFX11-NEXT: v_cvt_f16_u16_e32 v4, v3
220 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
221 ; GFX11-NEXT: v_cvt_f16_u16_e32 v5, v2
222 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0
223 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
224 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
225 ; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1
226 ; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
227 ; GFX11-NEXT: v_cvt_f16_u16_e32 v6, v6
228 ; GFX11-NEXT: v_cvt_f16_u16_e32 v7, v7
229 ; GFX11-NEXT: v_cvt_f16_u16_e32 v2, v2
230 ; GFX11-NEXT: v_cvt_f16_u16_e32 v3, v3
231 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
232 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v6
233 ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v7
234 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
235 ; GFX11-NEXT: v_pack_b32_f16 v2, v5, v2
236 ; GFX11-NEXT: v_pack_b32_f16 v3, v4, v3
237 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
238 ; GFX11-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
239 ; GFX11-NEXT: v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
240 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
241 ; GFX11-NEXT: v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
242 ; GFX11-NEXT: v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
243 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: per-lane i16 power of two (1 << %i), converted unsigned to
; half, then scaled by the constant 0xH7000.
244 %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
245 %p2_f = uitofp <8 x i16> %p2 to <8 x half>
246 %r = fmul <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
; Reference form of the f16 scaling written directly with the ldexp intrinsic:
; ldexp(splat 0xH7000, %i) on <8 x half>. The expected output is one
; v_ldexp_f16 per 16-bit element, with the two halves of each dword recombined
; by v_or_b32 on VI and by v_pack_b32_f16 on GFX10 and GFX11.
250 define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
251 ; VI-LABEL: fmul_pow2_ldexp_8xhalf:
253 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254 ; VI-NEXT: v_mov_b32_e32 v5, 0x7000
255 ; VI-NEXT: v_ldexp_f16_e32 v4, 0x7000, v3
256 ; VI-NEXT: v_ldexp_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
257 ; VI-NEXT: v_ldexp_f16_e32 v6, 0x7000, v2
258 ; VI-NEXT: v_ldexp_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
259 ; VI-NEXT: v_ldexp_f16_e32 v7, 0x7000, v1
260 ; VI-NEXT: v_ldexp_f16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
261 ; VI-NEXT: v_ldexp_f16_e32 v8, 0x7000, v0
262 ; VI-NEXT: v_ldexp_f16_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
263 ; VI-NEXT: v_or_b32_e32 v0, v8, v0
264 ; VI-NEXT: v_or_b32_e32 v1, v7, v1
265 ; VI-NEXT: v_or_b32_e32 v2, v6, v2
266 ; VI-NEXT: v_or_b32_e32 v3, v4, v3
267 ; VI-NEXT: s_setpc_b64 s[30:31]
269 ; GFX10-LABEL: fmul_pow2_ldexp_8xhalf:
271 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
272 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x7000
273 ; GFX10-NEXT: v_ldexp_f16_e32 v5, 0x7000, v3
274 ; GFX10-NEXT: v_ldexp_f16_e32 v6, 0x7000, v2
275 ; GFX10-NEXT: v_ldexp_f16_e32 v7, 0x7000, v1
276 ; GFX10-NEXT: v_ldexp_f16_e32 v8, 0x7000, v0
277 ; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
278 ; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
279 ; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
280 ; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
281 ; GFX10-NEXT: v_pack_b32_f16 v0, v8, v0
282 ; GFX10-NEXT: v_pack_b32_f16 v1, v7, v1
283 ; GFX10-NEXT: v_pack_b32_f16 v2, v6, v2
284 ; GFX10-NEXT: v_pack_b32_f16 v3, v5, v3
285 ; GFX10-NEXT: s_setpc_b64 s[30:31]
287 ; GFX11-LABEL: fmul_pow2_ldexp_8xhalf:
289 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
290 ; GFX11-NEXT: v_ldexp_f16_e32 v4, 0x7000, v3
291 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
292 ; GFX11-NEXT: v_ldexp_f16_e32 v5, 0x7000, v2
293 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0
294 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
295 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
296 ; GFX11-NEXT: v_ldexp_f16_e32 v1, 0x7000, v1
297 ; GFX11-NEXT: v_ldexp_f16_e32 v0, 0x7000, v0
298 ; GFX11-NEXT: v_ldexp_f16_e32 v6, 0x7000, v6
299 ; GFX11-NEXT: v_ldexp_f16_e32 v7, 0x7000, v7
300 ; GFX11-NEXT: v_ldexp_f16_e32 v2, 0x7000, v2
301 ; GFX11-NEXT: v_ldexp_f16_e32 v3, 0x7000, v3
302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
303 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v6
304 ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v7
305 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
306 ; GFX11-NEXT: v_pack_b32_f16 v2, v5, v2
307 ; GFX11-NEXT: v_pack_b32_f16 v3, v4, v3
308 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: a single vector ldexp call with a constant splat mantissa.
309 %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
; Vector f16 divide case: a splat of half 0xH7000 is divided by
; uitofp(1 << %i) on <8 x half>. The expected output uses integer arithmetic
; only: every 16-bit element of %i is shifted left by 10 (placing it at the f16
; exponent field) and subtracted from 0x7000. VI does this per half with SDWA
; forms; GFX10 and GFX11 use packed v_pk_lshlrev_b16 and v_pk_sub_i16.
313 define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
314 ; VI-LABEL: fdiv_pow2_8xhalf:
316 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
317 ; VI-NEXT: v_mov_b32_e32 v4, 10
318 ; VI-NEXT: v_lshlrev_b16_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
319 ; VI-NEXT: v_mov_b32_e32 v6, 0x7000
320 ; VI-NEXT: v_lshlrev_b16_e32 v3, 10, v3
321 ; VI-NEXT: v_lshlrev_b16_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
322 ; VI-NEXT: v_lshlrev_b16_e32 v2, 10, v2
323 ; VI-NEXT: v_lshlrev_b16_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
324 ; VI-NEXT: v_lshlrev_b16_e32 v1, 10, v1
325 ; VI-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
326 ; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0
327 ; VI-NEXT: v_sub_u16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
328 ; VI-NEXT: v_sub_u16_sdwa v7, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
329 ; VI-NEXT: v_sub_u16_sdwa v8, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
330 ; VI-NEXT: v_sub_u16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
331 ; VI-NEXT: v_sub_u16_e32 v0, 0x7000, v0
332 ; VI-NEXT: v_sub_u16_e32 v1, 0x7000, v1
333 ; VI-NEXT: v_sub_u16_e32 v2, 0x7000, v2
334 ; VI-NEXT: v_sub_u16_e32 v3, 0x7000, v3
335 ; VI-NEXT: v_or_b32_e32 v0, v0, v4
336 ; VI-NEXT: v_or_b32_e32 v1, v1, v8
337 ; VI-NEXT: v_or_b32_e32 v2, v2, v7
338 ; VI-NEXT: v_or_b32_e32 v3, v3, v5
339 ; VI-NEXT: s_setpc_b64 s[30:31]
341 ; GFX10-LABEL: fdiv_pow2_8xhalf:
343 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1]
345 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1]
346 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1]
347 ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1]
348 ; GFX10-NEXT: v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1]
349 ; GFX10-NEXT: v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1]
350 ; GFX10-NEXT: v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1]
351 ; GFX10-NEXT: v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1]
352 ; GFX10-NEXT: s_setpc_b64 s[30:31]
354 ; GFX11-LABEL: fdiv_pow2_8xhalf:
356 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
357 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1]
358 ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1]
359 ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1]
360 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1]
361 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
362 ; GFX11-NEXT: v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1]
363 ; GFX11-NEXT: v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1]
364 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
365 ; GFX11-NEXT: v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1]
366 ; GFX11-NEXT: v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1]
367 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: per-lane i16 power of two (1 << %i), converted unsigned to
; half, used as the divisor of the constant 0xH7000.
368 %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
369 %p2_f = uitofp <8 x i16> %p2 to <8 x half>
370 %r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
; Scalar f64 case: 9.0 * uitofp(1 << %cnt), with the shift done in i64 and
; marked nuw. The expected output shifts with v_lshlrev_b64, converts the two
; 32-bit halves of the result separately, recombines them with v_ldexp_f64 by
; 32 plus v_add_f64, and finally multiplies by the constant whose high dword is
; 0x40220000 (9.0 as f64). Only the AMDGPU prefixes enabled by the RUN lines of
; this file are checked here.
374 define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
421 ; VI-LABEL: fmul_pow_shl_cnt:
423 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
424 ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 1
425 ; VI-NEXT: s_mov_b32 s4, 0
426 ; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
427 ; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
428 ; VI-NEXT: s_mov_b32 s5, 0x40220000
429 ; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32
430 ; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4]
431 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
432 ; VI-NEXT: s_setpc_b64 s[30:31]
434 ; GFX10-LABEL: fmul_pow_shl_cnt:
436 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
437 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 1
438 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
439 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
440 ; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
441 ; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
442 ; GFX10-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1]
443 ; GFX10-NEXT: s_setpc_b64 s[30:31]
445 ; GFX11-LABEL: fmul_pow_shl_cnt:
447 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
448 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 1
449 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
450 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
451 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
452 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
453 ; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
454 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
455 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
456 ; GFX11-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1]
457 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: i64 power of two (1 << %cnt), converted unsigned to double,
; then scaled by the constant 9.0.
458 %shl = shl nuw i64 1, %cnt
459 %conv = uitofp i64 %shl to double
460 %mul = fmul double 9.000000e+00, %conv
; Scalar f64 case with a shifted base of 2 and a negative multiplier:
; -9.0 * uitofp(2 << %cnt), shift done in i64 and marked nuw. The expected
; output has the same shape as for a base of 1 (v_lshlrev_b64, two 32-bit
; conversions, v_ldexp_f64 by 32, v_add_f64, v_mul_f64), with the shifted
; constant 2 and a multiplier whose high dword is 0xc0220000 (-9.0 as f64).
; Only the AMDGPU prefixes enabled by the RUN lines of this file are checked
; here.
464 define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
511 ; VI-LABEL: fmul_pow_shl_cnt2:
513 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
514 ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2
515 ; VI-NEXT: s_mov_b32 s4, 0
516 ; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
517 ; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
518 ; VI-NEXT: s_mov_b32 s5, 0xc0220000
519 ; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32
520 ; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4]
521 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
522 ; VI-NEXT: s_setpc_b64 s[30:31]
524 ; GFX10-LABEL: fmul_pow_shl_cnt2:
526 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
527 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2
528 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
529 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
530 ; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
531 ; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
532 ; GFX10-NEXT: v_mul_f64 v[0:1], 0xc0220000, v[0:1]
533 ; GFX10-NEXT: s_setpc_b64 s[30:31]
535 ; GFX11-LABEL: fmul_pow_shl_cnt2:
537 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2
539 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
540 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
541 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
542 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
543 ; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
544 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
545 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
546 ; GFX11-NEXT: v_mul_f64 v[0:1], 0xc0220000, v[0:1]
547 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: i64 value 2 << %cnt, converted unsigned to double, then
; scaled by the constant -9.0.
548 %shl = shl nuw i64 2, %cnt
549 %conv = uitofp i64 %shl to double
550 %mul = fmul double -9.000000e+00, %conv
; Scalar f32 case where the power of two comes through a select:
; 9.0 * uitofp(%c ? (1 << %cnt) : (2 << %cnt)). The expected output tests the
; low bit of %c, picks the shift base (1 or 2) with v_cndmask_b32, performs a
; single v_lshlrev_b32, then converts and multiplies by 0x41100000 (9.0 as
; f32). Only the AMDGPU prefixes enabled by the RUN lines of this file are
; checked here.
554 define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
600 ; VI-LABEL: fmul_pow_select:
602 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
603 ; VI-NEXT: v_and_b32_e32 v1, 1, v1
604 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
605 ; VI-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc
606 ; VI-NEXT: v_lshlrev_b32_e32 v0, v0, v1
607 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
608 ; VI-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
609 ; VI-NEXT: s_setpc_b64 s[30:31]
611 ; GFX10-LABEL: fmul_pow_select:
613 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
614 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
615 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
616 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo
617 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, v0, v1
618 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
619 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
620 ; GFX10-NEXT: s_setpc_b64 s[30:31]
622 ; GFX11-LABEL: fmul_pow_select:
624 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
626 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
627 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
628 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo
629 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v1
630 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
631 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
632 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
633 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: both candidate shifts are computed, %c selects between them,
; and the chosen value is converted unsigned to float and scaled by 9.0.
634 %shl2 = shl nuw i32 2, %cnt
635 %shl1 = shl nuw i32 1, %cnt
636 %shl = select i1 %c, i32 %shl1, i32 %shl2
637 %conv = uitofp i32 %shl to float
638 %mul = fmul float 9.000000e+00, %conv
642 define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
643 ; CHECK-SSE-LABEL: fmul_fly_pow_mul_min_pow2:
644 ; CHECK-SSE: # %bb.0:
645 ; CHECK-SSE-NEXT: movq %rdi, %rcx
646 ; CHECK-SSE-NEXT: movl $8, %eax
647 ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
648 ; CHECK-SSE-NEXT: shlq %cl, %rax
649 ; CHECK-SSE-NEXT: cmpq $8192, %rax # imm = 0x2000
650 ; CHECK-SSE-NEXT: movl $8192, %ecx # imm = 0x2000
651 ; CHECK-SSE-NEXT: cmovbq %rax, %rcx
652 ; CHECK-SSE-NEXT: cvtsi2ss %rcx, %xmm0
653 ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
654 ; CHECK-SSE-NEXT: retq
656 ; CHECK-AVX2-LABEL: fmul_fly_pow_mul_min_pow2:
657 ; CHECK-AVX2: # %bb.0:
658 ; CHECK-AVX2-NEXT: movq %rdi, %rcx
659 ; CHECK-AVX2-NEXT: movl $8, %eax
660 ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
661 ; CHECK-AVX2-NEXT: shlq %cl, %rax
662 ; CHECK-AVX2-NEXT: cmpq $8192, %rax # imm = 0x2000
663 ; CHECK-AVX2-NEXT: movl $8192, %ecx # imm = 0x2000
664 ; CHECK-AVX2-NEXT: cmovbq %rax, %rcx
665 ; CHECK-AVX2-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0
666 ; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
667 ; CHECK-AVX2-NEXT: retq
669 ; CHECK-NO-FASTFMA-LABEL: fmul_fly_pow_mul_min_pow2:
670 ; CHECK-NO-FASTFMA: # %bb.0:
671 ; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
672 ; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
673 ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
674 ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
675 ; CHECK-NO-FASTFMA-NEXT: cmpq $8192, %rax # imm = 0x2000
676 ; CHECK-NO-FASTFMA-NEXT: movl $8192, %ecx # imm = 0x2000
677 ; CHECK-NO-FASTFMA-NEXT: cmovbq %rax, %rcx
678 ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0
679 ; CHECK-NO-FASTFMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
680 ; CHECK-NO-FASTFMA-NEXT: retq
682 ; CHECK-FMA-LABEL: fmul_fly_pow_mul_min_pow2:
683 ; CHECK-FMA: # %bb.0:
684 ; CHECK-FMA-NEXT: movl $8, %eax
685 ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
686 ; CHECK-FMA-NEXT: cmpq $8192, %rax # imm = 0x2000
687 ; CHECK-FMA-NEXT: movl $8192, %ecx # imm = 0x2000
688 ; CHECK-FMA-NEXT: cmovbq %rax, %rcx
689 ; CHECK-FMA-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0
690 ; CHECK-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
691 ; CHECK-FMA-NEXT: retq
692 ; VI-LABEL: fmul_fly_pow_mul_min_pow2:
694 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
695 ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8
696 ; VI-NEXT: s_mov_b64 s[4:5], 0x2000
697 ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
698 ; VI-NEXT: v_mov_b32_e32 v2, 0x2000
699 ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
700 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
701 ; VI-NEXT: v_ffbh_u32_e32 v2, v1
702 ; VI-NEXT: v_min_u32_e32 v2, 32, v2
703 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
704 ; VI-NEXT: v_min_u32_e32 v0, 1, v0
705 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
706 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
707 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
708 ; VI-NEXT: v_ldexp_f32 v0, v0, v1
709 ; VI-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
710 ; VI-NEXT: s_setpc_b64 s[30:31]
712 ; GFX10-LABEL: fmul_fly_pow_mul_min_pow2:
714 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
715 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8
716 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
717 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
718 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
719 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
720 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
721 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
722 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
723 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
724 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
725 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
726 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
727 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
728 ; GFX10-NEXT: s_setpc_b64 s[30:31]
730 ; GFX11-LABEL: fmul_fly_pow_mul_min_pow2:
732 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
733 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8
734 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
735 ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
736 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
737 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
738 ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1
739 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
740 ; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
741 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
742 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
743 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
744 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
745 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
746 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
747 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
748 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
749 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
750 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x41100000, v0
751 ; GFX11-NEXT: s_setpc_b64 s[30:31]
752 %shl8 = shl nuw i64 8, %cnt
753 %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192)
754 %conv = uitofp i64 %shl to float
755 %mul = fmul float 9.000000e+00, %conv
; Test input: multiplies 3.0 by uitofp(umax(1 << cnt, 2 << cnt)) for an i16
; shift count. Both umax operands are `shl nuw` of a power-of-two constant.
;
; In the AMDGPU assertions below no max instruction remains: only the
; `2 << cnt` shift is emitted, followed by an unsigned-int-to-double
; conversion and a v_mul_f64 whose literal 0x40080000 is the high dword of 3.0.
;
; NOTE(review): the CHECK-SSE / CHECK-AVX2 / CHECK-NO-FASTFMA / CHECK-FMA
; assertions use prefixes that the RUN lines at the top of this file do not
; appear to enable (those pass only VI, GFX10 and GFX11), so FileCheck
; presumably ignores them. They look like leftovers from an x86 variant of this
; test -- confirm before relying on or removing them.
759 define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
760 ; CHECK-SSE-LABEL: fmul_pow_mul_max_pow2:
761 ; CHECK-SSE: # %bb.0:
762 ; CHECK-SSE-NEXT: movl %edi, %ecx
763 ; CHECK-SSE-NEXT: movl $2, %eax
764 ; CHECK-SSE-NEXT: shll %cl, %eax
765 ; CHECK-SSE-NEXT: movl $1, %edx
766 ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
767 ; CHECK-SSE-NEXT: shll %cl, %edx
768 ; CHECK-SSE-NEXT: cmpw %ax, %dx
769 ; CHECK-SSE-NEXT: cmovbel %eax, %edx
770 ; CHECK-SSE-NEXT: movzwl %dx, %eax
771 ; CHECK-SSE-NEXT: cvtsi2sd %eax, %xmm0
772 ; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
773 ; CHECK-SSE-NEXT: retq
775 ; CHECK-AVX2-LABEL: fmul_pow_mul_max_pow2:
776 ; CHECK-AVX2: # %bb.0:
777 ; CHECK-AVX2-NEXT: movl %edi, %ecx
778 ; CHECK-AVX2-NEXT: movl $2, %eax
779 ; CHECK-AVX2-NEXT: shll %cl, %eax
780 ; CHECK-AVX2-NEXT: movl $1, %edx
781 ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
782 ; CHECK-AVX2-NEXT: shll %cl, %edx
783 ; CHECK-AVX2-NEXT: cmpw %ax, %dx
784 ; CHECK-AVX2-NEXT: cmovbel %eax, %edx
785 ; CHECK-AVX2-NEXT: movzwl %dx, %eax
786 ; CHECK-AVX2-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0
787 ; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
788 ; CHECK-AVX2-NEXT: retq
790 ; CHECK-NO-FASTFMA-LABEL: fmul_pow_mul_max_pow2:
791 ; CHECK-NO-FASTFMA: # %bb.0:
792 ; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
793 ; CHECK-NO-FASTFMA-NEXT: movl $2, %eax
794 ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
795 ; CHECK-NO-FASTFMA-NEXT: movl $1, %edx
796 ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
797 ; CHECK-NO-FASTFMA-NEXT: shll %cl, %edx
798 ; CHECK-NO-FASTFMA-NEXT: cmpw %ax, %dx
799 ; CHECK-NO-FASTFMA-NEXT: cmovbel %eax, %edx
800 ; CHECK-NO-FASTFMA-NEXT: movzwl %dx, %eax
801 ; CHECK-NO-FASTFMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0
802 ; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
803 ; CHECK-NO-FASTFMA-NEXT: retq
805 ; CHECK-FMA-LABEL: fmul_pow_mul_max_pow2:
806 ; CHECK-FMA: # %bb.0:
807 ; CHECK-FMA-NEXT: movl $2, %eax
808 ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
809 ; CHECK-FMA-NEXT: movl $1, %ecx
810 ; CHECK-FMA-NEXT: shlxl %edi, %ecx, %ecx
811 ; CHECK-FMA-NEXT: cmpw %ax, %cx
812 ; CHECK-FMA-NEXT: cmoval %ecx, %eax
813 ; CHECK-FMA-NEXT: movzwl %ax, %eax
814 ; CHECK-FMA-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0
815 ; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
816 ; CHECK-FMA-NEXT: retq
817 ; VI-LABEL: fmul_pow_mul_max_pow2:
819 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
820 ; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 2
821 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
822 ; VI-NEXT: s_mov_b32 s4, 0
823 ; VI-NEXT: s_mov_b32 s5, 0x40080000
824 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
825 ; VI-NEXT: s_setpc_b64 s[30:31]
827 ; GFX10-LABEL: fmul_pow_mul_max_pow2:
829 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
830 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, 2
831 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
832 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
833 ; GFX10-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1]
834 ; GFX10-NEXT: s_setpc_b64 s[30:31]
836 ; GFX11-LABEL: fmul_pow_mul_max_pow2:
838 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
839 ; GFX11-NEXT: v_lshlrev_b16 v0, v0, 2
840 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
841 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
842 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
843 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
844 ; GFX11-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1]
845 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: %shl = umax(1 << cnt, 2 << cnt), then 3.0 * (double)%shl.
846 %shl2 = shl nuw i16 2, %cnt
847 %shl1 = shl nuw i16 1, %cnt
848 %shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2)
849 %conv = uitofp i16 %shl to double
850 %mul = fmul double 3.000000e+00, %conv
; Test input: multiplies 9.0 by uitofp(%v << %cnt) where the shifted value %v
; is an arbitrary i64 argument rather than a constant.
;
; The AMDGPU assertions below keep the full sequence: a 64-bit shift, a
; two-halves u32-to-f64 conversion recombined with v_ldexp_f64 / v_add_f64, and
; a v_mul_f64 whose literal 0x40220000 is the high dword of 9.0.
; NOTE(review): per the function name this is presumably a negative test (the
; shifted value may not be a power of two) -- confirm against the combine.
;
; NOTE(review): the CHECK-SSE / CHECK-AVX2 / CHECK-NO-FASTFMA / CHECK-FMA
; assertions use prefixes that the RUN lines at the top of this file do not
; appear to enable (those pass only VI, GFX10 and GFX11), so FileCheck
; presumably ignores them -- confirm before relying on or removing them.
854 define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
855 ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
856 ; CHECK-SSE: # %bb.0:
857 ; CHECK-SSE-NEXT: movq %rsi, %rcx
858 ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
859 ; CHECK-SSE-NEXT: shlq %cl, %rdi
860 ; CHECK-SSE-NEXT: movq %rdi, %xmm1
861 ; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
862 ; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
863 ; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
864 ; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
865 ; CHECK-SSE-NEXT: addsd %xmm1, %xmm0
866 ; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
867 ; CHECK-SSE-NEXT: retq
869 ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
870 ; CHECK-AVX2: # %bb.0:
871 ; CHECK-AVX2-NEXT: movq %rsi, %rcx
872 ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
873 ; CHECK-AVX2-NEXT: shlq %cl, %rdi
874 ; CHECK-AVX2-NEXT: vmovq %rdi, %xmm0
875 ; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
876 ; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
877 ; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
878 ; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
879 ; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
880 ; CHECK-AVX2-NEXT: retq
882 ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
883 ; CHECK-NO-FASTFMA: # %bb.0:
884 ; CHECK-NO-FASTFMA-NEXT: movq %rsi, %rcx
885 ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
886 ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rdi
887 ; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0
888 ; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
889 ; CHECK-NO-FASTFMA-NEXT: retq
891 ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
892 ; CHECK-FMA: # %bb.0:
893 ; CHECK-FMA-NEXT: shlxq %rsi, %rdi, %rax
894 ; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0
895 ; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
896 ; CHECK-FMA-NEXT: retq
897 ; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
899 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
900 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
901 ; VI-NEXT: s_mov_b32 s4, 0
902 ; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
903 ; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
904 ; VI-NEXT: s_mov_b32 s5, 0x40220000
905 ; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32
906 ; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4]
907 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
908 ; VI-NEXT: s_setpc_b64 s[30:31]
910 ; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
912 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
913 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
914 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
915 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
916 ; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
917 ; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
918 ; GFX10-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1]
919 ; GFX10-NEXT: s_setpc_b64 s[30:31]
921 ; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
923 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
924 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
925 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
926 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
927 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
928 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
929 ; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
930 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
931 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
932 ; GFX11-NEXT: v_mul_f64 v[0:1], 0x40220000, v[0:1]
933 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: 9.0 * (double)(%v << %cnt); the shift base is a variable.
934 %shl = shl nuw i64 %v, %cnt
935 %conv = uitofp i64 %shl to double
936 %mul = fmul double 9.000000e+00, %conv
; Test input: multiplies a splat of 15.0 by uitofp(<2,2> << %cnt), converting
; <2 x i64> to <2 x float>.
;
; The AMDGPU assertions below expand each i64-to-f32 conversion into a
; leading-zero count (v_ffbh_u32 on VI/GFX10, v_clz_i32_u32 on GFX11), a
; normalising 64-bit shift, v_cvt_f32_u32 and v_ldexp_f32, and then still
; perform the multiply with literal 0x41700000 (15.0 as f32).
; NOTE(review): per the function name the fold is presumably rejected because
; the integer-to-float cast is considered expensive -- confirm against the
; combine's profitability check.
;
; NOTE(review): the CHECK-SSE / CHECK-AVX2 / CHECK-NO-FASTFMA / CHECK-FMA
; assertions use prefixes that the RUN lines at the top of this file do not
; appear to enable (those pass only VI, GFX10 and GFX11), so FileCheck
; presumably ignores them -- confirm before relying on or removing them.
940 define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nounwind {
941 ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
942 ; CHECK-SSE: # %bb.0:
943 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
944 ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm3 = [2,2]
945 ; CHECK-SSE-NEXT: movdqa %xmm3, %xmm1
946 ; CHECK-SSE-NEXT: psllq %xmm2, %xmm1
947 ; CHECK-SSE-NEXT: psllq %xmm0, %xmm3
948 ; CHECK-SSE-NEXT: movq %xmm3, %rax
949 ; CHECK-SSE-NEXT: testq %rax, %rax
950 ; CHECK-SSE-NEXT: js .LBB6_1
951 ; CHECK-SSE-NEXT: # %bb.2:
952 ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
953 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
954 ; CHECK-SSE-NEXT: jmp .LBB6_3
955 ; CHECK-SSE-NEXT: .LBB6_1:
956 ; CHECK-SSE-NEXT: movq %rax, %rcx
957 ; CHECK-SSE-NEXT: shrq %rcx
958 ; CHECK-SSE-NEXT: andl $1, %eax
959 ; CHECK-SSE-NEXT: orq %rcx, %rax
960 ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
961 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
962 ; CHECK-SSE-NEXT: addss %xmm0, %xmm0
963 ; CHECK-SSE-NEXT: .LBB6_3:
964 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
965 ; CHECK-SSE-NEXT: movq %xmm1, %rax
966 ; CHECK-SSE-NEXT: testq %rax, %rax
967 ; CHECK-SSE-NEXT: js .LBB6_4
968 ; CHECK-SSE-NEXT: # %bb.5:
969 ; CHECK-SSE-NEXT: xorps %xmm1, %xmm1
970 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
971 ; CHECK-SSE-NEXT: jmp .LBB6_6
972 ; CHECK-SSE-NEXT: .LBB6_4:
973 ; CHECK-SSE-NEXT: movq %rax, %rcx
974 ; CHECK-SSE-NEXT: shrq %rcx
975 ; CHECK-SSE-NEXT: andl $1, %eax
976 ; CHECK-SSE-NEXT: orq %rcx, %rax
977 ; CHECK-SSE-NEXT: xorps %xmm1, %xmm1
978 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
979 ; CHECK-SSE-NEXT: addss %xmm1, %xmm1
980 ; CHECK-SSE-NEXT: .LBB6_6:
981 ; CHECK-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
982 ; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
983 ; CHECK-SSE-NEXT: retq
985 ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
986 ; CHECK-AVX2: # %bb.0:
987 ; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
988 ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
989 ; CHECK-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm1
990 ; CHECK-AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
991 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm1, %rax
992 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
993 ; CHECK-AVX2-NEXT: vmovq %xmm1, %rax
994 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
995 ; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
996 ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2
997 ; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
998 ; CHECK-AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
999 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
1000 ; CHECK-AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
1001 ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
1002 ; CHECK-AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0
1003 ; CHECK-AVX2-NEXT: retq
1005 ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
1006 ; CHECK-NO-FASTFMA: # %bb.0:
1007 ; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
1008 ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
1009 ; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax
1010 ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm1
1011 ; CHECK-NO-FASTFMA-NEXT: vmovq %xmm0, %rax
1012 ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0
1013 ; CHECK-NO-FASTFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
1014 ; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
1015 ; CHECK-NO-FASTFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
1016 ; CHECK-NO-FASTFMA-NEXT: retq
1018 ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
1019 ; CHECK-FMA: # %bb.0:
1020 ; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
1021 ; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
1022 ; CHECK-FMA-NEXT: vcvtuqq2ps %xmm0, %xmm0
1023 ; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1024 ; CHECK-FMA-NEXT: retq
1025 ; VI-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
1027 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1028 ; VI-NEXT: v_lshlrev_b64 v[1:2], v2, 2
1029 ; VI-NEXT: v_ffbh_u32_e32 v3, v2
1030 ; VI-NEXT: v_min_u32_e32 v5, 32, v3
1031 ; VI-NEXT: v_lshlrev_b64 v[1:2], v5, v[1:2]
1032 ; VI-NEXT: v_lshlrev_b64 v[3:4], v0, 2
1033 ; VI-NEXT: v_min_u32_e32 v0, 1, v1
1034 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
1035 ; VI-NEXT: v_cvt_f32_u32_e32 v2, v0
1036 ; VI-NEXT: v_ffbh_u32_e32 v0, v4
1037 ; VI-NEXT: v_min_u32_e32 v6, 32, v0
1038 ; VI-NEXT: v_lshlrev_b64 v[0:1], v6, v[3:4]
1039 ; VI-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
1040 ; VI-NEXT: v_min_u32_e32 v0, 1, v0
1041 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1042 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
1043 ; VI-NEXT: v_ldexp_f32 v1, v2, v3
1044 ; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
1045 ; VI-NEXT: v_ldexp_f32 v0, v0, v2
1046 ; VI-NEXT: v_mul_f32_e32 v0, 0x41700000, v0
1047 ; VI-NEXT: v_mul_f32_e32 v1, 0x41700000, v1
1048 ; VI-NEXT: s_setpc_b64 s[30:31]
1050 ; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
1052 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1053 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2
1054 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 2
1055 ; GFX10-NEXT: v_ffbh_u32_e32 v4, v1
1056 ; GFX10-NEXT: v_ffbh_u32_e32 v5, v3
1057 ; GFX10-NEXT: v_min_u32_e32 v4, 32, v4
1058 ; GFX10-NEXT: v_min_u32_e32 v5, 32, v5
1059 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
1060 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
1061 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
1062 ; GFX10-NEXT: v_min_u32_e32 v2, 1, v2
1063 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
1064 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v2
1065 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 32, v5
1066 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 32, v4
1067 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
1068 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
1069 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v3
1070 ; GFX10-NEXT: v_ldexp_f32 v1, v1, v2
1071 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x41700000, v0
1072 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x41700000, v1
1073 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1075 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
1077 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1078 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2
1079 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2
1080 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1081 ; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1
1082 ; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3
1083 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1084 ; GFX11-NEXT: v_min_u32_e32 v4, 32, v4
1085 ; GFX11-NEXT: v_min_u32_e32 v5, 32, v5
1086 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1087 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
1088 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
1089 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1090 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
1091 ; GFX11-NEXT: v_min_u32_e32 v2, 1, v2
1092 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1093 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
1094 ; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
1095 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v5
1096 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v4
1097 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1098 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
1099 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
1100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1101 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v3
1102 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
1103 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1104 ; GFX11-NEXT: v_dual_mul_f32 v0, 0x41700000, v0 :: v_dual_mul_f32 v1, 0x41700000, v1
1105 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: 15.0 * (float)(2 << cnt), element-wise on two i64 lanes.
1106 %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
1107 %conv = uitofp <2 x i64> %shl to <2 x float>
1108 %mul = fmul <2 x float> <float 15.000000e+00, float 15.000000e+00>, %conv
1109 ret <2 x float> %mul
; Test input: multiplies a splat of 15.0 by uitofp(<2,2> << %cnt), converting
; <2 x i64> to <2 x double>.
;
; The AMDGPU assertions below still contain, per lane, a 64-bit shift, a
; two-halves u32-to-f64 conversion recombined with v_ldexp_f64 / v_add_f64, and
; a v_mul_f64 whose literal 0x402e0000 is the high dword of 15.0 -- i.e. the
; multiply is not folded away in the recorded output for these targets.
;
; NOTE(review): the CHECK-SSE / CHECK-AVX2 / CHECK-NO-FASTFMA / CHECK-FMA
; assertions use prefixes that the RUN lines at the top of this file do not
; appear to enable (those pass only VI, GFX10 and GFX11), so FileCheck
; presumably ignores them -- confirm before relying on or removing them.
1112 define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
1113 ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec:
1114 ; CHECK-SSE: # %bb.0:
1115 ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2,2]
1116 ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm2
1117 ; CHECK-SSE-NEXT: psllq %xmm0, %xmm2
1118 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1119 ; CHECK-SSE-NEXT: psllq %xmm0, %xmm1
1120 ; CHECK-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1121 ; CHECK-SSE-NEXT: movapd {{.*#+}} xmm0 = [4294967295,4294967295]
1122 ; CHECK-SSE-NEXT: andpd %xmm1, %xmm0
1123 ; CHECK-SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1124 ; CHECK-SSE-NEXT: psrlq $32, %xmm1
1125 ; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1126 ; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1127 ; CHECK-SSE-NEXT: addpd %xmm0, %xmm1
1128 ; CHECK-SSE-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1129 ; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
1130 ; CHECK-SSE-NEXT: retq
1132 ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec:
1133 ; CHECK-AVX2: # %bb.0:
1134 ; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
1135 ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
1136 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1137 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1138 ; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1139 ; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
1140 ; CHECK-AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1141 ; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1142 ; CHECK-AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0
1143 ; CHECK-AVX2-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1144 ; CHECK-AVX2-NEXT: retq
1146 ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec:
1147 ; CHECK-NO-FASTFMA: # %bb.0:
1148 ; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
1149 ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
1150 ; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1
1151 ; CHECK-NO-FASTFMA-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1152 ; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1153 ; CHECK-NO-FASTFMA-NEXT: vpsrlq $32, %xmm0, %xmm0
1154 ; CHECK-NO-FASTFMA-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1155 ; CHECK-NO-FASTFMA-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1156 ; CHECK-NO-FASTFMA-NEXT: vaddpd %xmm0, %xmm1, %xmm0
1157 ; CHECK-NO-FASTFMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1158 ; CHECK-NO-FASTFMA-NEXT: retq
1160 ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec:
1161 ; CHECK-FMA: # %bb.0:
1162 ; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
1163 ; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
1164 ; CHECK-FMA-NEXT: vcvtuqq2pd %xmm0, %xmm0
1165 ; CHECK-FMA-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
1166 ; CHECK-FMA-NEXT: retq
1167 ; VI-LABEL: fmul_pow_shl_cnt_vec:
1169 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1170 ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2
1171 ; VI-NEXT: v_lshlrev_b64 v[2:3], v2, 2
1172 ; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
1173 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
1174 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
1175 ; VI-NEXT: s_mov_b32 s4, 0
1176 ; VI-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
1177 ; VI-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
1178 ; VI-NEXT: v_cvt_f64_u32_e32 v[7:8], v2
1179 ; VI-NEXT: s_mov_b32 s5, 0x402e0000
1180 ; VI-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
1181 ; VI-NEXT: v_add_f64 v[2:3], v[5:6], v[7:8]
1182 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
1183 ; VI-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5]
1184 ; VI-NEXT: s_setpc_b64 s[30:31]
1186 ; GFX10-LABEL: fmul_pow_shl_cnt_vec:
1188 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1189 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2
1190 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 2
1191 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
1192 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
1193 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
1194 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
1195 ; GFX10-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
1196 ; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
1197 ; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
1198 ; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
1199 ; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
1200 ; GFX10-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3]
1201 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1203 ; GFX11-LABEL: fmul_pow_shl_cnt_vec:
1205 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1206 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2
1207 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2
1208 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1209 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
1210 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
1211 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1212 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
1213 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
1214 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1215 ; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
1216 ; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
1217 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1218 ; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
1219 ; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
1220 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1221 ; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
1222 ; GFX11-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3]
1223 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: 15.0 * (double)(2 << cnt), element-wise on two i64 lanes.
1224 %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
1225 %conv = uitofp <2 x i64> %shl to <2 x double>
1226 %mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv
1227 ret <2 x double> %mul
; Test input: computes 5.0 * uitofp(<2,2,2,2> << %cnt) + %add on four i32
; lanes, i.e. an fmul whose only visible use is an fadd.
;
; The AMDGPU assertions below keep, per lane, the shift, v_cvt_f32_u32, a
; multiply by literal 0x40a00000 (5.0 as f32) and a separate add; GFX11 pairs
; them into v_dual_mul_f32 / v_dual_add_f32. No fused multiply-add appears in
; the recorded AMDGPU output.
; NOTE(review): per the function name the intent is presumably that the pow2
; fold must not break up a mul+add that could become an FMA -- confirm.
;
; NOTE(review): the CHECK-SSE / CHECK-AVX2 / CHECK-NO-FASTFMA / CHECK-FMA
; assertions use prefixes that the RUN lines at the top of this file do not
; appear to enable (those pass only VI, GFX10 and GFX11), so FileCheck
; presumably ignores them -- confirm before relying on or removing them.
1230 define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float> %add) nounwind {
1231 ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
1232 ; CHECK-SSE: # %bb.0:
1233 ; CHECK-SSE-NEXT: pslld $23, %xmm0
1234 ; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1235 ; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
1236 ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2]
1237 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1238 ; CHECK-SSE-NEXT: pmuludq %xmm2, %xmm0
1239 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1240 ; CHECK-SSE-NEXT: pmuludq %xmm2, %xmm3
1241 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1242 ; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1243 ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
1244 ; CHECK-SSE-NEXT: pand %xmm0, %xmm2
1245 ; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1246 ; CHECK-SSE-NEXT: psrld $16, %xmm0
1247 ; CHECK-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1248 ; CHECK-SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1249 ; CHECK-SSE-NEXT: addps %xmm2, %xmm0
1250 ; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1251 ; CHECK-SSE-NEXT: addps %xmm1, %xmm0
1252 ; CHECK-SSE-NEXT: retq
1254 ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
1255 ; CHECK-AVX2: # %bb.0:
1256 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
1257 ; CHECK-AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm0
1258 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
1259 ; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
1260 ; CHECK-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
1261 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1392508928,1392508928,1392508928,1392508928]
1262 ; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
1263 ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
1264 ; CHECK-AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0
1265 ; CHECK-AVX2-NEXT: vaddps %xmm0, %xmm2, %xmm0
1266 ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0]
1267 ; CHECK-AVX2-NEXT: vmulps %xmm2, %xmm0, %xmm0
1268 ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
1269 ; CHECK-AVX2-NEXT: retq
1271 ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
1272 ; CHECK-NO-FASTFMA: # %bb.0:
1273 ; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
1274 ; CHECK-NO-FASTFMA-NEXT: vpsllvd %xmm0, %xmm2, %xmm0
1275 ; CHECK-NO-FASTFMA-NEXT: vcvtudq2ps %zmm0, %zmm0
1276 ; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0]
1277 ; CHECK-NO-FASTFMA-NEXT: vmulps %xmm2, %xmm0, %xmm0
1278 ; CHECK-NO-FASTFMA-NEXT: vaddps %xmm1, %xmm0, %xmm0
1279 ; CHECK-NO-FASTFMA-NEXT: vzeroupper
1280 ; CHECK-NO-FASTFMA-NEXT: retq
1282 ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
1283 ; CHECK-FMA: # %bb.0:
1284 ; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
1285 ; CHECK-FMA-NEXT: vpsllvd %xmm0, %xmm2, %xmm0
1286 ; CHECK-FMA-NEXT: vcvtudq2ps %xmm0, %xmm0
1287 ; CHECK-FMA-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
1288 ; CHECK-FMA-NEXT: retq
1289 ; VI-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
1291 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1292 ; VI-NEXT: v_lshlrev_b32_e64 v3, v3, 2
1293 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, 2
1294 ; VI-NEXT: v_lshlrev_b32_e64 v1, v1, 2
1295 ; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 2
1296 ; VI-NEXT: v_cvt_f32_u32_e32 v3, v3
1297 ; VI-NEXT: v_cvt_f32_u32_e32 v2, v2
1298 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1
1299 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
1300 ; VI-NEXT: v_mul_f32_e32 v3, 0x40a00000, v3
1301 ; VI-NEXT: v_mul_f32_e32 v2, 0x40a00000, v2
1302 ; VI-NEXT: v_mul_f32_e32 v1, 0x40a00000, v1
1303 ; VI-NEXT: v_mul_f32_e32 v0, 0x40a00000, v0
1304 ; VI-NEXT: v_add_f32_e32 v0, v0, v4
1305 ; VI-NEXT: v_add_f32_e32 v1, v1, v5
1306 ; VI-NEXT: v_add_f32_e32 v2, v2, v6
1307 ; VI-NEXT: v_add_f32_e32 v3, v3, v7
1308 ; VI-NEXT: s_setpc_b64 s[30:31]
1310 ; GFX10-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
1312 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1313 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 2
1314 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, 2
1315 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, 2
1316 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, 2
1317 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
1318 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
1319 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
1320 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3
1321 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x40a00000, v0
1322 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x40a00000, v1
1323 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x40a00000, v2
1324 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x40a00000, v3
1325 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
1326 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
1327 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
1328 ; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
1329 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1331 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
1333 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1334 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 2
1335 ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 2
1336 ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, 2
1337 ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v3, 2
1338 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1339 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
1340 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
1341 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1342 ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
1343 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
1344 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1345 ; GFX11-NEXT: v_dual_mul_f32 v0, 0x40a00000, v0 :: v_dual_mul_f32 v1, 0x40a00000, v1
1346 ; GFX11-NEXT: v_dual_mul_f32 v2, 0x40a00000, v2 :: v_dual_mul_f32 v3, 0x40a00000, v3
1347 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1348 ; GFX11-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
1349 ; GFX11-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
1350 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: 5.0 * (float)(2 << cnt) + %add, element-wise on four lanes.
1351 %shl = shl nsw nuw <4 x i32> <i32 2, i32 2, i32 2, i32 2>, %cnt
1352 %conv = uitofp <4 x i32> %shl to <4 x float>
1353 %mul = fmul <4 x float> <float 5.000000e+00, float 5.000000e+00, float 5.000000e+00, float 5.000000e+00>, %conv
1354 %res = fadd <4 x float> %mul, %add
1355 ret <4 x float> %res
; Multiplies the non-splat constant vector <15.0, 14.0> by uitofp(<2, 2> << %cnt).
; The expected AMDGPU output still performs the full u64 -> f64 conversion
; (two 32-bit converts, an ldexp by 32 and an add per lane) followed by a real
; multiply; the 0x402e0000 / 0x402c0000 literals are the high dwords of the
; doubles 15.0 and 14.0.
; NOTE(review): the "_todo" suffix presumably marks a fold that is not yet
; performed for non-splat multipliers - confirm against the DAG combine.
; The x86 assertions that used to sit here were removed: no RUN line in this
; file enables their prefixes, so they were never checked.
1358 define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwind {
1413 ; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
1415 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1416 ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2
1417 ; VI-NEXT: s_mov_b32 s4, 0
1418 ; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v1
1419 ; VI-NEXT: v_lshlrev_b64 v[1:2], v2, 2
1420 ; VI-NEXT: s_mov_b32 s5, 0x402e0000
1421 ; VI-NEXT: v_cvt_f64_u32_e32 v[5:6], v2
1422 ; VI-NEXT: v_ldexp_f64 v[2:3], v[3:4], 32
1423 ; VI-NEXT: v_ldexp_f64 v[4:5], v[5:6], 32
1424 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v0
1425 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v1
1426 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7]
1427 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], v[0:1]
1428 ; VI-NEXT: v_mul_f64 v[0:1], v[2:3], s[4:5]
1429 ; VI-NEXT: s_mov_b32 s4, 0
1430 ; VI-NEXT: s_mov_b32 s5, 0x402c0000
1431 ; VI-NEXT: v_mul_f64 v[2:3], v[4:5], s[4:5]
1432 ; VI-NEXT: s_setpc_b64 s[30:31]
1434 ; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
1436 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1437 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2
1438 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 2
1439 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
1440 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
1441 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
1442 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
1443 ; GFX10-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
1444 ; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
1445 ; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
1446 ; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
1447 ; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
1448 ; GFX10-NEXT: v_mul_f64 v[2:3], 0x402c0000, v[2:3]
1449 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1451 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
1453 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1454 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2
1455 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2
1456 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1457 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
1458 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
1459 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1460 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
1461 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
1462 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1463 ; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
1464 ; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
1465 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1466 ; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
1467 ; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
1468 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1469 ; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
1470 ; GFX11-NEXT: v_mul_f64 v[2:3], 0x402c0000, v[2:3]
1471 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1472 %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
1473 %conv = uitofp <2 x i64> %shl to <2 x double>
1474 %mul = fmul <2 x double> <double 15.000000e+00, double 14.000000e+00>, %conv
1475 ret <2 x double> %mul
; Multiplies the splat constant 15.0 by uitofp(<2, 1> << %cnt); here the
; shifted base is the non-splat operand. The expected AMDGPU output shifts the
; two lanes by 2 and 1 respectively, converts each u64 to f64 (two 32-bit
; converts, an ldexp by 32 and an add) and then multiplies both lanes by the
; same literal 0x402e0000 (high dword of the double 15.0).
; NOTE(review): the "_todo" suffix presumably marks a fold that is not yet
; performed for a non-splat shift base - confirm against the DAG combine.
; The x86 assertions that used to sit here were removed: no RUN line in this
; file enables their prefixes, so they were never checked.
1478 define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwind {
1533 ; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
1535 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1536 ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2
1537 ; VI-NEXT: v_lshlrev_b64 v[2:3], v2, 1
1538 ; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
1539 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
1540 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
1541 ; VI-NEXT: s_mov_b32 s4, 0
1542 ; VI-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
1543 ; VI-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
1544 ; VI-NEXT: v_cvt_f64_u32_e32 v[7:8], v2
1545 ; VI-NEXT: s_mov_b32 s5, 0x402e0000
1546 ; VI-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
1547 ; VI-NEXT: v_add_f64 v[2:3], v[5:6], v[7:8]
1548 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
1549 ; VI-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5]
1550 ; VI-NEXT: s_setpc_b64 s[30:31]
1552 ; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
1554 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1555 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2
1556 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 1
1557 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
1558 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
1559 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
1560 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
1561 ; GFX10-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
1562 ; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
1563 ; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
1564 ; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
1565 ; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
1566 ; GFX10-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3]
1567 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1569 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
1571 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1572 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2
1573 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 1
1574 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1575 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1
1576 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3
1577 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1578 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
1579 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2
1580 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1581 ; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32
1582 ; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32
1583 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1584 ; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1]
1585 ; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9]
1586 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1587 ; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1]
1588 ; GFX11-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3]
1589 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1590 %shl = shl nsw nuw <2 x i64> <i64 2, i64 1>, %cnt
1591 %conv = uitofp <2 x i64> %shl to <2 x double>
1592 %mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv
1593 ret <2 x double> %mul
; Half-precision variant: multiplies splat half 15.0 by uitofp(<2, 2> << %cnt)
; on <2 x i16>. The expected AMDGPU output keeps the 16-bit shift, the
; u16 -> f16 conversions and a real f16 multiply by 0x4b80 (the half encoding
; of 15.0); nothing is folded into exponent arithmetic.
; NOTE(review): "fail_to_large" presumably means the fold is rejected because
; the resulting exponent could be too large for half - confirm against the
; DAG combine.
; The x86 assertions that used to sit here were removed: no RUN line in this
; file enables their prefixes, so they were never checked.
1596 define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
1749 ; VI-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
1751 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1752 ; VI-NEXT: v_mov_b32_e32 v1, 2
1753 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1754 ; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 2
1755 ; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
1756 ; VI-NEXT: v_cvt_f16_u16_e32 v1, v1
1757 ; VI-NEXT: v_mov_b32_e32 v2, 0x4b80
1758 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1759 ; VI-NEXT: v_mul_f16_e32 v0, 0x4b80, v0
1760 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
1761 ; VI-NEXT: s_setpc_b64 s[30:31]
1763 ; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
1765 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1766 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
1767 ; GFX10-NEXT: v_cvt_f16_u16_e32 v1, v0
1768 ; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1769 ; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
1770 ; GFX10-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
1771 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1773 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
1775 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1776 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
1777 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1778 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1779 ; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
1780 ; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1
1781 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1782 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
1783 ; GFX11-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
1784 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1785 %shl = shl nsw nuw <2 x i16> <i16 2, i16 2>, %cnt
1786 %conv = uitofp <2 x i16> %shl to <2 x half>
1787 %mul = fmul <2 x half> <half 15.000000e+00, half 15.000000e+00>, %conv
; Scalar case with a very large multiplier: 9.745314e+288 * uitofp(1 << %cnt)
; with an i64 shift count. The expected AMDGPU output keeps the 64-bit shift,
; the u64 -> f64 conversion (two 32-bit converts, an ldexp by 32 and an add)
; and a real multiply by the constant held in an SGPR pair
; (0x7befffff:0xff5f3992).
; NOTE(review): the name suggests the fold is rejected because adding the
; shift count to the constant's exponent might overflow - confirm against the
; DAG combine.
; The x86 assertions that used to sit here were removed: no RUN line in this
; file enables their prefixes, so they were never checked.
1791 define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
1838 ; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
1840 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1841 ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 1
1842 ; VI-NEXT: s_mov_b32 s4, 0xff5f3992
1843 ; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
1844 ; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
1845 ; VI-NEXT: s_mov_b32 s5, 0x7befffff
1846 ; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32
1847 ; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4]
1848 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
1849 ; VI-NEXT: s_setpc_b64 s[30:31]
1851 ; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
1853 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1854 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 1
1855 ; GFX10-NEXT: s_mov_b32 s4, 0xff5f3992
1856 ; GFX10-NEXT: s_mov_b32 s5, 0x7befffff
1857 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
1858 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
1859 ; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
1860 ; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
1861 ; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
1862 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1864 ; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
1866 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1867 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 1
1868 ; GFX11-NEXT: s_mov_b32 s0, 0xff5f3992
1869 ; GFX11-NEXT: s_mov_b32 s1, 0x7befffff
1870 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1871 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1
1872 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0
1873 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1874 ; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32
1875 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4]
1876 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1877 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1]
1878 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1879 %shl = shl nuw i64 1, %cnt
1880 %conv = uitofp i64 %shl to double
1881 %mul = fmul double 9.745314e+288, %conv
; Same large multiplier (9.745314e+288) as the i64 variant, but the shift is
; done in i16, so the shifted value is a 16-bit quantity before it is converted
; to double. The expected AMDGPU output is a 16-bit shift, a single u32 -> f64
; convert (preceded by a 0xffff mask on GFX10/GFX11) and a real multiply by the
; constant held in an SGPR pair.
; NOTE(review): "safe" presumably refers to the narrow shift type bounding the
; exponent adjustment; the recorded output nevertheless still converts and
; multiplies - confirm whether a fold is expected here.
; The x86 assertions that used to sit here were removed: no RUN line in this
; file enables their prefixes, so they were never checked.
1885 define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
1927 ; VI-LABEL: fmul_pow_shl_cnt_safe:
1929 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1930 ; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1
1931 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
1932 ; VI-NEXT: s_mov_b32 s4, 0xff5f3992
1933 ; VI-NEXT: s_mov_b32 s5, 0x7befffff
1934 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
1935 ; VI-NEXT: s_setpc_b64 s[30:31]
1937 ; GFX10-LABEL: fmul_pow_shl_cnt_safe:
1939 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1940 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1
1941 ; GFX10-NEXT: s_mov_b32 s4, 0xff5f3992
1942 ; GFX10-NEXT: s_mov_b32 s5, 0x7befffff
1943 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
1944 ; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
1945 ; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5]
1946 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1948 ; GFX11-LABEL: fmul_pow_shl_cnt_safe:
1950 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1951 ; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1
1952 ; GFX11-NEXT: s_mov_b32 s0, 0xff5f3992
1953 ; GFX11-NEXT: s_mov_b32 s1, 0x7befffff
1954 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1955 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1956 ; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
1957 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1958 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1]
1959 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1960 %shl = shl nuw i16 1, %cnt
1961 %conv = uitofp i16 %shl to double
1962 %mul = fmul double 9.745314e+288, %conv
; Divides splat 1.0 by uitofp(<1, 1> << %cnt) on <2 x i64>. This is the folded
; case: the expected AMDGPU output contains no conversion and no divide.
; Each lane's count is shifted left by 20 and subtracted, with borrow from a
; zero low dword, from 0x3ff00000 (the high dword of the double 1.0), i.e. the
; result is produced by integer arithmetic on the constant's bit pattern.
; The x86 assertions that used to sit here were removed: no RUN line in this
; file enables their prefixes, so they were never checked.
1966 define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
2027 ; VI-LABEL: fdiv_pow_shl_cnt_vec:
2029 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2030 ; VI-NEXT: v_lshlrev_b32_e32 v1, 20, v0
2031 ; VI-NEXT: v_mov_b32_e32 v3, 0x3ff00000
2032 ; VI-NEXT: v_sub_u32_e64 v0, vcc, 0, 0
2033 ; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2034 ; VI-NEXT: v_subb_u32_e64 v1, s[4:5], v3, v1, vcc
2035 ; VI-NEXT: v_subb_u32_e32 v3, vcc, v3, v2, vcc
2036 ; VI-NEXT: v_mov_b32_e32 v2, v0
2037 ; VI-NEXT: s_setpc_b64 s[30:31]
2039 ; GFX10-LABEL: fdiv_pow_shl_cnt_vec:
2041 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2042 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 20, v0
2043 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2044 ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0
2045 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s4, 0x3ff00000, v1, vcc_lo
2046 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v2, vcc_lo
2047 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
2048 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2050 ; GFX11-LABEL: fdiv_pow_shl_cnt_vec:
2052 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2053 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 20, v0
2054 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2055 ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0
2056 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2057 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, s0, 0x3ff00000, v1, vcc_lo
2058 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v2, vcc_lo
2059 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
2060 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
2061 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2062 %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
2063 %conv = uitofp <2 x i64> %shl to <2 x double>
2064 %mul = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %conv
2065 ret <2 x double> %mul
; Divides 1.0 by uitofp(1 << %cnt) for a <2 x i64> count, producing
; <2 x float>; the shl carries nuw. The VI/GFX10/GFX11 assertions below expect
; only a left shift of each count by 23 followed by an integer subtract from
; the 1.0 operand, with no conversion or divide instructions.
; NOTE(review) - the CHECK-SSE / CHECK-AVX2 / CHECK-NO-FASTFMA / CHECK-FMA
; assertions use prefixes that none of the RUN lines at the top of the file
; enable (those pass VI, GFX10 and GFX11 only), so they look like unused
; leftovers; confirm before relying on them.
2068 define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nounwind {
2069 ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
2070 ; CHECK-SSE: # %bb.0:
2071 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2072 ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
2073 ; CHECK-SSE-NEXT: movdqa %xmm3, %xmm2
2074 ; CHECK-SSE-NEXT: psllq %xmm1, %xmm2
2075 ; CHECK-SSE-NEXT: psllq %xmm0, %xmm3
2076 ; CHECK-SSE-NEXT: movq %xmm3, %rax
2077 ; CHECK-SSE-NEXT: testq %rax, %rax
2078 ; CHECK-SSE-NEXT: js .LBB15_1
2079 ; CHECK-SSE-NEXT: # %bb.2:
2080 ; CHECK-SSE-NEXT: xorps %xmm1, %xmm1
2081 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
2082 ; CHECK-SSE-NEXT: jmp .LBB15_3
2083 ; CHECK-SSE-NEXT: .LBB15_1:
2084 ; CHECK-SSE-NEXT: movq %rax, %rcx
2085 ; CHECK-SSE-NEXT: shrq %rcx
2086 ; CHECK-SSE-NEXT: andl $1, %eax
2087 ; CHECK-SSE-NEXT: orq %rcx, %rax
2088 ; CHECK-SSE-NEXT: xorps %xmm1, %xmm1
2089 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
2090 ; CHECK-SSE-NEXT: addss %xmm1, %xmm1
2091 ; CHECK-SSE-NEXT: .LBB15_3:
2092 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
2093 ; CHECK-SSE-NEXT: movq %xmm0, %rax
2094 ; CHECK-SSE-NEXT: testq %rax, %rax
2095 ; CHECK-SSE-NEXT: js .LBB15_4
2096 ; CHECK-SSE-NEXT: # %bb.5:
2097 ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
2098 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
2099 ; CHECK-SSE-NEXT: jmp .LBB15_6
2100 ; CHECK-SSE-NEXT: .LBB15_4:
2101 ; CHECK-SSE-NEXT: movq %rax, %rcx
2102 ; CHECK-SSE-NEXT: shrq %rcx
2103 ; CHECK-SSE-NEXT: andl $1, %eax
2104 ; CHECK-SSE-NEXT: orq %rcx, %rax
2105 ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
2106 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
2107 ; CHECK-SSE-NEXT: addss %xmm0, %xmm0
2108 ; CHECK-SSE-NEXT: .LBB15_6:
2109 ; CHECK-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2110 ; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = <1.0E+0,1.0E+0,u,u>
2111 ; CHECK-SSE-NEXT: divps %xmm1, %xmm0
2112 ; CHECK-SSE-NEXT: retq
2114 ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
2115 ; CHECK-AVX2: # %bb.0:
2116 ; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
2117 ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
2118 ; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
2119 ; CHECK-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm2
2120 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
2121 ; CHECK-AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
2122 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm1, %rax
2123 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
2124 ; CHECK-AVX2-NEXT: vmovq %xmm1, %rax
2125 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
2126 ; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
2127 ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2
2128 ; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
2129 ; CHECK-AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
2130 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
2131 ; CHECK-AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
2132 ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
2133 ; CHECK-AVX2-NEXT: vdivps %xmm0, %xmm1, %xmm0
2134 ; CHECK-AVX2-NEXT: retq
2136 ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
2137 ; CHECK-NO-FASTFMA: # %bb.0:
2138 ; CHECK-NO-FASTFMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
2139 ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
2140 ; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax
2141 ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm1
2142 ; CHECK-NO-FASTFMA-NEXT: vmovq %xmm0, %rax
2143 ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0
2144 ; CHECK-NO-FASTFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
2145 ; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
2146 ; CHECK-NO-FASTFMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
2147 ; CHECK-NO-FASTFMA-NEXT: retq
2149 ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
2150 ; CHECK-FMA: # %bb.0:
2151 ; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
2152 ; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
2153 ; CHECK-FMA-NEXT: vcvtuqq2ps %xmm0, %xmm0
2154 ; CHECK-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
2155 ; CHECK-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
2156 ; CHECK-FMA-NEXT: retq
2157 ; VI-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
2159 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2160 ; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
2161 ; VI-NEXT: v_lshlrev_b32_e32 v1, 23, v2
2162 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 1.0, v0
2163 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 1.0, v1
2164 ; VI-NEXT: s_setpc_b64 s[30:31]
2166 ; GFX10-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
2168 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2169 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
2170 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 23, v2
2171 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 1.0, v0
2172 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 1.0, v1
2173 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2175 ; GFX11-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
2177 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2178 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
2179 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 23, v2
2180 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2181 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 1.0, v0
2182 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 1.0, v1
2183 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test - power-of-two divisor built as 1 << cnt (nuw), converted
; unsigned from i64 to float, then used as the denominator of 1.0 / x.
2184 %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
2185 %conv = uitofp <2 x i64> %shl to <2 x float>
2186 %mul = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %conv
2187 ret <2 x float> %mul
; Divides -9.0 by uitofp(8 << %cnt) for an i64 count; the shl has no nuw/nsw
; flags. The VI/GFX10/GFX11 assertions below expect the full sequence - a
; 64-bit shift, an unsigned i64-to-f32 conversion (ffbh/clz, min, shift, cvt,
; ldexp) and an expanded f32 division (div_scale, rcp, fma, div_fmas,
; div_fixup) against 0xc1100000, the f32 encoding of -9.0.
; NOTE(review) - "fail_maybe_z" presumably means the fold is rejected because
; the unflagged shift may produce zero; confirm against the combine's
; preconditions.
2190 define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
2191 ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
2192 ; CHECK-SSE: # %bb.0:
2193 ; CHECK-SSE-NEXT: movq %rdi, %rcx
2194 ; CHECK-SSE-NEXT: movl $8, %eax
2195 ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
2196 ; CHECK-SSE-NEXT: shlq %cl, %rax
2197 ; CHECK-SSE-NEXT: testq %rax, %rax
2198 ; CHECK-SSE-NEXT: js .LBB16_1
2199 ; CHECK-SSE-NEXT: # %bb.2:
2200 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
2201 ; CHECK-SSE-NEXT: jmp .LBB16_3
2202 ; CHECK-SSE-NEXT: .LBB16_1:
2203 ; CHECK-SSE-NEXT: shrq %rax
2204 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
2205 ; CHECK-SSE-NEXT: addss %xmm1, %xmm1
2206 ; CHECK-SSE-NEXT: .LBB16_3:
2207 ; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2208 ; CHECK-SSE-NEXT: divss %xmm1, %xmm0
2209 ; CHECK-SSE-NEXT: retq
2211 ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
2212 ; CHECK-AVX2: # %bb.0:
2213 ; CHECK-AVX2-NEXT: movq %rdi, %rcx
2214 ; CHECK-AVX2-NEXT: movl $8, %eax
2215 ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
2216 ; CHECK-AVX2-NEXT: shlq %cl, %rax
2217 ; CHECK-AVX2-NEXT: testq %rax, %rax
2218 ; CHECK-AVX2-NEXT: js .LBB16_1
2219 ; CHECK-AVX2-NEXT: # %bb.2:
2220 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
2221 ; CHECK-AVX2-NEXT: jmp .LBB16_3
2222 ; CHECK-AVX2-NEXT: .LBB16_1:
2223 ; CHECK-AVX2-NEXT: shrq %rax
2224 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
2225 ; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
2226 ; CHECK-AVX2-NEXT: .LBB16_3:
2227 ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2228 ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
2229 ; CHECK-AVX2-NEXT: retq
2231 ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
2232 ; CHECK-NO-FASTFMA: # %bb.0:
2233 ; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
2234 ; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
2235 ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
2236 ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
2237 ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0
2238 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2239 ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
2240 ; CHECK-NO-FASTFMA-NEXT: retq
2242 ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
2243 ; CHECK-FMA: # %bb.0:
2244 ; CHECK-FMA-NEXT: movl $8, %eax
2245 ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
2246 ; CHECK-FMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0
2247 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2248 ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
2249 ; CHECK-FMA-NEXT: retq
2250 ; VI-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
2252 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2253 ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8
2254 ; VI-NEXT: s_mov_b32 s6, 0xc1100000
2255 ; VI-NEXT: v_ffbh_u32_e32 v2, v1
2256 ; VI-NEXT: v_min_u32_e32 v2, 32, v2
2257 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
2258 ; VI-NEXT: v_min_u32_e32 v0, 1, v0
2259 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
2260 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
2261 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
2262 ; VI-NEXT: v_ldexp_f32 v0, v0, v1
2263 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6
2264 ; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
2265 ; VI-NEXT: v_rcp_f32_e32 v3, v1
2266 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
2267 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
2268 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
2269 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
2270 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
2271 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
2272 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
2273 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6
2274 ; VI-NEXT: s_setpc_b64 s[30:31]
2276 ; GFX10-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
2278 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2279 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8
2280 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
2281 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
2282 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
2283 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
2284 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
2285 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
2286 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
2287 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
2288 ; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0xc1100000
2289 ; GFX10-NEXT: v_rcp_f32_e32 v2, v1
2290 ; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
2291 ; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
2292 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
2293 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2
2294 ; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3
2295 ; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2
2296 ; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3
2297 ; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4
2298 ; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000
2299 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2301 ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
2303 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2304 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8
2305 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2306 ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1
2307 ; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
2308 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2309 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
2310 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
2311 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2312 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
2313 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
2314 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
2315 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2316 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
2317 ; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0xc1100000
2318 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2319 ; GFX11-NEXT: v_rcp_f32_e32 v2, v1
2320 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2321 ; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
2322 ; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
2323 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
2324 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2325 ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2
2326 ; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3
2327 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2328 ; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2
2329 ; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3
2330 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2331 ; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4
2332 ; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000
2333 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test - the shift base is 8 and the shl is unflagged; the result is
; converted unsigned and used as the denominator of -9.0 / x.
2334 %shl = shl i64 8, %cnt
2335 %conv = uitofp i64 %shl to float
2336 %mul = fdiv float -9.000000e+00, %conv
; Divides -9.0 by sitofp(8 << %cnt) for an i64 count; the shl is unflagged
; and the conversion is signed. The VI/GFX10/GFX11 assertions below expect
; the full sequence - a 64-bit shift, a signed i64-to-f32 conversion
; (ffbh_i32/cls, cvt_f32_i32, ldexp) and an expanded f32 division
; (div_scale, rcp, fma, div_fmas, div_fixup) against 0xc1100000 (-9.0).
; NOTE(review) - "fail_neg_int" presumably means the fold is rejected because
; the shifted value may be negative under the signed conversion; confirm.
2340 define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
2341 ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
2342 ; CHECK-SSE: # %bb.0:
2343 ; CHECK-SSE-NEXT: movq %rdi, %rcx
2344 ; CHECK-SSE-NEXT: movl $8, %eax
2345 ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
2346 ; CHECK-SSE-NEXT: shlq %cl, %rax
2347 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
2348 ; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2349 ; CHECK-SSE-NEXT: divss %xmm1, %xmm0
2350 ; CHECK-SSE-NEXT: retq
2352 ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
2353 ; CHECK-AVX2: # %bb.0:
2354 ; CHECK-AVX2-NEXT: movq %rdi, %rcx
2355 ; CHECK-AVX2-NEXT: movl $8, %eax
2356 ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
2357 ; CHECK-AVX2-NEXT: shlq %cl, %rax
2358 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
2359 ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2360 ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
2361 ; CHECK-AVX2-NEXT: retq
2363 ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
2364 ; CHECK-NO-FASTFMA: # %bb.0:
2365 ; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
2366 ; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
2367 ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
2368 ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
2369 ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
2370 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2371 ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
2372 ; CHECK-NO-FASTFMA-NEXT: retq
2374 ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
2375 ; CHECK-FMA: # %bb.0:
2376 ; CHECK-FMA-NEXT: movl $8, %eax
2377 ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
2378 ; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
2379 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2380 ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
2381 ; CHECK-FMA-NEXT: retq
2382 ; VI-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
2384 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2385 ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8
2386 ; VI-NEXT: s_mov_b32 s6, 0xc1100000
2387 ; VI-NEXT: v_xor_b32_e32 v2, v0, v1
2388 ; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v2
2389 ; VI-NEXT: v_ffbh_i32_e32 v3, v1
2390 ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
2391 ; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3
2392 ; VI-NEXT: v_min_u32_e32 v2, v3, v2
2393 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
2394 ; VI-NEXT: v_min_u32_e32 v0, 1, v0
2395 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
2396 ; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
2397 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
2398 ; VI-NEXT: v_ldexp_f32 v0, v0, v1
2399 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6
2400 ; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
2401 ; VI-NEXT: v_rcp_f32_e32 v3, v1
2402 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
2403 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
2404 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
2405 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
2406 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
2407 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
2408 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
2409 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6
2410 ; VI-NEXT: s_setpc_b64 s[30:31]
2412 ; GFX10-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
2414 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2415 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8
2416 ; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1
2417 ; GFX10-NEXT: v_ffbh_i32_e32 v3, v1
2418 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2
2419 ; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3
2420 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2
2421 ; GFX10-NEXT: v_min_u32_e32 v2, v3, v2
2422 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
2423 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
2424 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
2425 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
2426 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
2427 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
2428 ; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0xc1100000
2429 ; GFX10-NEXT: v_rcp_f32_e32 v2, v1
2430 ; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
2431 ; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
2432 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
2433 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2
2434 ; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3
2435 ; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2
2436 ; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3
2437 ; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4
2438 ; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000
2439 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2441 ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
2443 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2444 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8
2445 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2446 ; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1
2447 ; GFX11-NEXT: v_cls_i32_e32 v3, v1
2448 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2
2449 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2450 ; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
2451 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2
2452 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2453 ; GFX11-NEXT: v_min_u32_e32 v2, v3, v2
2454 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
2455 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2456 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
2457 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
2458 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
2459 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2460 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
2461 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
2462 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2463 ; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0xc1100000
2464 ; GFX11-NEXT: v_rcp_f32_e32 v2, v1
2465 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2466 ; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
2467 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2468 ; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
2469 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
2470 ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2
2471 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2472 ; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3
2473 ; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2
2474 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2475 ; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3
2476 ; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4
2477 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2478 ; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000
2479 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test - same shape as the unsigned variant, but the shifted value is
; converted with sitofp before being used as the denominator of -9.0 / x.
2480 %shl = shl i64 8, %cnt
2481 %conv = sitofp i64 %shl to float
2482 %mul = fdiv float -9.000000e+00, %conv
; Divides -0.5 by sitofp(8 << (%cnt_in & 31)) for an i64 input; the count is
; masked to the range 0..31 before the unflagged shl. The VI/GFX10/GFX11
; assertions below still expect the mask, a 64-bit shift, a signed
; i64-to-f32 conversion and an expanded f32 division (div_scale, rcp, fma,
; div_fmas, div_fixup) with -0.5 as an inline operand.
2486 define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
2487 ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt:
2488 ; CHECK-SSE: # %bb.0:
2489 ; CHECK-SSE-NEXT: movq %rdi, %rcx
2490 ; CHECK-SSE-NEXT: andb $31, %cl
2491 ; CHECK-SSE-NEXT: movl $8, %eax
2492 ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
2493 ; CHECK-SSE-NEXT: shlq %cl, %rax
2494 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
2495 ; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2496 ; CHECK-SSE-NEXT: divss %xmm1, %xmm0
2497 ; CHECK-SSE-NEXT: retq
2499 ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt:
2500 ; CHECK-AVX2: # %bb.0:
2501 ; CHECK-AVX2-NEXT: movq %rdi, %rcx
2502 ; CHECK-AVX2-NEXT: andb $31, %cl
2503 ; CHECK-AVX2-NEXT: movl $8, %eax
2504 ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
2505 ; CHECK-AVX2-NEXT: shlq %cl, %rax
2506 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
2507 ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2508 ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
2509 ; CHECK-AVX2-NEXT: retq
2511 ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt:
2512 ; CHECK-NO-FASTFMA: # %bb.0:
2513 ; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
2514 ; CHECK-NO-FASTFMA-NEXT: andb $31, %cl
2515 ; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
2516 ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
2517 ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
2518 ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
2519 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2520 ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
2521 ; CHECK-NO-FASTFMA-NEXT: retq
2523 ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt:
2524 ; CHECK-FMA: # %bb.0:
2525 ; CHECK-FMA-NEXT: andb $31, %dil
2526 ; CHECK-FMA-NEXT: movl $8, %eax
2527 ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
2528 ; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
2529 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2530 ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
2531 ; CHECK-FMA-NEXT: retq
2532 ; VI-LABEL: fdiv_pow_shl_cnt:
2534 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2535 ; VI-NEXT: v_and_b32_e32 v0, 31, v0
2536 ; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8
2537 ; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
2538 ; VI-NEXT: v_ffbh_i32_e32 v3, v1
2539 ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
2540 ; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3
2541 ; VI-NEXT: v_min_u32_e32 v2, v3, v2
2542 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
2543 ; VI-NEXT: v_min_u32_e32 v0, 1, v0
2544 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
2545 ; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
2546 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
2547 ; VI-NEXT: v_ldexp_f32 v0, v0, v1
2548 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -0.5
2549 ; VI-NEXT: v_div_scale_f32 v2, vcc, -0.5, v0, -0.5
2550 ; VI-NEXT: v_rcp_f32_e32 v3, v1
2551 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
2552 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
2553 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
2554 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
2555 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
2556 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
2557 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
2558 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5
2559 ; VI-NEXT: s_setpc_b64 s[30:31]
2561 ; GFX10-LABEL: fdiv_pow_shl_cnt:
2563 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2564 ; GFX10-NEXT: v_and_b32_e32 v0, 31, v0
2565 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8
2566 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v0
2567 ; GFX10-NEXT: v_ffbh_i32_e32 v3, v1
2568 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2
2569 ; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3
2570 ; GFX10-NEXT: v_min_u32_e32 v2, v3, v2
2571 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
2572 ; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
2573 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
2574 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
2575 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
2576 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
2577 ; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, -0.5
2578 ; GFX10-NEXT: v_rcp_f32_e32 v2, v1
2579 ; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
2580 ; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
2581 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5
2582 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2
2583 ; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3
2584 ; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2
2585 ; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3
2586 ; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4
2587 ; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5
2588 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2590 ; GFX11-LABEL: fdiv_pow_shl_cnt:
2592 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2593 ; GFX11-NEXT: v_and_b32_e32 v0, 31, v0
2594 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2595 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8
2596 ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0
2597 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2598 ; GFX11-NEXT: v_cls_i32_e32 v3, v1
2599 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2
2600 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2601 ; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
2602 ; GFX11-NEXT: v_min_u32_e32 v2, v3, v2
2603 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2604 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
2605 ; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
2606 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2607 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
2608 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
2609 ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
2610 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2611 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
2612 ; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, -0.5
2613 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2614 ; GFX11-NEXT: v_rcp_f32_e32 v2, v1
2615 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2616 ; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
2617 ; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
2618 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5
2619 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2620 ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2
2621 ; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3
2622 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2623 ; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2
2624 ; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3
2625 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2626 ; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4
2627 ; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5
2628 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test - the mask bounds the shift amount, so 8 << cnt stays well
; inside the i64 range before the signed conversion.
2629 %cnt = and i64 %cnt_in, 31
2630 %shl = shl i64 8, %cnt
2631 %conv = sitofp i64 %shl to float
2632 %mul = fdiv float -0.500000e+00, %conv
; Divides half 0xH7000 (8192.0) by uitofp(1 << %cnt) for an i32 count
; converted to half; the shl carries nuw. The VI/GFX10/GFX11 assertions below
; expect the unfolded sequence - shift, u32-to-f32 and f32-to-f16
; conversions, an rcp-based multiply by 0x46000000 (8192.0 as f32) and a
; final v_div_fixup_f16.
; NOTE(review) - "fail_out_of_bounds" presumably refers to 1 << cnt (up to
; 2^31 for an i32 count) exceeding what half can represent; confirm.
2636 define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
2637 ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
2638 ; CHECK-SSE: # %bb.0:
2639 ; CHECK-SSE-NEXT: pushq %rax
2640 ; CHECK-SSE-NEXT: movl %edi, %ecx
2641 ; CHECK-SSE-NEXT: movl $1, %eax
2642 ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
2643 ; CHECK-SSE-NEXT: shll %cl, %eax
2644 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
2645 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
2646 ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
2647 ; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2648 ; CHECK-SSE-NEXT: divss %xmm0, %xmm1
2649 ; CHECK-SSE-NEXT: movaps %xmm1, %xmm0
2650 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
2651 ; CHECK-SSE-NEXT: popq %rax
2652 ; CHECK-SSE-NEXT: retq
2654 ; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
2655 ; CHECK-AVX2: # %bb.0:
2656 ; CHECK-AVX2-NEXT: pushq %rax
2657 ; CHECK-AVX2-NEXT: movl %edi, %ecx
2658 ; CHECK-AVX2-NEXT: movl $1, %eax
2659 ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
2660 ; CHECK-AVX2-NEXT: shll %cl, %eax
2661 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
2662 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
2663 ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
2664 ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2665 ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
2666 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
2667 ; CHECK-AVX2-NEXT: popq %rax
2668 ; CHECK-AVX2-NEXT: retq
2670 ; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
2671 ; CHECK-NO-FASTFMA: # %bb.0:
2672 ; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
2673 ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
2674 ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
2675 ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
2676 ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
2677 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2678 ; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2679 ; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0
2680 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2681 ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
2682 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2683 ; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax
2684 ; CHECK-NO-FASTFMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
2685 ; CHECK-NO-FASTFMA-NEXT: retq
2687 ; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
2688 ; CHECK-FMA: # %bb.0:
2689 ; CHECK-FMA-NEXT: movl $1, %eax
2690 ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
2691 ; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
2692 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2693 ; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2694 ; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0
2695 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2696 ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
2697 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2698 ; CHECK-FMA-NEXT: vmovd %xmm0, %eax
2699 ; CHECK-FMA-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
2700 ; CHECK-FMA-NEXT: retq
2701 ; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
2703 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2704 ; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1
2705 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
2706 ; VI-NEXT: s_movk_i32 s4, 0x7000
2707 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
2708 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
2709 ; VI-NEXT: v_rcp_f32_e32 v1, v1
2710 ; VI-NEXT: v_mul_f32_e32 v1, 0x46000000, v1
2711 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
2712 ; VI-NEXT: v_div_fixup_f16 v0, v1, v0, s4
2713 ; VI-NEXT: s_setpc_b64 s[30:31]
2715 ; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
2717 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2718 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1
2719 ; GFX10-NEXT: s_mov_b32 s4, 0x46000000
2720 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
2721 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
2722 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
2723 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1
2724 ; GFX10-NEXT: v_fma_mixlo_f16 v1, v1, s4, 0
2725 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000
2726 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2728 ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
2730 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2731 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1
2732 ; GFX11-NEXT: s_mov_b32 s0, 0x46000000
2733 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2734 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
2735 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
2736 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2737 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
2738 ; GFX11-NEXT: v_rcp_f32_e32 v1, v1
2739 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2740 ; GFX11-NEXT: v_fma_mixlo_f16 v1, v1, s0, 0
2741 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2742 ; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000
2743 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test - an i32 power of two converted unsigned to half and used as
; the denominator of 0xH7000 / x.
2744 %shl = shl nuw i32 1, %cnt
2745 %conv = uitofp i32 %shl to half
2746 %mul = fdiv half 0xH7000, %conv
; Divides the half constant 0xH7000 by uitofp(1 << %cnt), with an i16 shift.
; The expected AMDGPU output does the division in the integer domain: %cnt is
; shifted left by 10 and the result is subtracted from the constant's bit
; pattern 0x7000.
2750 define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind {
2819 ; VI-LABEL: fdiv_pow_shl_cnt_in_bounds:
2821 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2822 ; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0
2823 ; VI-NEXT: v_sub_u16_e32 v0, 0x7000, v0
2824 ; VI-NEXT: s_setpc_b64 s[30:31]
2826 ; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds:
2828 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2829 ; GFX10-NEXT: v_lshlrev_b16 v0, 10, v0
2830 ; GFX10-NEXT: v_sub_nc_u16 v0, 0x7000, v0
2831 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2833 ; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds:
2835 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2836 ; GFX11-NEXT: v_lshlrev_b16 v0, 10, v0
2837 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2838 ; GFX11-NEXT: v_sub_nc_u16 v0, 0x7000, v0
2839 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2840 %shl = shl nuw i16 1, %cnt
2841 %conv = uitofp i16 %shl to half
2842 %mul = fdiv half 0xH7000, %conv
; Same pattern as the 0xH7000 case but with the half constant 0xH4800 as the
; dividend. The expected AMDGPU output again folds the division into a shift
; of %cnt left by 10 followed by an integer subtract from 0x4800.
2846 define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind {
2915 ; VI-LABEL: fdiv_pow_shl_cnt_in_bounds2:
2917 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2918 ; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0
2919 ; VI-NEXT: v_sub_u16_e32 v0, 0x4800, v0
2920 ; VI-NEXT: s_setpc_b64 s[30:31]
2922 ; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds2:
2924 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2925 ; GFX10-NEXT: v_lshlrev_b16 v0, 10, v0
2926 ; GFX10-NEXT: v_sub_nc_u16 v0, 0x4800, v0
2927 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2929 ; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds2:
2931 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2932 ; GFX11-NEXT: v_lshlrev_b16 v0, 10, v0
2933 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2934 ; GFX11-NEXT: v_sub_nc_u16 v0, 0x4800, v0
2935 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2936 %shl = shl nuw i16 1, %cnt
2937 %conv = uitofp i16 %shl to half
2938 %mul = fdiv half 0xH4800, %conv
; Negative test: the dividend is the half constant 0xH4000 and the expected
; AMDGPU output is NOT folded to integer arithmetic. It keeps the variable
; shift of 1, the u16-to-f16 conversion, and the reciprocal-based division
; sequence ending in v_div_fixup_f16.
2942 define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
3011 ; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
3013 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3014 ; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1
3015 ; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
3016 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
3017 ; VI-NEXT: v_rcp_f32_e32 v1, v1
3018 ; VI-NEXT: v_add_f32_e32 v1, v1, v1
3019 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
3020 ; VI-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
3021 ; VI-NEXT: s_setpc_b64 s[30:31]
3023 ; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
3025 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3026 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1
3027 ; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0
3028 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
3029 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1
3030 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v1
3031 ; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
3032 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
3033 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3035 ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
3037 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3038 ; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1
3039 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3040 ; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
3041 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
3042 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
3043 ; GFX11-NEXT: v_rcp_f32_e32 v1, v1
3044 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3045 ; GFX11-NEXT: v_add_f32_e32 v1, v1, v1
3046 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
3047 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3048 ; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
3049 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3050 %shl = shl nuw i16 1, %cnt
3051 %conv = uitofp i16 %shl to half
3052 %mul = fdiv half 0xH4000, %conv
; Divides the double constant 0x36A0000000000000 by uitofp(1 << %cnt), with an
; i32 shift. The expected AMDGPU output folds the division into integer
; arithmetic on the 64-bit pattern: %cnt is shifted left by 20 and subtracted
; (with borrow) from the high word 0x36a00000, while the low word is 0.
3056 define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind {
3098 ; VI-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
3100 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3101 ; VI-NEXT: v_lshlrev_b32_e32 v0, 20, v0
3102 ; VI-NEXT: v_mov_b32_e32 v1, 0x36a00000
3103 ; VI-NEXT: v_sub_u32_e64 v2, vcc, 0, 0
3104 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v0, vcc
3105 ; VI-NEXT: v_mov_b32_e32 v0, 0
3106 ; VI-NEXT: s_setpc_b64 s[30:31]
3108 ; GFX10-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
3110 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3111 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 20, v0
3112 ; GFX10-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0
3113 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
3114 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3115 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3117 ; GFX11-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
3119 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3120 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 20, v0
3121 ; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0
3122 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
3123 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
3124 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3125 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3126 %shl = shl nuw i32 1, %cnt
3127 %conv = uitofp i32 %shl to double
3128 %mul = fdiv double 0x36A0000000000000, %conv
; Negative test: the dividend is the float constant 0x3a1fffff00000000 (which
; appears as the 32-bit literal 0x10fffff8 in the expected output). The
; expected AMDGPU output is NOT folded to integer arithmetic. It keeps the
; variable shift of 1, the u32-to-f32 conversion, and the full
; v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup division expansion.
3132 define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
3174 ; VI-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
3176 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3177 ; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1
3178 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
3179 ; VI-NEXT: s_mov_b32 s6, 0x10fffff8
3180 ; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6
3181 ; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
3182 ; VI-NEXT: v_rcp_f32_e32 v3, v1
3183 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
3184 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
3185 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
3186 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
3187 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
3188 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
3189 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
3190 ; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6
3191 ; VI-NEXT: s_setpc_b64 s[30:31]
3193 ; GFX10-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
3195 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3196 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1
3197 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
3198 ; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0x10fffff8
3199 ; GFX10-NEXT: v_rcp_f32_e32 v2, v1
3200 ; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
3201 ; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
3202 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8
3203 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2
3204 ; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3
3205 ; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2
3206 ; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3
3207 ; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4
3208 ; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0x10fffff8
3209 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3211 ; GFX11-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
3213 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3214 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1
3215 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3216 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
3217 ; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x10fffff8
3218 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
3219 ; GFX11-NEXT: v_rcp_f32_e32 v2, v1
3220 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3221 ; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
3222 ; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
3223 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8
3224 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3225 ; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2
3226 ; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3
3227 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3228 ; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2
3229 ; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3
3230 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3231 ; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4
3232 ; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0x10fffff8
3233 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3234 %shl = shl nuw i32 1, %cnt
3235 %conv = uitofp i32 %shl to float
3236 %mul = fdiv float 0x3a1fffff00000000, %conv
; Divides the float constant 0x3a20000000000000 by uitofp(1 << %cnt), with an
; i32 shift. The expected AMDGPU output folds the division into integer
; arithmetic: %cnt is shifted left by 23 and subtracted from the 32-bit
; pattern 0x11000000.
3240 define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind {
3282 ; VI-LABEL: fdiv_pow_shl_cnt32_okay:
3284 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3285 ; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
3286 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x11000000, v0
3287 ; VI-NEXT: s_setpc_b64 s[30:31]
3289 ; GFX10-LABEL: fdiv_pow_shl_cnt32_okay:
3291 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3292 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
3293 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x11000000, v0
3294 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3296 ; GFX11-LABEL: fdiv_pow_shl_cnt32_okay:
3298 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3299 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
3300 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3301 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x11000000, v0
3302 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3303 %shl = shl nuw i32 1, %cnt
3304 %conv = uitofp i32 %shl to float
3305 %mul = fdiv float 0x3a20000000000000, %conv