1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
3 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
4 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX
6 ; Verify that we fold loads into unary SSE intrinsics only when optimizing for size.
; rcpss: scalar float load placed in lane 0 of an undef <4 x float>, passed to
; llvm.x86.sse.rcp.ss, with lane 0 of the result extracted.
; The function carries no optsize attribute and no !prof metadata; the expected
; assembly keeps the load as a separate (v)movss and uses the register form of
; (v)rcpss, i.e. the load is not folded.
8 define float @rcpss(float* %a) {
11 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
12 ; SSE-NEXT: rcpss %xmm0, %xmm0
17 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
18 ; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0
20 %ld = load float, float* %a
21 %ins = insertelement <4 x float> undef, float %ld, i32 0
22 %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
23 %ext = extractelement <4 x float> %res, i32 0
; rsqrtss: scalar float load placed in lane 0 of an undef <4 x float>, passed to
; llvm.x86.sse.rsqrt.ss, with lane 0 of the result extracted.
; No optsize attribute and no !prof metadata; the expected assembly keeps the
; load as a separate (v)movss and uses the register form of (v)rsqrtss.
27 define float @rsqrtss(float* %a) {
30 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
31 ; SSE-NEXT: rsqrtss %xmm0, %xmm0
36 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
37 ; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
39 %ld = load float, float* %a
40 %ins = insertelement <4 x float> undef, float %ld, i32 0
41 %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
42 %ext = extractelement <4 x float> %res, i32 0
; sqrtss: scalar float load placed in lane 0 of an undef <4 x float>, passed to
; llvm.x86.sse.sqrt.ss, with lane 0 of the result extracted.
; No optsize attribute and no !prof metadata; the expected assembly keeps the
; load as a separate (v)movss and uses the register form of (v)sqrtss.
46 define float @sqrtss(float* %a) {
49 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
50 ; SSE-NEXT: sqrtss %xmm0, %xmm0
55 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
56 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
58 %ld = load float, float* %a
59 %ins = insertelement <4 x float> undef, float %ld, i32 0
60 %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
61 %ext = extractelement <4 x float> %res, i32 0
; sqrtsd: scalar double load placed in lane 0 of an undef <2 x double>, passed
; to llvm.x86.sse2.sqrt.sd, with lane 0 of the result extracted.
; No optsize attribute and no !prof metadata; the expected assembly keeps the
; load as a separate (v)movsd and uses the register form of (v)sqrtsd.
65 define double @sqrtsd(double* %a) {
68 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
69 ; SSE-NEXT: sqrtsd %xmm0, %xmm0
74 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
75 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
77 %ld = load double, double* %a
78 %ins = insertelement <2 x double> undef, double %ld, i32 0
79 %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
80 %ext = extractelement <2 x double> %res, i32 0
; rcpss_size: same IR shape as a scalar load feeding llvm.x86.sse.rcp.ss, but
; the function is marked optsize. The expected assembly uses the memory-operand
; form of (v)rcpss reading (%rdi), i.e. the scalar load is folded.
84 define float @rcpss_size(float* %a) optsize {
85 ; SSE-LABEL: rcpss_size:
87 ; SSE-NEXT: rcpss (%rdi), %xmm0
90 ; AVX-LABEL: rcpss_size:
92 ; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
94 %ld = load float, float* %a
95 %ins = insertelement <4 x float> undef, float %ld, i32 0
96 %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
97 %ext = extractelement <4 x float> %res, i32 0
; rcpss_full_size: optsize function that loads a full <4 x float> and passes it
; directly to llvm.x86.sse.rcp.ss (no insertelement/extractelement).
; The expected assembly uses the memory-operand form of (v)rcpss on (%rdi).
101 define <4 x float> @rcpss_full_size(<4 x float>* %a) optsize {
102 ; SSE-LABEL: rcpss_full_size:
104 ; SSE-NEXT: rcpss (%rdi), %xmm0
107 ; AVX-LABEL: rcpss_full_size:
109 ; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
111 %ld = load <4 x float>, <4 x float>* %a
112 %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
; rcpss_pgso: scalar load feeding llvm.x86.sse.rcp.ss in a function without
; optsize but with !prof !14 (a function_entry_count of 0, defined in this file).
; The expected assembly folds the load into (v)rcpss as a (%rdi) memory operand.
; NOTE(review): "pgso" presumably stands for profile-guided size optimization -- confirm.
116 define float @rcpss_pgso(float* %a) !prof !14 {
117 ; SSE-LABEL: rcpss_pgso:
119 ; SSE-NEXT: rcpss (%rdi), %xmm0
122 ; AVX-LABEL: rcpss_pgso:
124 ; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
126 %ld = load float, float* %a
127 %ins = insertelement <4 x float> undef, float %ld, i32 0
128 %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
129 %ext = extractelement <4 x float> %res, i32 0
; rcpss_full_pgso: full <4 x float> load passed directly to
; llvm.x86.sse.rcp.ss in a function carrying !prof !14 (function_entry_count
; of 0, defined in this file) and no optsize attribute.
; The expected assembly uses the memory-operand form of (v)rcpss on (%rdi).
133 define <4 x float> @rcpss_full_pgso(<4 x float>* %a) !prof !14 {
134 ; SSE-LABEL: rcpss_full_pgso:
136 ; SSE-NEXT: rcpss (%rdi), %xmm0
139 ; AVX-LABEL: rcpss_full_pgso:
141 ; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
143 %ld = load <4 x float>, <4 x float>* %a
144 %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
; rsqrtss_size: scalar load feeding llvm.x86.sse.rsqrt.ss in an optsize
; function. The expected assembly uses the memory-operand form of (v)rsqrtss
; reading (%rdi), i.e. the scalar load is folded.
148 define float @rsqrtss_size(float* %a) optsize {
149 ; SSE-LABEL: rsqrtss_size:
151 ; SSE-NEXT: rsqrtss (%rdi), %xmm0
154 ; AVX-LABEL: rsqrtss_size:
156 ; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
158 %ld = load float, float* %a
159 %ins = insertelement <4 x float> undef, float %ld, i32 0
160 %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
161 %ext = extractelement <4 x float> %res, i32 0
; rsqrtss_full_size: optsize function that loads a full <4 x float> and passes
; it directly to llvm.x86.sse.rsqrt.ss.
; The expected assembly uses the memory-operand form of (v)rsqrtss on (%rdi).
165 define <4 x float> @rsqrtss_full_size(<4 x float>* %a) optsize {
166 ; SSE-LABEL: rsqrtss_full_size:
168 ; SSE-NEXT: rsqrtss (%rdi), %xmm0
171 ; AVX-LABEL: rsqrtss_full_size:
173 ; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
175 %ld = load <4 x float>, <4 x float>* %a
176 %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
; rsqrtss_pgso: scalar load feeding llvm.x86.sse.rsqrt.ss in a function without
; optsize but with !prof !14 (a function_entry_count of 0, defined in this file).
; The expected assembly folds the load into (v)rsqrtss as a (%rdi) memory operand.
180 define float @rsqrtss_pgso(float* %a) !prof !14 {
181 ; SSE-LABEL: rsqrtss_pgso:
183 ; SSE-NEXT: rsqrtss (%rdi), %xmm0
186 ; AVX-LABEL: rsqrtss_pgso:
188 ; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
190 %ld = load float, float* %a
191 %ins = insertelement <4 x float> undef, float %ld, i32 0
192 %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
193 %ext = extractelement <4 x float> %res, i32 0
; rsqrtss_full_pgso: full <4 x float> load passed directly to
; llvm.x86.sse.rsqrt.ss in a function carrying !prof !14 (function_entry_count
; of 0, defined in this file) and no optsize attribute.
; The expected assembly uses the memory-operand form of (v)rsqrtss on (%rdi).
197 define <4 x float> @rsqrtss_full_pgso(<4 x float>* %a) !prof !14 {
198 ; SSE-LABEL: rsqrtss_full_pgso:
200 ; SSE-NEXT: rsqrtss (%rdi), %xmm0
203 ; AVX-LABEL: rsqrtss_full_pgso:
205 ; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
207 %ld = load <4 x float>, <4 x float>* %a
208 %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
; sqrtss_size: scalar load feeding llvm.x86.sse.sqrt.ss in an optsize function.
; The expected assembly uses the memory-operand form of (v)sqrtss reading
; (%rdi), i.e. the scalar load is folded.
212 define float @sqrtss_size(float* %a) optsize{
213 ; SSE-LABEL: sqrtss_size:
215 ; SSE-NEXT: sqrtss (%rdi), %xmm0
218 ; AVX-LABEL: sqrtss_size:
220 ; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
222 %ld = load float, float* %a
223 %ins = insertelement <4 x float> undef, float %ld, i32 0
224 %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
225 %ext = extractelement <4 x float> %res, i32 0
; sqrtss_full_size: optsize function that loads a full <4 x float> and passes
; it directly to llvm.x86.sse.sqrt.ss.
; Even with optsize, the expected assembly keeps a separate (v)movaps load and
; uses the register form of (v)sqrtss; the full-vector load is not folded.
229 define <4 x float> @sqrtss_full_size(<4 x float>* %a) optsize{
230 ; SSE-LABEL: sqrtss_full_size:
232 ; SSE-NEXT: movaps (%rdi), %xmm0
233 ; SSE-NEXT: sqrtss %xmm0, %xmm0
236 ; AVX-LABEL: sqrtss_full_size:
238 ; AVX-NEXT: vmovaps (%rdi), %xmm0
239 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
241 %ld = load <4 x float>, <4 x float>* %a
242 %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
; sqrtss_full_size_volatile: optsize function whose full <4 x float> load is
; volatile and is passed directly to llvm.x86.sse.sqrt.ss.
; The expected assembly keeps a separate (v)movaps load and uses the register
; form of (v)sqrtss.
246 define <4 x float> @sqrtss_full_size_volatile(<4 x float>* %a) optsize{
247 ; SSE-LABEL: sqrtss_full_size_volatile:
249 ; SSE-NEXT: movaps (%rdi), %xmm0
250 ; SSE-NEXT: sqrtss %xmm0, %xmm0
253 ; AVX-LABEL: sqrtss_full_size_volatile:
255 ; AVX-NEXT: vmovaps (%rdi), %xmm0
256 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
258 %ld = load volatile <4 x float>, <4 x float>* %a
259 %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
; sqrtss_pgso: scalar load feeding llvm.x86.sse.sqrt.ss in a function without
; optsize but with !prof !14 (a function_entry_count of 0, defined in this file).
; The expected assembly folds the load into (v)sqrtss as a (%rdi) memory operand.
263 define float @sqrtss_pgso(float* %a) !prof !14 {
264 ; SSE-LABEL: sqrtss_pgso:
266 ; SSE-NEXT: sqrtss (%rdi), %xmm0
269 ; AVX-LABEL: sqrtss_pgso:
271 ; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
273 %ld = load float, float* %a
274 %ins = insertelement <4 x float> undef, float %ld, i32 0
275 %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
276 %ext = extractelement <4 x float> %res, i32 0
; sqrtss_full_pgso: full <4 x float> load passed directly to
; llvm.x86.sse.sqrt.ss in a function carrying !prof !14 (function_entry_count
; of 0, defined in this file) and no optsize attribute.
; The expected assembly keeps a separate (v)movaps load and uses the register
; form of (v)sqrtss; the full-vector load is not folded.
280 define <4 x float> @sqrtss_full_pgso(<4 x float>* %a) !prof !14 {
281 ; SSE-LABEL: sqrtss_full_pgso:
283 ; SSE-NEXT: movaps (%rdi), %xmm0
284 ; SSE-NEXT: sqrtss %xmm0, %xmm0
287 ; AVX-LABEL: sqrtss_full_pgso:
289 ; AVX-NEXT: vmovaps (%rdi), %xmm0
290 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
292 %ld = load <4 x float>, <4 x float>* %a
293 %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
; sqrtss_full_pgso_volatile: volatile full <4 x float> load passed directly to
; llvm.x86.sse.sqrt.ss in a function carrying !prof !14 (function_entry_count
; of 0, defined in this file).
; The expected assembly keeps a separate (v)movaps load and uses the register
; form of (v)sqrtss.
297 define <4 x float> @sqrtss_full_pgso_volatile(<4 x float>* %a) !prof !14 {
298 ; SSE-LABEL: sqrtss_full_pgso_volatile:
300 ; SSE-NEXT: movaps (%rdi), %xmm0
301 ; SSE-NEXT: sqrtss %xmm0, %xmm0
304 ; AVX-LABEL: sqrtss_full_pgso_volatile:
306 ; AVX-NEXT: vmovaps (%rdi), %xmm0
307 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
309 %ld = load volatile <4 x float>, <4 x float>* %a
310 %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
; sqrtsd_size: scalar double load feeding llvm.x86.sse2.sqrt.sd in an optsize
; function. The expected assembly uses the memory-operand form of (v)sqrtsd
; reading (%rdi), i.e. the scalar load is folded.
314 define double @sqrtsd_size(double* %a) optsize {
315 ; SSE-LABEL: sqrtsd_size:
317 ; SSE-NEXT: sqrtsd (%rdi), %xmm0
320 ; AVX-LABEL: sqrtsd_size:
322 ; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
324 %ld = load double, double* %a
325 %ins = insertelement <2 x double> undef, double %ld, i32 0
326 %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
327 %ext = extractelement <2 x double> %res, i32 0
; sqrtsd_full_size: optsize function that loads a full <2 x double>, passes it
; directly to llvm.x86.sse2.sqrt.sd, and returns the vector result.
; Even with optsize, the expected assembly keeps a separate (v)movapd load and
; uses the register form of (v)sqrtsd; the full-vector load is not folded.
331 define <2 x double> @sqrtsd_full_size(<2 x double>* %a) optsize {
332 ; SSE-LABEL: sqrtsd_full_size:
334 ; SSE-NEXT: movapd (%rdi), %xmm0
335 ; SSE-NEXT: sqrtsd %xmm0, %xmm0
338 ; AVX-LABEL: sqrtsd_full_size:
340 ; AVX-NEXT: vmovapd (%rdi), %xmm0
341 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
343 %ld = load <2 x double>, <2 x double>* %a
344 %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
345 ret <2 x double> %res
; sqrtsd_full_size_volatile: optsize function whose full <2 x double> load is
; volatile; the loaded vector is passed directly to llvm.x86.sse2.sqrt.sd and
; the vector result is returned.
; The expected assembly keeps a separate (v)movapd load and uses the register
; form of (v)sqrtsd.
348 define <2 x double> @sqrtsd_full_size_volatile(<2 x double>* %a) optsize {
349 ; SSE-LABEL: sqrtsd_full_size_volatile:
351 ; SSE-NEXT: movapd (%rdi), %xmm0
352 ; SSE-NEXT: sqrtsd %xmm0, %xmm0
355 ; AVX-LABEL: sqrtsd_full_size_volatile:
357 ; AVX-NEXT: vmovapd (%rdi), %xmm0
358 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
360 %ld = load volatile <2 x double>, <2 x double>* %a
361 %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
362 ret <2 x double> %res
; sqrtsd_pgso: scalar double load feeding llvm.x86.sse2.sqrt.sd in a function
; without optsize but with !prof !14 (a function_entry_count of 0, defined in
; this file).
; The expected assembly folds the load into (v)sqrtsd as a (%rdi) memory operand.
365 define double @sqrtsd_pgso(double* %a) !prof !14 {
366 ; SSE-LABEL: sqrtsd_pgso:
368 ; SSE-NEXT: sqrtsd (%rdi), %xmm0
371 ; AVX-LABEL: sqrtsd_pgso:
373 ; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
375 %ld = load double, double* %a
376 %ins = insertelement <2 x double> undef, double %ld, i32 0
377 %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
378 %ext = extractelement <2 x double> %res, i32 0
; sqrtsd_full_pgso: full <2 x double> load passed directly to
; llvm.x86.sse2.sqrt.sd in a function carrying !prof !14 (function_entry_count
; of 0, defined in this file) and no optsize attribute; the vector result is
; returned.
; The expected assembly keeps a separate (v)movapd load and uses the register
; form of (v)sqrtsd; the full-vector load is not folded.
382 define <2 x double> @sqrtsd_full_pgso(<2 x double>* %a) !prof !14 {
383 ; SSE-LABEL: sqrtsd_full_pgso:
385 ; SSE-NEXT: movapd (%rdi), %xmm0
386 ; SSE-NEXT: sqrtsd %xmm0, %xmm0
389 ; AVX-LABEL: sqrtsd_full_pgso:
391 ; AVX-NEXT: vmovapd (%rdi), %xmm0
392 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
394 %ld = load <2 x double>, <2 x double>* %a
395 %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
396 ret <2 x double> %res
; sqrtsd_full_pgso_volatile: volatile full <2 x double> load passed directly to
; llvm.x86.sse2.sqrt.sd in a function carrying !prof !14 (function_entry_count
; of 0, defined in this file); the vector result is returned.
; The expected assembly keeps a separate (v)movapd load and uses the register
; form of (v)sqrtsd.
399 define <2 x double> @sqrtsd_full_pgso_volatile(<2 x double>* %a) !prof !14 {
400 ; SSE-LABEL: sqrtsd_full_pgso_volatile:
402 ; SSE-NEXT: movapd (%rdi), %xmm0
403 ; SSE-NEXT: sqrtsd %xmm0, %xmm0
406 ; AVX-LABEL: sqrtsd_full_pgso_volatile:
408 ; AVX-NEXT: vmovapd (%rdi), %xmm0
409 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
411 %ld = load volatile <2 x double>, <2 x double>* %a
412 %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
413 ret <2 x double> %res
; Declarations of the x86 scalar intrinsics called by the test functions in
; this file. Each takes and returns a full vector (<4 x float> or <2 x double>)
; and is declared nounwind readnone.
416 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
417 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
418 declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
419 declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
; Module-level metadata. !0 is a module flag named "ProfileSummary" whose
; payload (!1 through !13) describes an InstrProf-format summary.
; !14 is a function_entry_count of 0; it is attached via !prof to the *_pgso
; functions in this file.
421 !llvm.module.flags = !{!0}
422 !0 = !{i32 1, !"ProfileSummary", !1}
423 !1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
424 !2 = !{!"ProfileFormat", !"InstrProf"}
425 !3 = !{!"TotalCount", i64 10000}
426 !4 = !{!"MaxCount", i64 10}
427 !5 = !{!"MaxInternalCount", i64 1}
428 !6 = !{!"MaxFunctionCount", i64 1000}
429 !7 = !{!"NumCounts", i64 3}
430 !8 = !{!"NumFunctions", i64 3}
431 !9 = !{!"DetailedSummary", !10}
432 !10 = !{!11, !12, !13}
433 !11 = !{i32 10000, i64 100, i32 1}
434 !12 = !{i32 999000, i64 100, i32 1}
435 !13 = !{i32 999999, i64 1, i32 2}
; Entry count of 0, referenced as !prof !14 by the *_pgso functions.
436 !14 = !{!"function_entry_count", i64 0}