; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX

; Verify that we fold loads into unary SSE intrinsics only when optimizing for size.
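;
; At default optimization levels the scalar load stays a separate
; (v)movss/(v)movsd and the intrinsic uses its register form. The memory form
; of these instructions writes only the low element and merges the upper
; elements from the destination register, so folding the load can introduce a
; false dependency / partial-register-update stall; the backend's memory
; folding only accepts that trade-off under optsize/minsize.
;
; As a rough C-level sketch of the IR pattern exercised below (illustrative
; only, not part of the test; clang's actual lowering differs slightly):
;   float rcpss(const float *a) {
;     return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(*a)));
;   }
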
define float @rcpss(float* %a) {
; SSE-LABEL: rcpss:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    rcpss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rcpss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define float @rsqrtss(float* %a) {
; SSE-LABEL: rsqrtss:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    rsqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rsqrtss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define float @sqrtss(float* %a) {
; SSE-LABEL: sqrtss:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define double @sqrtsd(double* %a) {
; SSE-LABEL: sqrtsd:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load double, double* %a
  %ins = insertelement <2 x double> undef, double %ld, i32 0
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
  %ext = extractelement <2 x double> %res, i32 0
  ret double %ext
}

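; With optsize, the same patterns should fold the scalar load directly into
; the memory form of the instruction, e.g. 'rcpss (%rdi), %xmm0' in place of
; the movss + rcpss pair checked above.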
define float @rcpss_size(float* %a) optsize {
; SSE-LABEL: rcpss_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rcpss_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define <4 x float> @rcpss_full_size(<4 x float>* %a) optsize {
; SSE-LABEL: rcpss_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rcpss_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, <4 x float>* %a
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define float @rsqrtss_size(float* %a) optsize {
; SSE-LABEL: rsqrtss_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rsqrtss_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define <4 x float> @rsqrtss_full_size(<4 x float>* %a) optsize {
; SSE-LABEL: rsqrtss_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rsqrtss_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, <4 x float>* %a
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define float @sqrtss_size(float* %a) optsize {
; SSE-LABEL: sqrtss_size:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, float* %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

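; When the whole loaded vector feeds the intrinsic, the checks above show
; rcpss/rsqrtss still folding the load under optsize, while sqrtss/sqrtsd
; below keep an explicit movaps/movapd followed by the register-form
; instruction; the tests pin down that difference in the backend's folding
; policy.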
define <4 x float> @sqrtss_full_size(<4 x float>* %a) optsize {
; SSE-LABEL: sqrtss_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, <4 x float>* %a
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
  ret <4 x float> %res
}

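; A volatile load must remain a separate, explicit instruction regardless of
; optsize; the _volatile variants check that it is never folded into the
; arithmetic instruction.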
define <4 x float> @sqrtss_full_size_volatile(<4 x float>* %a) optsize {
; SSE-LABEL: sqrtss_full_size_volatile:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss_full_size_volatile:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load volatile <4 x float>, <4 x float>* %a
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define double @sqrtsd_size(double* %a) optsize {
; SSE-LABEL: sqrtsd_size:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load double, double* %a
  %ins = insertelement <2 x double> undef, double %ld, i32 0
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
  %ext = extractelement <2 x double> %res, i32 0
  ret double %ext
}

define <2 x double> @sqrtsd_full_size(<2 x double>* %a) optsize {
; SSE-LABEL: sqrtsd_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd (%rdi), %xmm0
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovapd (%rdi), %xmm0
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <2 x double>, <2 x double>* %a
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
  ret <2 x double> %res
}

define <2 x double> @sqrtsd_full_size_volatile(<2 x double>* %a) optsize {
; SSE-LABEL: sqrtsd_full_size_volatile:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd (%rdi), %xmm0
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd_full_size_volatile:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovapd (%rdi), %xmm0
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load volatile <2 x double>, <2 x double>* %a
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
  ret <2 x double> %res
}

declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone