; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mattr=avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
; RUN: opt < %s -mtriple=x86_64-unknown -mattr=avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
; RUN: opt < %s -mtriple=x86_64-unknown -mattr=avx512vl,avx512dq,avx512bw -passes=slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@src64 = common global [8 x double] zeroinitializer, align 64
@src32 = common global [16 x float] zeroinitializer, align 64
@dst64 = common global [8 x double] zeroinitializer, align 64
@dst32 = common global [16 x float] zeroinitializer, align 64

declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)
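
; Checks that two consecutive scalar f64 sqrt calls are vectorized into a single
; <2 x double> call to @llvm.sqrt.v2f64 on all targets.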
define void @sqrt_2f64() #0 {
; CHECK-LABEL: @sqrt_2f64(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src64, align 8
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
; CHECK-NEXT: store <2 x double> [[TMP2]], ptr @dst64, align 8
; CHECK-NEXT: ret void
;
  %a0 = load double, ptr @src64, align 8
  %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 1), align 8
  %sqrt0 = call double @llvm.sqrt.f64(double %a0)
  %sqrt1 = call double @llvm.sqrt.f64(double %a1)
  store double %sqrt0, ptr @dst64, align 8
  store double %sqrt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  ret void
}
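
; Checks that four consecutive f64 sqrt calls vectorize as two <2 x double> ops
; under SSE and as a single <4 x double> op under AVX.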
define void @sqrt_4f64() #0 {
; SSE-LABEL: @sqrt_4f64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src64, align 8
; SSE-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
; SSE-NEXT: store <2 x double> [[TMP2]], ptr @dst64, align 8
; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 2), align 8
; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]])
; SSE-NEXT: store <2 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 8
; SSE-NEXT: ret void
;
; AVX-LABEL: @sqrt_4f64(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr @src64, align 8
; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
; AVX-NEXT: store <4 x double> [[TMP2]], ptr @dst64, align 8
; AVX-NEXT: ret void
;
  %a0 = load double, ptr @src64, align 8
  %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 1), align 8
  %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 2), align 8
  %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 3), align 8
  %sqrt0 = call double @llvm.sqrt.f64(double %a0)
  %sqrt1 = call double @llvm.sqrt.f64(double %a1)
  %sqrt2 = call double @llvm.sqrt.f64(double %a2)
  %sqrt3 = call double @llvm.sqrt.f64(double %a3)
  store double %sqrt0, ptr @dst64, align 8
  store double %sqrt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
  store double %sqrt2, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 8
  store double %sqrt3, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 3), align 8
  ret void
}
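
; Checks that eight consecutive f64 sqrt calls (with 4-byte-aligned accesses)
; vectorize as four <2 x double> ops under SSE, two <4 x double> ops for the
; AVX256 configurations, and a single <8 x double> op under AVX512.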
define void @sqrt_8f64() #0 {
; SSE-LABEL: @sqrt_8f64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src64, align 4
; SSE-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
; SSE-NEXT: store <2 x double> [[TMP2]], ptr @dst64, align 4
; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 2), align 4
; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]])
; SSE-NEXT: store <2 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 4
; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 4), align 4
; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP5]])
; SSE-NEXT: store <2 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 4
; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 6), align 4
; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP7]])
; SSE-NEXT: store <2 x double> [[TMP8]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @sqrt_8f64(
; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr @src64, align 4
; AVX256-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
; AVX256-NEXT: store <4 x double> [[TMP2]], ptr @dst64, align 4
; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 4), align 4
; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP3]])
; AVX256-NEXT: store <4 x double> [[TMP4]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 4
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @sqrt_8f64(
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src64, align 4
; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[TMP1]])
; AVX512-NEXT: store <8 x double> [[TMP2]], ptr @dst64, align 4
; AVX512-NEXT: ret void
;
  %a0 = load double, ptr @src64, align 4
  %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 1), align 4
  %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 2), align 4
  %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 3), align 4
  %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 4), align 4
  %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 5), align 4
  %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 6), align 4
  %a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @src64, i32 0, i64 7), align 4
  %sqrt0 = call double @llvm.sqrt.f64(double %a0)
  %sqrt1 = call double @llvm.sqrt.f64(double %a1)
  %sqrt2 = call double @llvm.sqrt.f64(double %a2)
  %sqrt3 = call double @llvm.sqrt.f64(double %a3)
  %sqrt4 = call double @llvm.sqrt.f64(double %a4)
  %sqrt5 = call double @llvm.sqrt.f64(double %a5)
  %sqrt6 = call double @llvm.sqrt.f64(double %a6)
  %sqrt7 = call double @llvm.sqrt.f64(double %a7)
  store double %sqrt0, ptr @dst64, align 4
  store double %sqrt1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 4
  store double %sqrt2, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 2), align 4
  store double %sqrt3, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 3), align 4
  store double %sqrt4, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 4
  store double %sqrt5, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 5), align 4
  store double %sqrt6, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 4
  store double %sqrt7, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 7), align 4
  ret void
}
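
; Checks that four consecutive f32 sqrt calls are vectorized into a single
; <4 x float> call to @llvm.sqrt.v4f32 on all targets.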
define void @sqrt_4f32() #0 {
; CHECK-LABEL: @sqrt_4f32(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @src32, align 4
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
; CHECK-NEXT: store <4 x float> [[TMP2]], ptr @dst32, align 4
; CHECK-NEXT: ret void
;
  %a0 = load float, ptr @src32, align 4
  %a1 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 1), align 4
  %a2 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 2), align 4
  %a3 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 3), align 4
  %sqrt0 = call float @llvm.sqrt.f32(float %a0)
  %sqrt1 = call float @llvm.sqrt.f32(float %a1)
  %sqrt2 = call float @llvm.sqrt.f32(float %a2)
  %sqrt3 = call float @llvm.sqrt.f32(float %a3)
  store float %sqrt0, ptr @dst32, align 4
  store float %sqrt1, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  store float %sqrt2, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2), align 4
  store float %sqrt3, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3), align 4
  ret void
}
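
; Checks that eight consecutive f32 sqrt calls vectorize as two <4 x float> ops
; under SSE and as a single <8 x float> op under AVX.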
define void @sqrt_8f32() #0 {
; SSE-LABEL: @sqrt_8f32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @src32, align 4
; SSE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
; SSE-NEXT: store <4 x float> [[TMP2]], ptr @dst32, align 4
; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 4), align 4
; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]])
; SSE-NEXT: store <4 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @sqrt_8f32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @src32, align 4
; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
; AVX-NEXT: store <8 x float> [[TMP2]], ptr @dst32, align 4
; AVX-NEXT: ret void
;
  %a0 = load float, ptr @src32, align 4
  %a1 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 1), align 4
  %a2 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 2), align 4
  %a3 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 3), align 4
  %a4 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 4), align 4
  %a5 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 5), align 4
  %a6 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 6), align 4
  %a7 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 7), align 4
  %sqrt0 = call float @llvm.sqrt.f32(float %a0)
  %sqrt1 = call float @llvm.sqrt.f32(float %a1)
  %sqrt2 = call float @llvm.sqrt.f32(float %a2)
  %sqrt3 = call float @llvm.sqrt.f32(float %a3)
  %sqrt4 = call float @llvm.sqrt.f32(float %a4)
  %sqrt5 = call float @llvm.sqrt.f32(float %a5)
  %sqrt6 = call float @llvm.sqrt.f32(float %a6)
  %sqrt7 = call float @llvm.sqrt.f32(float %a7)
  store float %sqrt0, ptr @dst32, align 4
  store float %sqrt1, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  store float %sqrt2, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2), align 4
  store float %sqrt3, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3), align 4
  store float %sqrt4, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 4
  store float %sqrt5, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 5), align 4
  store float %sqrt6, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 6), align 4
  store float %sqrt7, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 7), align 4
  ret void
}
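
; Checks that sixteen consecutive f32 sqrt calls vectorize as four <4 x float> ops
; under SSE, two <8 x float> ops for the AVX256 configurations, and a single
; <16 x float> op under AVX512.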
define void @sqrt_16f32() #0 {
; SSE-LABEL: @sqrt_16f32(
; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @src32, align 4
; SSE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
; SSE-NEXT: store <4 x float> [[TMP2]], ptr @dst32, align 4
; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 4), align 4
; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]])
; SSE-NEXT: store <4 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 4
; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 8), align 4
; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP5]])
; SSE-NEXT: store <4 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 4
; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 12), align 4
; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP7]])
; SSE-NEXT: store <4 x float> [[TMP8]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 12), align 4
; SSE-NEXT: ret void
;
; AVX256-LABEL: @sqrt_16f32(
; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @src32, align 4
; AVX256-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
; AVX256-NEXT: store <8 x float> [[TMP2]], ptr @dst32, align 4
; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 8), align 4
; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP3]])
; AVX256-NEXT: store <8 x float> [[TMP4]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 4
; AVX256-NEXT: ret void
;
; AVX512-LABEL: @sqrt_16f32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr @src32, align 4
; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[TMP1]])
; AVX512-NEXT: store <16 x float> [[TMP2]], ptr @dst32, align 4
; AVX512-NEXT: ret void
;
  %a0 = load float, ptr @src32, align 4
  %a1 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 1), align 4
  %a2 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 2), align 4
  %a3 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 3), align 4
  %a4 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 4), align 4
  %a5 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 5), align 4
  %a6 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 6), align 4
  %a7 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 7), align 4
  %a8 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 8), align 4
  %a9 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 9), align 4
  %a10 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 10), align 4
  %a11 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 11), align 4
  %a12 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 12), align 4
  %a13 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 13), align 4
  %a14 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 14), align 4
  %a15 = load float, ptr getelementptr inbounds ([16 x float], ptr @src32, i32 0, i64 15), align 4
  %sqrt0 = call float @llvm.sqrt.f32(float %a0 )
  %sqrt1 = call float @llvm.sqrt.f32(float %a1 )
  %sqrt2 = call float @llvm.sqrt.f32(float %a2 )
  %sqrt3 = call float @llvm.sqrt.f32(float %a3 )
  %sqrt4 = call float @llvm.sqrt.f32(float %a4 )
  %sqrt5 = call float @llvm.sqrt.f32(float %a5 )
  %sqrt6 = call float @llvm.sqrt.f32(float %a6 )
  %sqrt7 = call float @llvm.sqrt.f32(float %a7 )
  %sqrt8 = call float @llvm.sqrt.f32(float %a8 )
  %sqrt9 = call float @llvm.sqrt.f32(float %a9 )
  %sqrt10 = call float @llvm.sqrt.f32(float %a10)
  %sqrt11 = call float @llvm.sqrt.f32(float %a11)
  %sqrt12 = call float @llvm.sqrt.f32(float %a12)
  %sqrt13 = call float @llvm.sqrt.f32(float %a13)
  %sqrt14 = call float @llvm.sqrt.f32(float %a14)
  %sqrt15 = call float @llvm.sqrt.f32(float %a15)
  store float %sqrt0 , ptr @dst32, align 4
  store float %sqrt1 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 1), align 4
  store float %sqrt2 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 2), align 4
  store float %sqrt3 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 3), align 4
  store float %sqrt4 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 4), align 4
  store float %sqrt5 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 5), align 4
  store float %sqrt6 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 6), align 4
  store float %sqrt7 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 7), align 4
  store float %sqrt8 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 4
  store float %sqrt9 , ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 9), align 4
  store float %sqrt10, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 10), align 4
  store float %sqrt11, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 11), align 4
  store float %sqrt12, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 12), align 4
  store float %sqrt13, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 13), align 4
  store float %sqrt14, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 14), align 4
  store float %sqrt15, ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 15), align 4
  ret void
}

attributes #0 = { nounwind }