1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; Both opt invocations below run the same pipeline (inject-tli-mappings followed
; by slp-vectorizer) on this file. The first one passes
; -vector-library=Accelerate and is verified with the default CHECK prefix; the
; second one omits -vector-library and is verified with the NOACCELERATE prefix.
2 ; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -vector-library=Accelerate -S %s | FileCheck %s
3 ; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -S %s | FileCheck --check-prefix NOACCELERATE %s
; Module-level data layout and target triple (arm64 Apple iOS 14).
5 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
6 target triple = "arm64-apple-ios14.0.0"
; Test: four per-lane calls to the scalar intrinsic @llvm.sin.f32 on a loaded
; <4 x float>.
;  - With the Accelerate library (CHECK prefix) the expected output is a single
;    call to @vsinf on the whole vector.
;  - Without it (NOACCELERATE prefix) the expected output keeps lanes 0 and 1
;    as scalar @llvm.sin.f32 calls and combines lanes 2 and 3 into one
;    @llvm.sin.v2f32 call, merged back with shufflevector.
8 declare float @llvm.sin.f32(float)
10 ; Accelerate provides sin() for <4 x float>
11 define <4 x float> @int_sin_4x(ptr %a) {
12 ; CHECK-LABEL: @int_sin_4x(
14 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
15 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
16 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
18 ; NOACCELERATE-LABEL: @int_sin_4x(
19 ; NOACCELERATE-NEXT: entry:
20 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
21 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
22 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
23 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
24 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
25 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
26 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
27 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
28 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
29 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
30 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
31 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
34 %0 = load <4 x float>, ptr %a, align 16
35 %vecext = extractelement <4 x float> %0, i32 0
36 %1 = tail call fast float @llvm.sin.f32(float %vecext)
37 %vecins = insertelement <4 x float> undef, float %1, i32 0
38 %vecext.1 = extractelement <4 x float> %0, i32 1
39 %2 = tail call fast float @llvm.sin.f32(float %vecext.1)
40 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
41 %vecext.2 = extractelement <4 x float> %0, i32 2
42 %3 = tail call fast float @llvm.sin.f32(float %vecext.2)
43 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
44 %vecext.3 = extractelement <4 x float> %0, i32 3
45 %4 = tail call fast float @llvm.sin.f32(float %vecext.3)
46 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
47 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @ceilf on a loaded
; <4 x float>. Both runs expect the same output: a single call to the
; @llvm.ceil.v4f32 intrinsic, so the expectation here does not depend on
; -vector-library.
50 declare float @ceilf(float) readonly nounwind willreturn
52 define <4 x float> @ceil_4x(ptr %a) {
53 ; CHECK-LABEL: @ceil_4x(
55 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
56 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
57 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
59 ; NOACCELERATE-LABEL: @ceil_4x(
60 ; NOACCELERATE-NEXT: entry:
61 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
62 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
63 ; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
66 %0 = load <4 x float>, ptr %a, align 16
67 %vecext = extractelement <4 x float> %0, i32 0
68 %1 = tail call fast float @ceilf(float %vecext)
69 %vecins = insertelement <4 x float> undef, float %1, i32 0
70 %vecext.1 = extractelement <4 x float> %0, i32 1
71 %2 = tail call fast float @ceilf(float %vecext.1)
72 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
73 %vecext.2 = extractelement <4 x float> %0, i32 2
74 %3 = tail call fast float @ceilf(float %vecext.2)
75 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
76 %vecext.3 = extractelement <4 x float> %0, i32 3
77 %4 = tail call fast float @ceilf(float %vecext.3)
78 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
79 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @fabsf on a loaded
; <4 x float>. Both runs expect the same output: a single call to the
; @llvm.fabs.v4f32 intrinsic.
82 declare float @fabsf(float) readonly nounwind willreturn
84 define <4 x float> @fabs_4x(ptr %a) {
85 ; CHECK-LABEL: @fabs_4x(
87 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
88 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
89 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
91 ; NOACCELERATE-LABEL: @fabs_4x(
92 ; NOACCELERATE-NEXT: entry:
93 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
94 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
95 ; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
98 %0 = load <4 x float>, ptr %a, align 16
99 %vecext = extractelement <4 x float> %0, i32 0
100 %1 = tail call fast float @fabsf(float %vecext)
101 %vecins = insertelement <4 x float> undef, float %1, i32 0
102 %vecext.1 = extractelement <4 x float> %0, i32 1
103 %2 = tail call fast float @fabsf(float %vecext.1)
104 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
105 %vecext.2 = extractelement <4 x float> %0, i32 2
106 %3 = tail call fast float @fabsf(float %vecext.2)
107 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
108 %vecext.3 = extractelement <4 x float> %0, i32 3
109 %4 = tail call fast float @fabsf(float %vecext.3)
110 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
111 ret <4 x float> %vecins.3
; Test: same shape as @fabs_4x, but the scalar calls use the @llvm.fabs.f32
; intrinsic instead of the @fabsf library function. Both runs expect a single
; call to @llvm.fabs.v4f32.
113 declare float @llvm.fabs.f32(float)
114 define <4 x float> @int_fabs_4x(ptr %a) {
115 ; CHECK-LABEL: @int_fabs_4x(
117 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
118 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
119 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
121 ; NOACCELERATE-LABEL: @int_fabs_4x(
122 ; NOACCELERATE-NEXT: entry:
123 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
124 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
125 ; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar intrinsic on it, and insert the result into the output vector.
128 %0 = load <4 x float>, ptr %a, align 16
129 %vecext = extractelement <4 x float> %0, i32 0
130 %1 = tail call fast float @llvm.fabs.f32(float %vecext)
131 %vecins = insertelement <4 x float> undef, float %1, i32 0
132 %vecext.1 = extractelement <4 x float> %0, i32 1
133 %2 = tail call fast float @llvm.fabs.f32(float %vecext.1)
134 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
135 %vecext.2 = extractelement <4 x float> %0, i32 2
136 %3 = tail call fast float @llvm.fabs.f32(float %vecext.2)
137 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
138 %vecext.3 = extractelement <4 x float> %0, i32 3
139 %4 = tail call fast float @llvm.fabs.f32(float %vecext.3)
140 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
141 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @floorf on a loaded
; <4 x float>. Both runs expect the same output: a single call to the
; @llvm.floor.v4f32 intrinsic.
143 declare float @floorf(float) readonly nounwind willreturn
144 define <4 x float> @floor_4x(ptr %a) {
145 ; CHECK-LABEL: @floor_4x(
147 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
148 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
149 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
151 ; NOACCELERATE-LABEL: @floor_4x(
152 ; NOACCELERATE-NEXT: entry:
153 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
154 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
155 ; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
158 %0 = load <4 x float>, ptr %a, align 16
159 %vecext = extractelement <4 x float> %0, i32 0
160 %1 = tail call fast float @floorf(float %vecext)
161 %vecins = insertelement <4 x float> undef, float %1, i32 0
162 %vecext.1 = extractelement <4 x float> %0, i32 1
163 %2 = tail call fast float @floorf(float %vecext.1)
164 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
165 %vecext.2 = extractelement <4 x float> %0, i32 2
166 %3 = tail call fast float @floorf(float %vecext.2)
167 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
168 %vecext.3 = extractelement <4 x float> %0, i32 3
169 %4 = tail call fast float @floorf(float %vecext.3)
170 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
171 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @sqrtf on a loaded
; <4 x float>. Both runs expect the same output: a single call to the
; @llvm.sqrt.v4f32 intrinsic.
173 declare float @sqrtf(float) readonly nounwind willreturn
174 define <4 x float> @sqrt_4x(ptr %a) {
175 ; CHECK-LABEL: @sqrt_4x(
177 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
178 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
179 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
181 ; NOACCELERATE-LABEL: @sqrt_4x(
182 ; NOACCELERATE-NEXT: entry:
183 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
184 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
185 ; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
188 %0 = load <4 x float>, ptr %a, align 16
189 %vecext = extractelement <4 x float> %0, i32 0
190 %1 = tail call fast float @sqrtf(float %vecext)
191 %vecins = insertelement <4 x float> undef, float %1, i32 0
192 %vecext.1 = extractelement <4 x float> %0, i32 1
193 %2 = tail call fast float @sqrtf(float %vecext.1)
194 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
195 %vecext.2 = extractelement <4 x float> %0, i32 2
196 %3 = tail call fast float @sqrtf(float %vecext.2)
197 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
198 %vecext.3 = extractelement <4 x float> %0, i32 3
199 %4 = tail call fast float @sqrtf(float %vecext.3)
200 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
201 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @expf on a loaded
; <4 x float>.
;  - With the Accelerate library (CHECK prefix) the expected output is a single
;    call to @vexpf on the whole vector.
;  - Without it (NOACCELERATE prefix) the expected output keeps lanes 0 and 1
;    as scalar @expf calls and combines lanes 2 and 3 into one @llvm.exp.v2f32
;    call, merged back with shufflevector.
203 declare float @expf(float) readonly nounwind willreturn
204 define <4 x float> @exp_4x(ptr %a) {
205 ; CHECK-LABEL: @exp_4x(
207 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
208 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpf(<4 x float> [[TMP0]])
209 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
211 ; NOACCELERATE-LABEL: @exp_4x(
212 ; NOACCELERATE-NEXT: entry:
213 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
214 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
215 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
216 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
217 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
218 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
219 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
220 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
221 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
222 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
223 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
224 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
227 %0 = load <4 x float>, ptr %a, align 16
228 %vecext = extractelement <4 x float> %0, i32 0
229 %1 = tail call fast float @expf(float %vecext)
230 %vecins = insertelement <4 x float> undef, float %1, i32 0
231 %vecext.1 = extractelement <4 x float> %0, i32 1
232 %2 = tail call fast float @expf(float %vecext.1)
233 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
234 %vecext.2 = extractelement <4 x float> %0, i32 2
235 %3 = tail call fast float @expf(float %vecext.2)
236 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
237 %vecext.3 = extractelement <4 x float> %0, i32 3
238 %4 = tail call fast float @expf(float %vecext.3)
239 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
240 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @expm1f on a loaded
; <4 x float>.
;  - With the Accelerate library (CHECK prefix) the expected output is a single
;    call to @vexpm1f on the whole vector.
;  - Without it (NOACCELERATE prefix) the expected output is the input
;    unchanged: all four lanes remain scalar @expm1f calls.
242 declare float @expm1f(float) readonly nounwind willreturn
243 define <4 x float> @expm1_4x(ptr %a) {
244 ; CHECK-LABEL: @expm1_4x(
246 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
247 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpm1f(<4 x float> [[TMP0]])
248 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
250 ; NOACCELERATE-LABEL: @expm1_4x(
251 ; NOACCELERATE-NEXT: entry:
252 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
253 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
254 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expm1f(float [[VECEXT]])
255 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
256 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
257 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expm1f(float [[VECEXT_1]])
258 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
259 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
260 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expm1f(float [[VECEXT_2]])
261 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
262 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
263 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expm1f(float [[VECEXT_3]])
264 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
265 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
268 %0 = load <4 x float>, ptr %a, align 16
269 %vecext = extractelement <4 x float> %0, i32 0
270 %1 = tail call fast float @expm1f(float %vecext)
271 %vecins = insertelement <4 x float> undef, float %1, i32 0
272 %vecext.1 = extractelement <4 x float> %0, i32 1
273 %2 = tail call fast float @expm1f(float %vecext.1)
274 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
275 %vecext.2 = extractelement <4 x float> %0, i32 2
276 %3 = tail call fast float @expm1f(float %vecext.2)
277 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
278 %vecext.3 = extractelement <4 x float> %0, i32 3
279 %4 = tail call fast float @expm1f(float %vecext.3)
280 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
281 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @logf on a loaded
; <4 x float>.
;  - With the Accelerate library (CHECK prefix) the expected output is a single
;    call to @vlogf on the whole vector.
;  - Without it (NOACCELERATE prefix) the expected output keeps lanes 0 and 1
;    as scalar @logf calls and combines lanes 2 and 3 into one @llvm.log.v2f32
;    call, merged back with shufflevector.
283 declare float @logf(float) readonly nounwind willreturn
284 define <4 x float> @log_4x(ptr %a) {
285 ; CHECK-LABEL: @log_4x(
287 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
288 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogf(<4 x float> [[TMP0]])
289 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
291 ; NOACCELERATE-LABEL: @log_4x(
292 ; NOACCELERATE-NEXT: entry:
293 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
294 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
295 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
296 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
297 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
298 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
299 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
300 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
301 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
302 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
303 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
304 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
307 %0 = load <4 x float>, ptr %a, align 16
308 %vecext = extractelement <4 x float> %0, i32 0
309 %1 = tail call fast float @logf(float %vecext)
310 %vecins = insertelement <4 x float> undef, float %1, i32 0
311 %vecext.1 = extractelement <4 x float> %0, i32 1
312 %2 = tail call fast float @logf(float %vecext.1)
313 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
314 %vecext.2 = extractelement <4 x float> %0, i32 2
315 %3 = tail call fast float @logf(float %vecext.2)
316 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
317 %vecext.3 = extractelement <4 x float> %0, i32 3
318 %4 = tail call fast float @logf(float %vecext.3)
319 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
320 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @log1pf on a loaded
; <4 x float>.
;  - With the Accelerate library (CHECK prefix) the expected output is a single
;    call to @vlog1pf on the whole vector.
;  - Without it (NOACCELERATE prefix) the expected output is the input
;    unchanged: all four lanes remain scalar @log1pf calls.
322 declare float @log1pf(float) readonly nounwind willreturn
323 define <4 x float> @log1p_4x(ptr %a) {
324 ; CHECK-LABEL: @log1p_4x(
326 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
327 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlog1pf(<4 x float> [[TMP0]])
328 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
330 ; NOACCELERATE-LABEL: @log1p_4x(
331 ; NOACCELERATE-NEXT: entry:
332 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
333 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
334 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @log1pf(float [[VECEXT]])
335 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
336 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
337 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @log1pf(float [[VECEXT_1]])
338 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
339 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
340 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @log1pf(float [[VECEXT_2]])
341 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
342 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
343 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @log1pf(float [[VECEXT_3]])
344 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
345 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
348 %0 = load <4 x float>, ptr %a, align 16
349 %vecext = extractelement <4 x float> %0, i32 0
350 %1 = tail call fast float @log1pf(float %vecext)
351 %vecins = insertelement <4 x float> undef, float %1, i32 0
352 %vecext.1 = extractelement <4 x float> %0, i32 1
353 %2 = tail call fast float @log1pf(float %vecext.1)
354 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
355 %vecext.2 = extractelement <4 x float> %0, i32 2
356 %3 = tail call fast float @log1pf(float %vecext.2)
357 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
358 %vecext.3 = extractelement <4 x float> %0, i32 3
359 %4 = tail call fast float @log1pf(float %vecext.3)
360 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
361 ret <4 x float> %vecins.3
; Negative test: four per-lane calls to @log10pf on a loaded <4 x float>.
; Under both runs the expected output is the input unchanged, i.e. all four
; lanes remain scalar @log10pf calls and no vector call is formed.
; NOTE(review): presumably this is because no vector mapping is registered for
; the name log10pf - confirm against the vector-library tables.
363 declare float @log10pf(float) readonly nounwind willreturn
364 define <4 x float> @log10p_4x(ptr %a) {
365 ; CHECK-LABEL: @log10p_4x(
367 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
368 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
369 ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
370 ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
371 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
372 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
373 ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
374 ; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
375 ; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
376 ; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
377 ; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
378 ; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
379 ; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
380 ; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
382 ; NOACCELERATE-LABEL: @log10p_4x(
383 ; NOACCELERATE-NEXT: entry:
384 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
385 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
386 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
387 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
388 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
389 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
390 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
391 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
392 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
393 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
394 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
395 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
396 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
397 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
400 %0 = load <4 x float>, ptr %a, align 16
401 %vecext = extractelement <4 x float> %0, i32 0
402 %1 = tail call fast float @log10pf(float %vecext)
403 %vecins = insertelement <4 x float> undef, float %1, i32 0
404 %vecext.1 = extractelement <4 x float> %0, i32 1
405 %2 = tail call fast float @log10pf(float %vecext.1)
406 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
407 %vecext.2 = extractelement <4 x float> %0, i32 2
408 %3 = tail call fast float @log10pf(float %vecext.2)
409 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
410 %vecext.3 = extractelement <4 x float> %0, i32 3
411 %4 = tail call fast float @log10pf(float %vecext.3)
412 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
413 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @logbf on a loaded
; <4 x float>.
;  - With the Accelerate library (CHECK prefix) the expected output is a single
;    call to @vlogbf on the whole vector.
;  - Without it (NOACCELERATE prefix) the expected output is the input
;    unchanged: all four lanes remain scalar @logbf calls.
415 declare float @logbf(float) readonly nounwind willreturn
416 define <4 x float> @logb_4x(ptr %a) {
417 ; CHECK-LABEL: @logb_4x(
419 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
420 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogbf(<4 x float> [[TMP0]])
421 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
423 ; NOACCELERATE-LABEL: @logb_4x(
424 ; NOACCELERATE-NEXT: entry:
425 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
426 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
427 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logbf(float [[VECEXT]])
428 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
429 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
430 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logbf(float [[VECEXT_1]])
431 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
432 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
433 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logbf(float [[VECEXT_2]])
434 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
435 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
436 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logbf(float [[VECEXT_3]])
437 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
438 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
441 %0 = load <4 x float>, ptr %a, align 16
442 %vecext = extractelement <4 x float> %0, i32 0
443 %1 = tail call fast float @logbf(float %vecext)
444 %vecins = insertelement <4 x float> undef, float %1, i32 0
445 %vecext.1 = extractelement <4 x float> %0, i32 1
446 %2 = tail call fast float @logbf(float %vecext.1)
447 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
448 %vecext.2 = extractelement <4 x float> %0, i32 2
449 %3 = tail call fast float @logbf(float %vecext.2)
450 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
451 %vecext.3 = extractelement <4 x float> %0, i32 3
452 %4 = tail call fast float @logbf(float %vecext.3)
453 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
454 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @sinf on a loaded
; <4 x float> (library-call counterpart of @int_sin_4x, which uses the
; @llvm.sin.f32 intrinsic).
;  - With the Accelerate library (CHECK prefix) the expected output is a single
;    call to @vsinf on the whole vector.
;  - Without it (NOACCELERATE prefix) the expected output keeps lanes 0 and 1
;    as scalar @sinf calls and combines lanes 2 and 3 into one @llvm.sin.v2f32
;    call, merged back with shufflevector.
456 declare float @sinf(float) readonly nounwind willreturn
457 define <4 x float> @sin_4x(ptr %a) {
458 ; CHECK-LABEL: @sin_4x(
460 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
461 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
462 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
464 ; NOACCELERATE-LABEL: @sin_4x(
465 ; NOACCELERATE-NEXT: entry:
466 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
467 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
468 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
469 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
470 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
471 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
472 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
473 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
474 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
475 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
476 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
477 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
480 %0 = load <4 x float>, ptr %a, align 16
481 %vecext = extractelement <4 x float> %0, i32 0
482 %1 = tail call fast float @sinf(float %vecext)
483 %vecins = insertelement <4 x float> undef, float %1, i32 0
484 %vecext.1 = extractelement <4 x float> %0, i32 1
485 %2 = tail call fast float @sinf(float %vecext.1)
486 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
487 %vecext.2 = extractelement <4 x float> %0, i32 2
488 %3 = tail call fast float @sinf(float %vecext.2)
489 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
490 %vecext.3 = extractelement <4 x float> %0, i32 3
491 %4 = tail call fast float @sinf(float %vecext.3)
492 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
493 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @cosf on a loaded
; <4 x float>.
;  - With the Accelerate library (CHECK prefix) the expected output is a single
;    call to @vcosf on the whole vector.
;  - Without it (NOACCELERATE prefix) the expected output keeps lanes 0 and 1
;    as scalar @cosf calls and combines lanes 2 and 3 into one @llvm.cos.v2f32
;    call, merged back with shufflevector.
495 declare float @cosf(float) readonly nounwind willreturn
496 define <4 x float> @cos_4x(ptr %a) {
497 ; CHECK-LABEL: @cos_4x(
499 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
500 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
501 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
503 ; NOACCELERATE-LABEL: @cos_4x(
504 ; NOACCELERATE-NEXT: entry:
505 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
506 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
507 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
508 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
509 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
510 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
511 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
512 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
513 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
514 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
515 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
516 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
519 %0 = load <4 x float>, ptr %a, align 16
520 %vecext = extractelement <4 x float> %0, i32 0
521 %1 = tail call fast float @cosf(float %vecext)
522 %vecins = insertelement <4 x float> undef, float %1, i32 0
523 %vecext.1 = extractelement <4 x float> %0, i32 1
524 %2 = tail call fast float @cosf(float %vecext.1)
525 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
526 %vecext.2 = extractelement <4 x float> %0, i32 2
527 %3 = tail call fast float @cosf(float %vecext.2)
528 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
529 %vecext.3 = extractelement <4 x float> %0, i32 3
530 %4 = tail call fast float @cosf(float %vecext.3)
531 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
532 ret <4 x float> %vecins.3
; Test: four per-lane calls to the library function @tanf on a loaded
; <4 x float>.
;  - With the Accelerate library (CHECK prefix) the expected output is a single
;    call to @vtanf on the whole vector.
;  - Without it (NOACCELERATE prefix) the expected output keeps lanes 0 and 1
;    as scalar @tanf calls and combines lanes 2 and 3 into one @llvm.tan.v2f32
;    call, merged back with shufflevector.
534 declare float @tanf(float) readonly nounwind willreturn
535 define <4 x float> @tan_4x(ptr %a) {
536 ; CHECK-LABEL: @tan_4x(
538 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
539 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanf(<4 x float> [[TMP0]])
540 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
542 ; NOACCELERATE-LABEL: @tan_4x(
543 ; NOACCELERATE-NEXT: entry:
544 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
545 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
546 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @tanf(float [[VECEXT]])
547 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
548 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
549 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
550 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
551 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
552 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
553 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
554 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
555 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; Input IR: load the vector, then for each lane 0..3 extract the element, call
; the scalar function on it, and insert the result into the output vector.
558 %0 = load <4 x float>, ptr %a, align 16
559 %vecext = extractelement <4 x float> %0, i32 0
560 %1 = tail call fast float @tanf(float %vecext)
561 %vecins = insertelement <4 x float> undef, float %1, i32 0
562 %vecext.1 = extractelement <4 x float> %0, i32 1
563 %2 = tail call fast float @tanf(float %vecext.1)
564 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
565 %vecext.2 = extractelement <4 x float> %0, i32 2
566 %3 = tail call fast float @tanf(float %vecext.2)
567 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
568 %vecext.3 = extractelement <4 x float> %0, i32 3
569 %4 = tail call fast float @tanf(float %vecext.3)
570 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
571 ret <4 x float> %vecins.3
; Library-call variant: four scalar @asinf calls, one per lane of a loaded <4 x float>.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vasinf.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @asinf calls
; and lanes 2 and 3 to be combined into one @llvm.asin.v2f32 call.
573 declare float @asinf(float) readonly nounwind willreturn
574 define <4 x float> @asin_4x(ptr %a) {
575 ; CHECK-LABEL: @asin_4x(
577 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
578 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
579 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
581 ; NOACCELERATE-LABEL: @asin_4x(
582 ; NOACCELERATE-NEXT: entry:
583 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
584 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
585 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @asinf(float [[VECEXT]])
586 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
587 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
588 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]])
589 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
590 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
591 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
592 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
593 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
594 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
597 %0 = load <4 x float>, ptr %a, align 16
598 %vecext = extractelement <4 x float> %0, i32 0
599 %1 = tail call fast float @asinf(float %vecext)
600 %vecins = insertelement <4 x float> undef, float %1, i32 0
601 %vecext.1 = extractelement <4 x float> %0, i32 1
602 %2 = tail call fast float @asinf(float %vecext.1)
603 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
604 %vecext.2 = extractelement <4 x float> %0, i32 2
605 %3 = tail call fast float @asinf(float %vecext.2)
606 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
607 %vecext.3 = extractelement <4 x float> %0, i32 3
608 %4 = tail call fast float @asinf(float %vecext.3)
609 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
610 ret <4 x float> %vecins.3
; Intrinsic variant: four scalar @llvm.asin.f32 calls, one per lane of a loaded <4 x float>.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vasinf.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @llvm.asin.f32 calls
; and lanes 2 and 3 to be combined into one @llvm.asin.v2f32 call.
612 define <4 x float> @int_asin_4x(ptr %a) {
613 ; CHECK-LABEL: @int_asin_4x(
615 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
616 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
617 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
619 ; NOACCELERATE-LABEL: @int_asin_4x(
620 ; NOACCELERATE-NEXT: entry:
621 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
622 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
623 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT]])
624 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
625 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
626 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]])
627 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
628 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
629 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
630 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
631 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
632 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
635 %0 = load <4 x float>, ptr %a, align 16
636 %vecext = extractelement <4 x float> %0, i32 0
637 %1 = tail call fast float @llvm.asin.f32(float %vecext)
638 %vecins = insertelement <4 x float> undef, float %1, i32 0
639 %vecext.1 = extractelement <4 x float> %0, i32 1
640 %2 = tail call fast float @llvm.asin.f32(float %vecext.1)
641 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
642 %vecext.2 = extractelement <4 x float> %0, i32 2
643 %3 = tail call fast float @llvm.asin.f32(float %vecext.2)
644 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
645 %vecext.3 = extractelement <4 x float> %0, i32 3
646 %4 = tail call fast float @llvm.asin.f32(float %vecext.3)
647 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
648 ret <4 x float> %vecins.3
; Library-call variant: four scalar @acosf calls, one per lane of a loaded <4 x float>.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vacosf.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @acosf calls
; and lanes 2 and 3 to be combined into one @llvm.acos.v2f32 call.
650 declare float @acosf(float) readonly nounwind willreturn
651 define <4 x float> @acos_4x(ptr %a) {
652 ; CHECK-LABEL: @acos_4x(
654 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
655 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
656 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
658 ; NOACCELERATE-LABEL: @acos_4x(
659 ; NOACCELERATE-NEXT: entry:
660 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
661 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
662 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @acosf(float [[VECEXT]])
663 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
664 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
665 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
666 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
667 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
668 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]])
669 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
670 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
671 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
674 %0 = load <4 x float>, ptr %a, align 16
675 %vecext = extractelement <4 x float> %0, i32 0
676 %1 = tail call fast float @acosf(float %vecext)
677 %vecins = insertelement <4 x float> undef, float %1, i32 0
678 %vecext.1 = extractelement <4 x float> %0, i32 1
679 %2 = tail call fast float @acosf(float %vecext.1)
680 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
681 %vecext.2 = extractelement <4 x float> %0, i32 2
682 %3 = tail call fast float @acosf(float %vecext.2)
683 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
684 %vecext.3 = extractelement <4 x float> %0, i32 3
685 %4 = tail call fast float @acosf(float %vecext.3)
686 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
687 ret <4 x float> %vecins.3
; Intrinsic variant: four scalar @llvm.acos.f32 calls, one per lane of a loaded <4 x float>.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vacosf.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @llvm.acos.f32 calls
; and lanes 2 and 3 to be combined into one @llvm.acos.v2f32 call.
689 define <4 x float> @int_acos_4x(ptr %a) {
690 ; CHECK-LABEL: @int_acos_4x(
692 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
693 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
694 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
696 ; NOACCELERATE-LABEL: @int_acos_4x(
697 ; NOACCELERATE-NEXT: entry:
698 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
699 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
700 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]])
701 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
702 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
703 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]])
704 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
705 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
706 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]])
707 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
708 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
709 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
712 %0 = load <4 x float>, ptr %a, align 16
713 %vecext = extractelement <4 x float> %0, i32 0
714 %1 = tail call fast float @llvm.acos.f32(float %vecext)
715 %vecins = insertelement <4 x float> undef, float %1, i32 0
716 %vecext.1 = extractelement <4 x float> %0, i32 1
717 %2 = tail call fast float @llvm.acos.f32(float %vecext.1)
718 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
719 %vecext.2 = extractelement <4 x float> %0, i32 2
720 %3 = tail call fast float @llvm.acos.f32(float %vecext.2)
721 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
722 %vecext.3 = extractelement <4 x float> %0, i32 3
723 %4 = tail call fast float @llvm.acos.f32(float %vecext.3)
724 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
725 ret <4 x float> %vecins.3
; Library-call variant: four scalar @atanf calls, one per lane of a loaded <4 x float>.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vatanf.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @atanf calls
; and lanes 2 and 3 to be combined into one @llvm.atan.v2f32 call.
727 declare float @atanf(float) readonly nounwind willreturn
728 define <4 x float> @atan_4x(ptr %a) {
729 ; CHECK-LABEL: @atan_4x(
731 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
732 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
733 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
735 ; NOACCELERATE-LABEL: @atan_4x(
736 ; NOACCELERATE-NEXT: entry:
737 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
738 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
739 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atanf(float [[VECEXT]])
740 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
741 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
742 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]])
743 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
744 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
745 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]])
746 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
747 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
748 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
751 %0 = load <4 x float>, ptr %a, align 16
752 %vecext = extractelement <4 x float> %0, i32 0
753 %1 = tail call fast float @atanf(float %vecext)
754 %vecins = insertelement <4 x float> undef, float %1, i32 0
755 %vecext.1 = extractelement <4 x float> %0, i32 1
756 %2 = tail call fast float @atanf(float %vecext.1)
757 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
758 %vecext.2 = extractelement <4 x float> %0, i32 2
759 %3 = tail call fast float @atanf(float %vecext.2)
760 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
761 %vecext.3 = extractelement <4 x float> %0, i32 3
762 %4 = tail call fast float @atanf(float %vecext.3)
763 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
764 ret <4 x float> %vecins.3
; Intrinsic variant: four scalar @llvm.atan.f32 calls, one per lane of a loaded <4 x float>.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vatanf.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @llvm.atan.f32 calls
; and lanes 2 and 3 to be combined into one @llvm.atan.v2f32 call.
766 define <4 x float> @int_atan_4x(ptr %a) {
767 ; CHECK-LABEL: @int_atan_4x(
769 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
770 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
771 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
773 ; NOACCELERATE-LABEL: @int_atan_4x(
774 ; NOACCELERATE-NEXT: entry:
775 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
776 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
777 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]])
778 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
779 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
780 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]])
781 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
782 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
783 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]])
784 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
785 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
786 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
789 %0 = load <4 x float>, ptr %a, align 16
790 %vecext = extractelement <4 x float> %0, i32 0
791 %1 = tail call fast float @llvm.atan.f32(float %vecext)
792 %vecins = insertelement <4 x float> undef, float %1, i32 0
793 %vecext.1 = extractelement <4 x float> %0, i32 1
794 %2 = tail call fast float @llvm.atan.f32(float %vecext.1)
795 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
796 %vecext.2 = extractelement <4 x float> %0, i32 2
797 %3 = tail call fast float @llvm.atan.f32(float %vecext.2)
798 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
799 %vecext.3 = extractelement <4 x float> %0, i32 3
800 %4 = tail call fast float @llvm.atan.f32(float %vecext.3)
801 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
802 ret <4 x float> %vecins.3
; Library-call variant with two operands: four scalar @atan2f calls, each taking the
; matching lane of the <4 x float> loaded from %a and the <4 x float> loaded from %b.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vatan2f.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @atan2f calls
; and lanes 2 and 3 to be combined into one @llvm.atan2.v2f32 call.
804 declare float @atan2f(float,float) readonly nounwind willreturn
805 define <4 x float> @atan2_4x(ptr %a, ptr %b) {
806 ; CHECK-LABEL: @atan2_4x(
808 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
809 ; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
810 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
811 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
813 ; NOACCELERATE-LABEL: @atan2_4x(
814 ; NOACCELERATE-NEXT: entry:
815 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
816 ; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
817 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
818 ; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
819 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atan2f(float [[VECEXT]], float [[VECEXTB]])
820 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
821 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
822 ; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
823 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atan2f(float [[VECEXT_1]], float [[VECEXTB_1]])
824 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
825 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
826 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
827 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
828 ; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
829 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
830 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
833 %0 = load <4 x float>, ptr %a, align 16
834 %bb = load <4 x float>, ptr %b, align 16
835 %vecext = extractelement <4 x float> %0, i32 0
836 %vecextb = extractelement <4 x float> %bb, i32 0
837 %1 = tail call fast float @atan2f(float %vecext, float %vecextb)
838 %vecins = insertelement <4 x float> undef, float %1, i32 0
839 %vecext.1 = extractelement <4 x float> %0, i32 1
840 %vecextb.1 = extractelement <4 x float> %bb, i32 1
841 %2 = tail call fast float @atan2f(float %vecext.1, float %vecextb.1)
842 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
843 %vecext.2 = extractelement <4 x float> %0, i32 2
844 %vecextb.2 = extractelement <4 x float> %bb, i32 2
845 %3 = tail call fast float @atan2f(float %vecext.2, float %vecextb.2)
846 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
847 %vecext.3 = extractelement <4 x float> %0, i32 3
848 %vecextb.3 = extractelement <4 x float> %bb, i32 3
849 %4 = tail call fast float @atan2f(float %vecext.3, float %vecextb.3)
850 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
851 ret <4 x float> %vecins.3
; Intrinsic variant with two operands: four scalar @llvm.atan2.f32 calls, each taking the
; matching lane of the <4 x float> loaded from %a and the <4 x float> loaded from %b.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vatan2f.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @llvm.atan2.f32 calls
; and lanes 2 and 3 to be combined into one @llvm.atan2.v2f32 call.
853 define <4 x float> @int_atan2_4x(ptr %a, ptr %b) {
854 ; CHECK-LABEL: @int_atan2_4x(
856 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
857 ; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
858 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
859 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
861 ; NOACCELERATE-LABEL: @int_atan2_4x(
862 ; NOACCELERATE-NEXT: entry:
863 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
864 ; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
865 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
866 ; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
867 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT]], float [[VECEXTB]])
868 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
869 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
870 ; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
871 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_1]], float [[VECEXTB_1]])
872 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
873 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
874 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
875 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
876 ; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
877 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
878 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
881 %0 = load <4 x float>, ptr %a, align 16
882 %bb = load <4 x float>, ptr %b, align 16
883 %vecext = extractelement <4 x float> %0, i32 0
884 %vecextb = extractelement <4 x float> %bb, i32 0
885 %1 = tail call fast float @llvm.atan2.f32(float %vecext, float %vecextb)
886 %vecins = insertelement <4 x float> undef, float %1, i32 0
887 %vecext.1 = extractelement <4 x float> %0, i32 1
888 %vecextb.1 = extractelement <4 x float> %bb, i32 1
889 %2 = tail call fast float @llvm.atan2.f32(float %vecext.1, float %vecextb.1)
890 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
891 %vecext.2 = extractelement <4 x float> %0, i32 2
892 %vecextb.2 = extractelement <4 x float> %bb, i32 2
893 %3 = tail call fast float @llvm.atan2.f32(float %vecext.2, float %vecextb.2)
894 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
895 %vecext.3 = extractelement <4 x float> %0, i32 3
896 %vecextb.3 = extractelement <4 x float> %bb, i32 3
897 %4 = tail call fast float @llvm.atan2.f32(float %vecext.3, float %vecextb.3)
898 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
899 ret <4 x float> %vecins.3
; Library-call variant: four scalar @sinhf calls, one per lane of a loaded <4 x float>.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vsinhf.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @sinhf calls
; and lanes 2 and 3 to be combined into one @llvm.sinh.v2f32 call.
901 declare float @sinhf(float) readonly nounwind willreturn
902 define <4 x float> @sinh_4x(ptr %a) {
903 ; CHECK-LABEL: @sinh_4x(
905 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
906 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
907 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
909 ; NOACCELERATE-LABEL: @sinh_4x(
910 ; NOACCELERATE-NEXT: entry:
911 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
912 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
913 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinhf(float [[VECEXT]])
914 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
915 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
916 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]])
917 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
918 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
919 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]])
920 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
921 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
922 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
925 %0 = load <4 x float>, ptr %a, align 16
926 %vecext = extractelement <4 x float> %0, i32 0
927 %1 = tail call fast float @sinhf(float %vecext)
928 %vecins = insertelement <4 x float> undef, float %1, i32 0
929 %vecext.1 = extractelement <4 x float> %0, i32 1
930 %2 = tail call fast float @sinhf(float %vecext.1)
931 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
932 %vecext.2 = extractelement <4 x float> %0, i32 2
933 %3 = tail call fast float @sinhf(float %vecext.2)
934 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
935 %vecext.3 = extractelement <4 x float> %0, i32 3
936 %4 = tail call fast float @sinhf(float %vecext.3)
937 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
938 ret <4 x float> %vecins.3
; Intrinsic variant: four scalar @llvm.sinh.f32 calls, one per lane of a loaded <4 x float>.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vsinhf.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @llvm.sinh.f32 calls
; and lanes 2 and 3 to be combined into one @llvm.sinh.v2f32 call.
940 define <4 x float> @int_sinh_4x(ptr %a) {
941 ; CHECK-LABEL: @int_sinh_4x(
943 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
944 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
945 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
947 ; NOACCELERATE-LABEL: @int_sinh_4x(
948 ; NOACCELERATE-NEXT: entry:
949 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
950 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
951 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]])
952 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
953 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
954 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]])
955 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
956 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
957 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]])
958 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
959 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
960 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
963 %0 = load <4 x float>, ptr %a, align 16
964 %vecext = extractelement <4 x float> %0, i32 0
965 %1 = tail call fast float @llvm.sinh.f32(float %vecext)
966 %vecins = insertelement <4 x float> undef, float %1, i32 0
967 %vecext.1 = extractelement <4 x float> %0, i32 1
968 %2 = tail call fast float @llvm.sinh.f32(float %vecext.1)
969 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
970 %vecext.2 = extractelement <4 x float> %0, i32 2
971 %3 = tail call fast float @llvm.sinh.f32(float %vecext.2)
972 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
973 %vecext.3 = extractelement <4 x float> %0, i32 3
974 %4 = tail call fast float @llvm.sinh.f32(float %vecext.3)
975 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
976 ret <4 x float> %vecins.3
; Library-call variant: four scalar @coshf calls, one per lane of a loaded <4 x float>.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vcoshf.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @coshf calls
; and lanes 2 and 3 to be combined into one @llvm.cosh.v2f32 call.
978 declare float @coshf(float) readonly nounwind willreturn
979 define <4 x float> @cosh_4x(ptr %a) {
980 ; CHECK-LABEL: @cosh_4x(
982 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
983 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
984 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
986 ; NOACCELERATE-LABEL: @cosh_4x(
987 ; NOACCELERATE-NEXT: entry:
988 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
989 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
990 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @coshf(float [[VECEXT]])
991 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
992 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
993 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @coshf(float [[VECEXT_1]])
994 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
995 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
996 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]])
997 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
998 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
999 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
1002 %0 = load <4 x float>, ptr %a, align 16
1003 %vecext = extractelement <4 x float> %0, i32 0
1004 %1 = tail call fast float @coshf(float %vecext)
1005 %vecins = insertelement <4 x float> undef, float %1, i32 0
1006 %vecext.1 = extractelement <4 x float> %0, i32 1
1007 %2 = tail call fast float @coshf(float %vecext.1)
1008 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1009 %vecext.2 = extractelement <4 x float> %0, i32 2
1010 %3 = tail call fast float @coshf(float %vecext.2)
1011 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1012 %vecext.3 = extractelement <4 x float> %0, i32 3
1013 %4 = tail call fast float @coshf(float %vecext.3)
1014 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1015 ret <4 x float> %vecins.3
; Intrinsic variant: four scalar @llvm.cosh.f32 calls, one per lane of a loaded <4 x float>.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vcoshf.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @llvm.cosh.f32 calls
; and lanes 2 and 3 to be combined into one @llvm.cosh.v2f32 call.
1017 define <4 x float> @int_cosh_4x(ptr %a) {
1018 ; CHECK-LABEL: @int_cosh_4x(
1019 ; CHECK-NEXT: entry:
1020 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1021 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
1022 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
1024 ; NOACCELERATE-LABEL: @int_cosh_4x(
1025 ; NOACCELERATE-NEXT: entry:
1026 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1027 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1028 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT]])
1029 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1030 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1031 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]])
1032 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1033 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1034 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]])
1035 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1036 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1037 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
1040 %0 = load <4 x float>, ptr %a, align 16
1041 %vecext = extractelement <4 x float> %0, i32 0
1042 %1 = tail call fast float @llvm.cosh.f32(float %vecext)
1043 %vecins = insertelement <4 x float> undef, float %1, i32 0
1044 %vecext.1 = extractelement <4 x float> %0, i32 1
1045 %2 = tail call fast float @llvm.cosh.f32(float %vecext.1)
1046 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1047 %vecext.2 = extractelement <4 x float> %0, i32 2
1048 %3 = tail call fast float @llvm.cosh.f32(float %vecext.2)
1049 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1050 %vecext.3 = extractelement <4 x float> %0, i32 3
1051 %4 = tail call fast float @llvm.cosh.f32(float %vecext.3)
1052 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1053 ret <4 x float> %vecins.3
; Library-call variant: four scalar @tanhf calls, one per lane of a loaded <4 x float>.
; With -vector-library=Accelerate the checks expect a single <4 x float> call to @vtanhf.
; Without that flag the checks expect lanes 0 and 1 to stay scalar @tanhf calls
; and lanes 2 and 3 to be combined into one @llvm.tanh.v2f32 call.
1055 declare float @tanhf(float) readonly nounwind willreturn
1056 define <4 x float> @tanh_4x(ptr %a) {
1057 ; CHECK-LABEL: @tanh_4x(
1058 ; CHECK-NEXT: entry:
1059 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1060 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
1061 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
1063 ; NOACCELERATE-LABEL: @tanh_4x(
1064 ; NOACCELERATE-NEXT: entry:
1065 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1066 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1067 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @tanhf(float [[VECEXT]])
1068 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1069 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1070 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]])
1071 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1072 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1073 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]])
1074 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1075 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1076 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
1079 %0 = load <4 x float>, ptr %a, align 16
1080 %vecext = extractelement <4 x float> %0, i32 0
1081 %1 = tail call fast float @tanhf(float %vecext)
1082 %vecins = insertelement <4 x float> undef, float %1, i32 0
1083 %vecext.1 = extractelement <4 x float> %0, i32 1
1084 %2 = tail call fast float @tanhf(float %vecext.1)
1085 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1086 %vecext.2 = extractelement <4 x float> %0, i32 2
1087 %3 = tail call fast float @tanhf(float %vecext.2)
1088 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1089 %vecext.3 = extractelement <4 x float> %0, i32 3
1090 %4 = tail call fast float @tanhf(float %vecext.3)
1091 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1092 ret <4 x float> %vecins.3
; int_tanh_4x is the intrinsic counterpart of tanh_4x: it applies
; @llvm.tanh.f32 (rather than the libm tanhf) to each lane of a <4 x float>
; loaded from %a. Expected with -vector-library=Accelerate: a single
; <4 x float> call to @vtanhf. Expected with no vector library: lanes 0 and 1
; stay as scalar intrinsic calls and lanes 2 and 3 become one
; @llvm.tanh.v2f32 call.
1094 define <4 x float> @int_tanh_4x(ptr %a) {
1095 ; CHECK-LABEL: @int_tanh_4x(
1096 ; CHECK-NEXT: entry:
1097 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1098 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
1099 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
1101 ; NOACCELERATE-LABEL: @int_tanh_4x(
1102 ; NOACCELERATE-NEXT: entry:
1103 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1104 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1105 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]])
1106 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1107 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1108 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]])
1109 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1110 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1111 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]])
1112 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1113 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1114 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; Scalar input IR: one fast-math @llvm.tanh.f32 call per lane, chained
; through insertelement starting from an undef vector.
1117 %0 = load <4 x float>, ptr %a, align 16
1118 %vecext = extractelement <4 x float> %0, i32 0
1119 %1 = tail call fast float @llvm.tanh.f32(float %vecext)
1120 %vecins = insertelement <4 x float> undef, float %1, i32 0
1121 %vecext.1 = extractelement <4 x float> %0, i32 1
1122 %2 = tail call fast float @llvm.tanh.f32(float %vecext.1)
1123 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1124 %vecext.2 = extractelement <4 x float> %0, i32 2
1125 %3 = tail call fast float @llvm.tanh.f32(float %vecext.2)
1126 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1127 %vecext.3 = extractelement <4 x float> %0, i32 3
1128 %4 = tail call fast float @llvm.tanh.f32(float %vecext.3)
1129 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1130 ret <4 x float> %vecins.3
1132 declare float @asinhf(float) readonly nounwind willreturn
; asinh_4x applies the libm function asinhf() to each of the four lanes of a
; <4 x float> loaded from %a. Expected with -vector-library=Accelerate: a
; single <4 x float> call to @vasinhf. Expected with no vector library: the
; IR is left fully scalar (all four asinhf() calls remain, with no partial
; 2-wide vectorization).
; NOTE(review): presumably nothing is widened here because asinhf has no
; vectorizable intrinsic form - confirm.
1133 define <4 x float> @asinh_4x(ptr %a) {
1134 ; CHECK-LABEL: @asinh_4x(
1135 ; CHECK-NEXT: entry:
1136 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1137 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinhf(<4 x float> [[TMP0]])
1138 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
1140 ; NOACCELERATE-LABEL: @asinh_4x(
1141 ; NOACCELERATE-NEXT: entry:
1142 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1143 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1144 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @asinhf(float [[VECEXT]])
1145 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1146 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1147 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @asinhf(float [[VECEXT_1]])
1148 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1149 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
1150 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @asinhf(float [[VECEXT_2]])
1151 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
1152 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
1153 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @asinhf(float [[VECEXT_3]])
1154 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
1155 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Scalar input IR: one fast-math asinhf() call per lane, chained through
; insertelement starting from an undef vector.
1158 %0 = load <4 x float>, ptr %a, align 16
1159 %vecext = extractelement <4 x float> %0, i32 0
1160 %1 = tail call fast float @asinhf(float %vecext)
1161 %vecins = insertelement <4 x float> undef, float %1, i32 0
1162 %vecext.1 = extractelement <4 x float> %0, i32 1
1163 %2 = tail call fast float @asinhf(float %vecext.1)
1164 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1165 %vecext.2 = extractelement <4 x float> %0, i32 2
1166 %3 = tail call fast float @asinhf(float %vecext.2)
1167 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1168 %vecext.3 = extractelement <4 x float> %0, i32 3
1169 %4 = tail call fast float @asinhf(float %vecext.3)
1170 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1171 ret <4 x float> %vecins.3
1173 declare float @acoshf(float) readonly nounwind willreturn
; acosh_4x applies the libm function acoshf() to each of the four lanes of a
; <4 x float> loaded from %a. Expected with -vector-library=Accelerate: a
; single <4 x float> call to @vacoshf. Expected with no vector library: the
; IR is left fully scalar (all four acoshf() calls remain, with no partial
; 2-wide vectorization).
; NOTE(review): presumably nothing is widened here because acoshf has no
; vectorizable intrinsic form - confirm.
1174 define <4 x float> @acosh_4x(ptr %a) {
1175 ; CHECK-LABEL: @acosh_4x(
1176 ; CHECK-NEXT: entry:
1177 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1178 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacoshf(<4 x float> [[TMP0]])
1179 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
1181 ; NOACCELERATE-LABEL: @acosh_4x(
1182 ; NOACCELERATE-NEXT: entry:
1183 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1184 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1185 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @acoshf(float [[VECEXT]])
1186 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1187 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1188 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @acoshf(float [[VECEXT_1]])
1189 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1190 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
1191 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @acoshf(float [[VECEXT_2]])
1192 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
1193 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
1194 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @acoshf(float [[VECEXT_3]])
1195 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
1196 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Scalar input IR: one fast-math acoshf() call per lane, chained through
; insertelement starting from an undef vector.
1199 %0 = load <4 x float>, ptr %a, align 16
1200 %vecext = extractelement <4 x float> %0, i32 0
1201 %1 = tail call fast float @acoshf(float %vecext)
1202 %vecins = insertelement <4 x float> undef, float %1, i32 0
1203 %vecext.1 = extractelement <4 x float> %0, i32 1
1204 %2 = tail call fast float @acoshf(float %vecext.1)
1205 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1206 %vecext.2 = extractelement <4 x float> %0, i32 2
1207 %3 = tail call fast float @acoshf(float %vecext.2)
1208 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1209 %vecext.3 = extractelement <4 x float> %0, i32 3
1210 %4 = tail call fast float @acoshf(float %vecext.3)
1211 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1212 ret <4 x float> %vecins.3
1214 declare float @atanhf(float) readonly nounwind willreturn
; atanh_4x applies the libm function atanhf() to each of the four lanes of a
; <4 x float> loaded from %a. Expected with -vector-library=Accelerate: a
; single <4 x float> call to @vatanhf. Expected with no vector library: the
; IR is left fully scalar (all four atanhf() calls remain, with no partial
; 2-wide vectorization).
; NOTE(review): presumably nothing is widened here because atanhf has no
; vectorizable intrinsic form - confirm.
1215 define <4 x float> @atanh_4x(ptr %a) {
1216 ; CHECK-LABEL: @atanh_4x(
1217 ; CHECK-NEXT: entry:
1218 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1219 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanhf(<4 x float> [[TMP0]])
1220 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
1222 ; NOACCELERATE-LABEL: @atanh_4x(
1223 ; NOACCELERATE-NEXT: entry:
1224 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1225 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1226 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atanhf(float [[VECEXT]])
1227 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1228 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1229 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atanhf(float [[VECEXT_1]])
1230 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1231 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
1232 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atanhf(float [[VECEXT_2]])
1233 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
1234 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
1235 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atanhf(float [[VECEXT_3]])
1236 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
1237 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Scalar input IR: one fast-math atanhf() call per lane, chained through
; insertelement starting from an undef vector.
1240 %0 = load <4 x float>, ptr %a, align 16
1241 %vecext = extractelement <4 x float> %0, i32 0
1242 %1 = tail call fast float @atanhf(float %vecext)
1243 %vecins = insertelement <4 x float> undef, float %1, i32 0
1244 %vecext.1 = extractelement <4 x float> %0, i32 1
1245 %2 = tail call fast float @atanhf(float %vecext.1)
1246 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1247 %vecext.2 = extractelement <4 x float> %0, i32 2
1248 %3 = tail call fast float @atanhf(float %vecext.2)
1249 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1250 %vecext.3 = extractelement <4 x float> %0, i32 3
1251 %4 = tail call fast float @atanhf(float %vecext.3)
1252 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1253 ret <4 x float> %vecins.3
1256 ; Accelerate *does not* provide sin() for <2 x float>.
; sin_2x is the negative case: it applies @llvm.sin.f32 to both lanes of a
; <2 x float> loaded from %a. Both runs are expected to leave the two scalar
; intrinsic calls in place. The only difference is that with
; -vector-library=Accelerate the calls carry an attribute group (captured as
; ATTR2 below).
; NOTE(review): presumably that group holds the mapping attribute added by
; inject-tli-mappings - confirm.
1257 define <2 x float> @sin_2x(ptr %a) {
1258 ; CHECK-LABEL: @sin_2x(
1259 ; CHECK-NEXT: entry:
1260 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1261 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1262 ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]]
1263 ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1264 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1265 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]]
1266 ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1267 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
1269 ; NOACCELERATE-LABEL: @sin_2x(
1270 ; NOACCELERATE-NEXT: entry:
1271 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1272 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1273 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
1274 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1275 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1276 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
1277 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1278 ; NOACCELERATE-NEXT: ret <2 x float> [[VECINS_1]]
; Scalar input IR: one fast-math @llvm.sin.f32 call per lane of the 2-wide
; vector, chained through insertelement starting from an undef vector.
1281 %0 = load <2 x float>, ptr %a, align 16
1282 %vecext = extractelement <2 x float> %0, i32 0
1283 %1 = tail call fast float @llvm.sin.f32(float %vecext)
1284 %vecins = insertelement <2 x float> undef, float %1, i32 0
1285 %vecext.1 = extractelement <2 x float> %0, i32 1
1286 %2 = tail call fast float @llvm.sin.f32(float %vecext.1)
1287 %vecins.1 = insertelement <2 x float> %vecins, float %2, i32 1
1288 ret <2 x float> %vecins.1
1292 declare float @llvm.cos.f32(float)
1294 ; Accelerate provides cos() for <4 x float>
; int_cos_4x applies @llvm.cos.f32 to each of the four lanes of a <4 x float>
; loaded from %a. Expected with -vector-library=Accelerate: a single
; <4 x float> call to @vcosf. Expected with no vector library: lanes 0 and 1
; stay as scalar intrinsic calls, while lanes 2 and 3 are combined into one
; @llvm.cos.v2f32 call whose result is shuffled back into the 4-wide vector.
1295 define <4 x float> @int_cos_4x(ptr %a) {
1296 ; CHECK-LABEL: @int_cos_4x(
1297 ; CHECK-NEXT: entry:
1298 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1299 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
1300 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
1302 ; NOACCELERATE-LABEL: @int_cos_4x(
1303 ; NOACCELERATE-NEXT: entry:
1304 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1305 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1306 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
1307 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1308 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1309 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
1310 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1311 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1312 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
1313 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1314 ; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1315 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
; Scalar input IR: one fast-math @llvm.cos.f32 call per lane, chained through
; insertelement starting from an undef vector.
1318 %0 = load <4 x float>, ptr %a, align 16
1319 %vecext = extractelement <4 x float> %0, i32 0
1320 %1 = tail call fast float @llvm.cos.f32(float %vecext)
1321 %vecins = insertelement <4 x float> undef, float %1, i32 0
1322 %vecext.1 = extractelement <4 x float> %0, i32 1
1323 %2 = tail call fast float @llvm.cos.f32(float %vecext.1)
1324 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1325 %vecext.2 = extractelement <4 x float> %0, i32 2
1326 %3 = tail call fast float @llvm.cos.f32(float %vecext.2)
1327 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1328 %vecext.3 = extractelement <4 x float> %0, i32 3
1329 %4 = tail call fast float @llvm.cos.f32(float %vecext.3)
1330 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1331 ret <4 x float> %vecins.3
1334 ; Accelerate *does not* provide cos() for <2 x float>.
; cos_2x is the negative case: it applies @llvm.cos.f32 to both lanes of a
; <2 x float> loaded from %a. Both runs are expected to leave the two scalar
; intrinsic calls in place. The only difference is that with
; -vector-library=Accelerate the calls carry an attribute group (captured as
; ATTR3 below).
; NOTE(review): presumably that group holds the mapping attribute added by
; inject-tli-mappings - confirm.
1335 define <2 x float> @cos_2x(ptr %a) {
1336 ; CHECK-LABEL: @cos_2x(
1337 ; CHECK-NEXT: entry:
1338 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1339 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1340 ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
1341 ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1342 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1343 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]]
1344 ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1345 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
1347 ; NOACCELERATE-LABEL: @cos_2x(
1348 ; NOACCELERATE-NEXT: entry:
1349 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1350 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1351 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
1352 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1353 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1354 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
1355 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1356 ; NOACCELERATE-NEXT: ret <2 x float> [[VECINS_1]]
; Scalar input IR: one fast-math @llvm.cos.f32 call per lane of the 2-wide
; vector, chained through insertelement starting from an undef vector.
1359 %0 = load <2 x float>, ptr %a, align 16
1360 %vecext = extractelement <2 x float> %0, i32 0
1361 %1 = tail call fast float @llvm.cos.f32(float %vecext)
1362 %vecins = insertelement <2 x float> undef, float %1, i32 0
1363 %vecext.1 = extractelement <2 x float> %0, i32 1
1364 %2 = tail call fast float @llvm.cos.f32(float %vecext.1)
1365 %vecins.1 = insertelement <2 x float> %vecins, float %2, i32 1
1366 ret <2 x float> %vecins.1