1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -inject-tli-mappings -slp-vectorizer -vector-library=Accelerate -S %s | FileCheck %s
3 ; RUN: opt -inject-tli-mappings -slp-vectorizer -S %s | FileCheck --check-prefix NOACCELERATE %s
5 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
6 target triple = "arm64-apple-ios14.0.0"
8 declare float @llvm.sin.f32(float)
10 ; Accelerate provides sin() for <4 x float>
; Input: a <4 x float> is loaded from %a, each lane is extracted and passed to
; a scalar `tail call fast` of the llvm.sin.f32 intrinsic, and the four results
; are reassembled with insertelement.
; Run with -vector-library=Accelerate: the assertions expect a single call to
; @vsinf on the loaded vector.
; Run without it: the assertions expect lanes 1 and 2 to be combined into one
; @llvm.sin.v2f32 call, while lanes 0 and 3 stay scalar intrinsic calls.
11 define <4 x float> @int_sin_4x(<4 x float>* %a) {
12 ; CHECK-LABEL: @int_sin_4x(
14 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
15 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
16 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
18 ; NOACCELERATE-LABEL: @int_sin_4x(
19 ; NOACCELERATE-NEXT: entry:
20 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
21 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
22 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
23 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
24 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
25 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
26 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
27 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
28 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
29 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
30 ; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
31 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
32 ; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
33 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
34 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
37 %0 = load <4 x float>, <4 x float>* %a, align 16
38 %vecext = extractelement <4 x float> %0, i32 0
39 %1 = tail call fast float @llvm.sin.f32(float %vecext)
40 %vecins = insertelement <4 x float> poison, float %1, i32 0
41 %vecext.1 = extractelement <4 x float> %0, i32 1
42 %2 = tail call fast float @llvm.sin.f32(float %vecext.1)
43 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
44 %vecext.2 = extractelement <4 x float> %0, i32 2
45 %3 = tail call fast float @llvm.sin.f32(float %vecext.2)
46 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
47 %vecext.3 = extractelement <4 x float> %0, i32 3
48 %4 = tail call fast float @llvm.sin.f32(float %vecext.3)
49 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
50 ret <4 x float> %vecins.3
53 declare float @ceilf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @ceilf; the results are reassembled with insertelement.
; Both runs (with and without -vector-library=Accelerate) carry the same
; expectation: one call to the @llvm.ceil.v4f32 intrinsic on the loaded vector.
55 define <4 x float> @ceil_4x(<4 x float>* %a) {
56 ; CHECK-LABEL: @ceil_4x(
58 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
59 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
60 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
62 ; NOACCELERATE-LABEL: @ceil_4x(
63 ; NOACCELERATE-NEXT: entry:
64 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
65 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
66 ; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
69 %0 = load <4 x float>, <4 x float>* %a, align 16
70 %vecext = extractelement <4 x float> %0, i32 0
71 %1 = tail call fast float @ceilf(float %vecext)
72 %vecins = insertelement <4 x float> poison, float %1, i32 0
73 %vecext.1 = extractelement <4 x float> %0, i32 1
74 %2 = tail call fast float @ceilf(float %vecext.1)
75 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
76 %vecext.2 = extractelement <4 x float> %0, i32 2
77 %3 = tail call fast float @ceilf(float %vecext.2)
78 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
79 %vecext.3 = extractelement <4 x float> %0, i32 3
80 %4 = tail call fast float @ceilf(float %vecext.3)
81 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
82 ret <4 x float> %vecins.3
85 declare float @fabsf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @fabsf; the results are reassembled with insertelement.
; Both runs (with and without -vector-library=Accelerate) carry the same
; expectation: one call to the @llvm.fabs.v4f32 intrinsic on the loaded vector.
87 define <4 x float> @fabs_4x(<4 x float>* %a) {
88 ; CHECK-LABEL: @fabs_4x(
90 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
91 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
92 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
94 ; NOACCELERATE-LABEL: @fabs_4x(
95 ; NOACCELERATE-NEXT: entry:
96 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
97 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
98 ; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
101 %0 = load <4 x float>, <4 x float>* %a, align 16
102 %vecext = extractelement <4 x float> %0, i32 0
103 %1 = tail call fast float @fabsf(float %vecext)
104 %vecins = insertelement <4 x float> poison, float %1, i32 0
105 %vecext.1 = extractelement <4 x float> %0, i32 1
106 %2 = tail call fast float @fabsf(float %vecext.1)
107 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
108 %vecext.2 = extractelement <4 x float> %0, i32 2
109 %3 = tail call fast float @fabsf(float %vecext.2)
110 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
111 %vecext.3 = extractelement <4 x float> %0, i32 3
112 %4 = tail call fast float @fabsf(float %vecext.3)
113 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
114 ret <4 x float> %vecins.3
116 declare float @llvm.fabs.f32(float)
; Input: same lane-by-lane shape as the other tests in this file, but the
; scalar callee is the llvm.fabs.f32 intrinsic rather than a library function.
; Both runs (with and without -vector-library=Accelerate) carry the same
; expectation: one call to the @llvm.fabs.v4f32 intrinsic on the loaded vector.
117 define <4 x float> @int_fabs_4x(<4 x float>* %a) {
118 ; CHECK-LABEL: @int_fabs_4x(
120 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
121 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
122 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
124 ; NOACCELERATE-LABEL: @int_fabs_4x(
125 ; NOACCELERATE-NEXT: entry:
126 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
127 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
128 ; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
131 %0 = load <4 x float>, <4 x float>* %a, align 16
132 %vecext = extractelement <4 x float> %0, i32 0
133 %1 = tail call fast float @llvm.fabs.f32(float %vecext)
134 %vecins = insertelement <4 x float> poison, float %1, i32 0
135 %vecext.1 = extractelement <4 x float> %0, i32 1
136 %2 = tail call fast float @llvm.fabs.f32(float %vecext.1)
137 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
138 %vecext.2 = extractelement <4 x float> %0, i32 2
139 %3 = tail call fast float @llvm.fabs.f32(float %vecext.2)
140 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
141 %vecext.3 = extractelement <4 x float> %0, i32 3
142 %4 = tail call fast float @llvm.fabs.f32(float %vecext.3)
143 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
144 ret <4 x float> %vecins.3
146 declare float @floorf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @floorf; the results are reassembled with insertelement.
; Both runs (with and without -vector-library=Accelerate) carry the same
; expectation: one call to the @llvm.floor.v4f32 intrinsic on the loaded vector.
147 define <4 x float> @floor_4x(<4 x float>* %a) {
148 ; CHECK-LABEL: @floor_4x(
150 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
151 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
152 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
154 ; NOACCELERATE-LABEL: @floor_4x(
155 ; NOACCELERATE-NEXT: entry:
156 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
157 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
158 ; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
161 %0 = load <4 x float>, <4 x float>* %a, align 16
162 %vecext = extractelement <4 x float> %0, i32 0
163 %1 = tail call fast float @floorf(float %vecext)
164 %vecins = insertelement <4 x float> poison, float %1, i32 0
165 %vecext.1 = extractelement <4 x float> %0, i32 1
166 %2 = tail call fast float @floorf(float %vecext.1)
167 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
168 %vecext.2 = extractelement <4 x float> %0, i32 2
169 %3 = tail call fast float @floorf(float %vecext.2)
170 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
171 %vecext.3 = extractelement <4 x float> %0, i32 3
172 %4 = tail call fast float @floorf(float %vecext.3)
173 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
174 ret <4 x float> %vecins.3
176 declare float @sqrtf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @sqrtf; the results are reassembled with insertelement.
; Both runs (with and without -vector-library=Accelerate) carry the same
; expectation: one call to the @llvm.sqrt.v4f32 intrinsic on the loaded vector.
177 define <4 x float> @sqrt_4x(<4 x float>* %a) {
178 ; CHECK-LABEL: @sqrt_4x(
180 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
181 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
182 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
184 ; NOACCELERATE-LABEL: @sqrt_4x(
185 ; NOACCELERATE-NEXT: entry:
186 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
187 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
188 ; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]]
191 %0 = load <4 x float>, <4 x float>* %a, align 16
192 %vecext = extractelement <4 x float> %0, i32 0
193 %1 = tail call fast float @sqrtf(float %vecext)
194 %vecins = insertelement <4 x float> poison, float %1, i32 0
195 %vecext.1 = extractelement <4 x float> %0, i32 1
196 %2 = tail call fast float @sqrtf(float %vecext.1)
197 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
198 %vecext.2 = extractelement <4 x float> %0, i32 2
199 %3 = tail call fast float @sqrtf(float %vecext.2)
200 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
201 %vecext.3 = extractelement <4 x float> %0, i32 3
202 %4 = tail call fast float @sqrtf(float %vecext.3)
203 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
204 ret <4 x float> %vecins.3
206 declare float @expf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @expf; the results are reassembled with insertelement.
; Run with -vector-library=Accelerate: the assertions expect a single call to
; @vexpf on the loaded vector.
; Run without it: the assertions expect lanes 1 and 2 to be combined into one
; @llvm.exp.v2f32 call, while lanes 0 and 3 stay scalar @expf calls.
207 define <4 x float> @exp_4x(<4 x float>* %a) {
208 ; CHECK-LABEL: @exp_4x(
210 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
211 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpf(<4 x float> [[TMP0]])
212 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
214 ; NOACCELERATE-LABEL: @exp_4x(
215 ; NOACCELERATE-NEXT: entry:
216 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
217 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
218 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
219 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
220 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
221 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
222 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
223 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
224 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
225 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
226 ; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
227 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
228 ; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
229 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
230 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
233 %0 = load <4 x float>, <4 x float>* %a, align 16
234 %vecext = extractelement <4 x float> %0, i32 0
235 %1 = tail call fast float @expf(float %vecext)
236 %vecins = insertelement <4 x float> poison, float %1, i32 0
237 %vecext.1 = extractelement <4 x float> %0, i32 1
238 %2 = tail call fast float @expf(float %vecext.1)
239 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
240 %vecext.2 = extractelement <4 x float> %0, i32 2
241 %3 = tail call fast float @expf(float %vecext.2)
242 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
243 %vecext.3 = extractelement <4 x float> %0, i32 3
244 %4 = tail call fast float @expf(float %vecext.3)
245 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
246 ret <4 x float> %vecins.3
248 declare float @expm1f(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @expm1f; the results are reassembled with insertelement.
; Run with -vector-library=Accelerate: the assertions expect a single call to
; @vexpm1f on the loaded vector.
; Run without it: the assertions expect the four scalar @expm1f calls to be
; left as they are in the input.
249 define <4 x float> @expm1_4x(<4 x float>* %a) {
250 ; CHECK-LABEL: @expm1_4x(
252 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
253 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpm1f(<4 x float> [[TMP0]])
254 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
256 ; NOACCELERATE-LABEL: @expm1_4x(
257 ; NOACCELERATE-NEXT: entry:
258 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
259 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
260 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expm1f(float [[VECEXT]])
261 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
262 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
263 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expm1f(float [[VECEXT_1]])
264 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
265 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
266 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @expm1f(float [[VECEXT_2]])
267 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
268 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
269 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @expm1f(float [[VECEXT_3]])
270 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
271 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
274 %0 = load <4 x float>, <4 x float>* %a, align 16
275 %vecext = extractelement <4 x float> %0, i32 0
276 %1 = tail call fast float @expm1f(float %vecext)
277 %vecins = insertelement <4 x float> poison, float %1, i32 0
278 %vecext.1 = extractelement <4 x float> %0, i32 1
279 %2 = tail call fast float @expm1f(float %vecext.1)
280 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
281 %vecext.2 = extractelement <4 x float> %0, i32 2
282 %3 = tail call fast float @expm1f(float %vecext.2)
283 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
284 %vecext.3 = extractelement <4 x float> %0, i32 3
285 %4 = tail call fast float @expm1f(float %vecext.3)
286 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
287 ret <4 x float> %vecins.3
289 declare float @logf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @logf; the results are reassembled with insertelement.
; Run with -vector-library=Accelerate: the assertions expect a single call to
; @vlogf on the loaded vector.
; Run without it: the assertions expect lanes 1 and 2 to be combined into one
; @llvm.log.v2f32 call, while lanes 0 and 3 stay scalar @logf calls.
290 define <4 x float> @log_4x(<4 x float>* %a) {
291 ; CHECK-LABEL: @log_4x(
293 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
294 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogf(<4 x float> [[TMP0]])
295 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
297 ; NOACCELERATE-LABEL: @log_4x(
298 ; NOACCELERATE-NEXT: entry:
299 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
300 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
301 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
302 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
303 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
304 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
305 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
306 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
307 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
308 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
309 ; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
310 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
311 ; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
312 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
313 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
316 %0 = load <4 x float>, <4 x float>* %a, align 16
317 %vecext = extractelement <4 x float> %0, i32 0
318 %1 = tail call fast float @logf(float %vecext)
319 %vecins = insertelement <4 x float> poison, float %1, i32 0
320 %vecext.1 = extractelement <4 x float> %0, i32 1
321 %2 = tail call fast float @logf(float %vecext.1)
322 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
323 %vecext.2 = extractelement <4 x float> %0, i32 2
324 %3 = tail call fast float @logf(float %vecext.2)
325 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
326 %vecext.3 = extractelement <4 x float> %0, i32 3
327 %4 = tail call fast float @logf(float %vecext.3)
328 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
329 ret <4 x float> %vecins.3
331 declare float @log1pf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @log1pf; the results are reassembled with insertelement.
; Run with -vector-library=Accelerate: the assertions expect a single call to
; @vlog1pf on the loaded vector.
; Run without it: the assertions expect the four scalar @log1pf calls to be
; left as they are in the input.
332 define <4 x float> @log1p_4x(<4 x float>* %a) {
333 ; CHECK-LABEL: @log1p_4x(
335 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
336 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlog1pf(<4 x float> [[TMP0]])
337 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
339 ; NOACCELERATE-LABEL: @log1p_4x(
340 ; NOACCELERATE-NEXT: entry:
341 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
342 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
343 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @log1pf(float [[VECEXT]])
344 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
345 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
346 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @log1pf(float [[VECEXT_1]])
347 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
348 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
349 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @log1pf(float [[VECEXT_2]])
350 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
351 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
352 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @log1pf(float [[VECEXT_3]])
353 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
354 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
357 %0 = load <4 x float>, <4 x float>* %a, align 16
358 %vecext = extractelement <4 x float> %0, i32 0
359 %1 = tail call fast float @log1pf(float %vecext)
360 %vecins = insertelement <4 x float> poison, float %1, i32 0
361 %vecext.1 = extractelement <4 x float> %0, i32 1
362 %2 = tail call fast float @log1pf(float %vecext.1)
363 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
364 %vecext.2 = extractelement <4 x float> %0, i32 2
365 %3 = tail call fast float @log1pf(float %vecext.2)
366 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
367 %vecext.3 = extractelement <4 x float> %0, i32 3
368 %4 = tail call fast float @log1pf(float %vecext.3)
369 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
370 ret <4 x float> %vecins.3
372 declare float @log10pf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @log10pf; the results are reassembled with insertelement.
; Both runs (with and without -vector-library=Accelerate) expect the four
; scalar @log10pf calls to be left as they are in the input.
; NOTE(review): presumably @log10pf is a deliberately unrecognized name that
; serves as a negative test - confirm. If @log10f was intended instead, the
; assertions below would need to be regenerated.
373 define <4 x float> @log10p_4x(<4 x float>* %a) {
374 ; CHECK-LABEL: @log10p_4x(
376 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
377 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
378 ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
379 ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
380 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
381 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
382 ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
383 ; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
384 ; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
385 ; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
386 ; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
387 ; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
388 ; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
389 ; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
391 ; NOACCELERATE-LABEL: @log10p_4x(
392 ; NOACCELERATE-NEXT: entry:
393 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
394 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
395 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
396 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
397 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
398 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
399 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
400 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
401 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
402 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
403 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
404 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
405 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
406 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
409 %0 = load <4 x float>, <4 x float>* %a, align 16
410 %vecext = extractelement <4 x float> %0, i32 0
411 %1 = tail call fast float @log10pf(float %vecext)
412 %vecins = insertelement <4 x float> poison, float %1, i32 0
413 %vecext.1 = extractelement <4 x float> %0, i32 1
414 %2 = tail call fast float @log10pf(float %vecext.1)
415 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
416 %vecext.2 = extractelement <4 x float> %0, i32 2
417 %3 = tail call fast float @log10pf(float %vecext.2)
418 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
419 %vecext.3 = extractelement <4 x float> %0, i32 3
420 %4 = tail call fast float @log10pf(float %vecext.3)
421 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
422 ret <4 x float> %vecins.3
424 declare float @logbf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @logbf; the results are reassembled with insertelement.
; Run with -vector-library=Accelerate: the assertions expect a single call to
; @vlogbf on the loaded vector.
; Run without it: the assertions expect the four scalar @logbf calls to be
; left as they are in the input.
425 define <4 x float> @logb_4x(<4 x float>* %a) {
426 ; CHECK-LABEL: @logb_4x(
428 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
429 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogbf(<4 x float> [[TMP0]])
430 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
432 ; NOACCELERATE-LABEL: @logb_4x(
433 ; NOACCELERATE-NEXT: entry:
434 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
435 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
436 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logbf(float [[VECEXT]])
437 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
438 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
439 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logbf(float [[VECEXT_1]])
440 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
441 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
442 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @logbf(float [[VECEXT_2]])
443 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
444 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
445 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @logbf(float [[VECEXT_3]])
446 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
447 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
450 %0 = load <4 x float>, <4 x float>* %a, align 16
451 %vecext = extractelement <4 x float> %0, i32 0
452 %1 = tail call fast float @logbf(float %vecext)
453 %vecins = insertelement <4 x float> poison, float %1, i32 0
454 %vecext.1 = extractelement <4 x float> %0, i32 1
455 %2 = tail call fast float @logbf(float %vecext.1)
456 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
457 %vecext.2 = extractelement <4 x float> %0, i32 2
458 %3 = tail call fast float @logbf(float %vecext.2)
459 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
460 %vecext.3 = extractelement <4 x float> %0, i32 3
461 %4 = tail call fast float @logbf(float %vecext.3)
462 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
463 ret <4 x float> %vecins.3
465 declare float @sinf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @sinf; the results are reassembled with insertelement.
; Run with -vector-library=Accelerate: the assertions expect a single call to
; @vsinf on the loaded vector.
; Run without it: the assertions expect lanes 1 and 2 to be combined into one
; @llvm.sin.v2f32 call, while lanes 0 and 3 stay scalar @sinf calls.
466 define <4 x float> @sin_4x(<4 x float>* %a) {
467 ; CHECK-LABEL: @sin_4x(
469 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
470 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
471 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
473 ; NOACCELERATE-LABEL: @sin_4x(
474 ; NOACCELERATE-NEXT: entry:
475 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
476 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
477 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
478 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
479 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
480 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
481 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
482 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
483 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
484 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
485 ; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
486 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
487 ; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
488 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
489 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
492 %0 = load <4 x float>, <4 x float>* %a, align 16
493 %vecext = extractelement <4 x float> %0, i32 0
494 %1 = tail call fast float @sinf(float %vecext)
495 %vecins = insertelement <4 x float> poison, float %1, i32 0
496 %vecext.1 = extractelement <4 x float> %0, i32 1
497 %2 = tail call fast float @sinf(float %vecext.1)
498 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
499 %vecext.2 = extractelement <4 x float> %0, i32 2
500 %3 = tail call fast float @sinf(float %vecext.2)
501 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
502 %vecext.3 = extractelement <4 x float> %0, i32 3
503 %4 = tail call fast float @sinf(float %vecext.3)
504 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
505 ret <4 x float> %vecins.3
507 declare float @cosf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @cosf; the results are reassembled with insertelement.
; Run with -vector-library=Accelerate: the assertions expect a single call to
; @vcosf on the loaded vector.
; Run without it: the assertions expect lanes 1 and 2 to be combined into one
; @llvm.cos.v2f32 call, while lanes 0 and 3 stay scalar @cosf calls.
508 define <4 x float> @cos_4x(<4 x float>* %a) {
509 ; CHECK-LABEL: @cos_4x(
511 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
512 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
513 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
515 ; NOACCELERATE-LABEL: @cos_4x(
516 ; NOACCELERATE-NEXT: entry:
517 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
518 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
519 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
520 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
521 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
522 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
523 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
524 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
525 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
526 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
527 ; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
528 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
529 ; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
530 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
531 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
534 %0 = load <4 x float>, <4 x float>* %a, align 16
535 %vecext = extractelement <4 x float> %0, i32 0
536 %1 = tail call fast float @cosf(float %vecext)
537 %vecins = insertelement <4 x float> poison, float %1, i32 0
538 %vecext.1 = extractelement <4 x float> %0, i32 1
539 %2 = tail call fast float @cosf(float %vecext.1)
540 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
541 %vecext.2 = extractelement <4 x float> %0, i32 2
542 %3 = tail call fast float @cosf(float %vecext.2)
543 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
544 %vecext.3 = extractelement <4 x float> %0, i32 3
545 %4 = tail call fast float @cosf(float %vecext.3)
546 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
547 ret <4 x float> %vecins.3
549 declare float @tanf(float) readonly
; Input: a <4 x float> is loaded from %a and each lane is passed to a scalar
; `tail call fast` of @tanf; the results are reassembled with insertelement.
; Run with -vector-library=Accelerate: the assertions expect a single call to
; @vtanf on the loaded vector.
; Run without it: the assertions expect the four scalar @tanf calls to be
; left as they are in the input.
550 define <4 x float> @tan_4x(<4 x float>* %a) {
551 ; CHECK-LABEL: @tan_4x(
553 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
554 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanf(<4 x float> [[TMP0]])
555 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
557 ; NOACCELERATE-LABEL: @tan_4x(
558 ; NOACCELERATE-NEXT: entry:
559 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
560 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
561 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @tanf(float [[VECEXT]])
562 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
563 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
564 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
565 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
566 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
567 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
568 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
569 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
570 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
571 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
572 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
575 %0 = load <4 x float>, <4 x float>* %a, align 16
576 %vecext = extractelement <4 x float> %0, i32 0
577 %1 = tail call fast float @tanf(float %vecext)
578 %vecins = insertelement <4 x float> poison, float %1, i32 0
579 %vecext.1 = extractelement <4 x float> %0, i32 1
580 %2 = tail call fast float @tanf(float %vecext.1)
581 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
582 %vecext.2 = extractelement <4 x float> %0, i32 2
583 %3 = tail call fast float @tanf(float %vecext.2)
584 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
585 %vecext.3 = extractelement <4 x float> %0, i32 3
586 %4 = tail call fast float @tanf(float %vecext.3)
587 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
588 ret <4 x float> %vecins.3
590 declare float @asinf(float) readonly
; Accelerate provides asinf() for <4 x float>: the Accelerate run expects the
; four scalar calls to be replaced by one @vasinf call, while the run without a
; vector library expects all four scalar @asinf calls to remain.
591 define <4 x float> @asin_4x(<4 x float>* %a) {
592 ; CHECK-LABEL: @asin_4x(
594 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
595 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
596 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
598 ; NOACCELERATE-LABEL: @asin_4x(
599 ; NOACCELERATE-NEXT: entry:
600 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
601 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
602 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @asinf(float [[VECEXT]])
603 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
604 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
605 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]])
606 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
607 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
608 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @asinf(float [[VECEXT_2]])
609 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
610 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
611 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @asinf(float [[VECEXT_3]])
612 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
613 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: one fast-math scalar @asinf call per lane of the loaded vector,
; with each result inserted back into the same lane.
616 %0 = load <4 x float>, <4 x float>* %a, align 16
617 %vecext = extractelement <4 x float> %0, i32 0
618 %1 = tail call fast float @asinf(float %vecext)
619 %vecins = insertelement <4 x float> poison, float %1, i32 0
620 %vecext.1 = extractelement <4 x float> %0, i32 1
621 %2 = tail call fast float @asinf(float %vecext.1)
622 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
623 %vecext.2 = extractelement <4 x float> %0, i32 2
624 %3 = tail call fast float @asinf(float %vecext.2)
625 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
626 %vecext.3 = extractelement <4 x float> %0, i32 3
627 %4 = tail call fast float @asinf(float %vecext.3)
628 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
629 ret <4 x float> %vecins.3
631 declare float @acosf(float) readonly
; Accelerate provides acosf() for <4 x float>: the Accelerate run expects the
; four scalar calls to be replaced by one @vacosf call, while the run without a
; vector library expects all four scalar @acosf calls to remain.
632 define <4 x float> @acos_4x(<4 x float>* %a) {
633 ; CHECK-LABEL: @acos_4x(
635 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
636 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
637 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
639 ; NOACCELERATE-LABEL: @acos_4x(
640 ; NOACCELERATE-NEXT: entry:
641 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
642 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
643 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @acosf(float [[VECEXT]])
644 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
645 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
646 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
647 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
648 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
649 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]])
650 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
651 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
652 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]])
653 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
654 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: one fast-math scalar @acosf call per lane of the loaded vector,
; with each result inserted back into the same lane.
657 %0 = load <4 x float>, <4 x float>* %a, align 16
658 %vecext = extractelement <4 x float> %0, i32 0
659 %1 = tail call fast float @acosf(float %vecext)
660 %vecins = insertelement <4 x float> poison, float %1, i32 0
661 %vecext.1 = extractelement <4 x float> %0, i32 1
662 %2 = tail call fast float @acosf(float %vecext.1)
663 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
664 %vecext.2 = extractelement <4 x float> %0, i32 2
665 %3 = tail call fast float @acosf(float %vecext.2)
666 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
667 %vecext.3 = extractelement <4 x float> %0, i32 3
668 %4 = tail call fast float @acosf(float %vecext.3)
669 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
670 ret <4 x float> %vecins.3
672 declare float @atanf(float) readonly
; Accelerate provides atanf() for <4 x float>: the Accelerate run expects the
; four scalar calls to be replaced by one @vatanf call, while the run without a
; vector library expects all four scalar @atanf calls to remain.
673 define <4 x float> @atan_4x(<4 x float>* %a) {
674 ; CHECK-LABEL: @atan_4x(
676 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
677 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
678 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
680 ; NOACCELERATE-LABEL: @atan_4x(
681 ; NOACCELERATE-NEXT: entry:
682 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
683 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
684 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atanf(float [[VECEXT]])
685 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
686 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
687 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]])
688 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
689 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
690 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]])
691 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
692 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
693 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]])
694 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
695 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: one fast-math scalar @atanf call per lane of the loaded vector,
; with each result inserted back into the same lane.
698 %0 = load <4 x float>, <4 x float>* %a, align 16
699 %vecext = extractelement <4 x float> %0, i32 0
700 %1 = tail call fast float @atanf(float %vecext)
701 %vecins = insertelement <4 x float> poison, float %1, i32 0
702 %vecext.1 = extractelement <4 x float> %0, i32 1
703 %2 = tail call fast float @atanf(float %vecext.1)
704 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
705 %vecext.2 = extractelement <4 x float> %0, i32 2
706 %3 = tail call fast float @atanf(float %vecext.2)
707 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
708 %vecext.3 = extractelement <4 x float> %0, i32 3
709 %4 = tail call fast float @atanf(float %vecext.3)
710 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
711 ret <4 x float> %vecins.3
713 declare float @sinhf(float) readonly
; Accelerate provides sinhf() for <4 x float>: the Accelerate run expects the
; four scalar calls to be replaced by one @vsinhf call, while the run without a
; vector library expects all four scalar @sinhf calls to remain.
714 define <4 x float> @sinh_4x(<4 x float>* %a) {
715 ; CHECK-LABEL: @sinh_4x(
717 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
718 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
719 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
721 ; NOACCELERATE-LABEL: @sinh_4x(
722 ; NOACCELERATE-NEXT: entry:
723 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
724 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
725 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinhf(float [[VECEXT]])
726 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
727 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
728 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]])
729 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
730 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
731 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]])
732 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
733 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
734 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]])
735 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
736 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: one fast-math scalar @sinhf call per lane of the loaded vector,
; with each result inserted back into the same lane.
739 %0 = load <4 x float>, <4 x float>* %a, align 16
740 %vecext = extractelement <4 x float> %0, i32 0
741 %1 = tail call fast float @sinhf(float %vecext)
742 %vecins = insertelement <4 x float> poison, float %1, i32 0
743 %vecext.1 = extractelement <4 x float> %0, i32 1
744 %2 = tail call fast float @sinhf(float %vecext.1)
745 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
746 %vecext.2 = extractelement <4 x float> %0, i32 2
747 %3 = tail call fast float @sinhf(float %vecext.2)
748 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
749 %vecext.3 = extractelement <4 x float> %0, i32 3
750 %4 = tail call fast float @sinhf(float %vecext.3)
751 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
752 ret <4 x float> %vecins.3
754 declare float @coshf(float) readonly
; Accelerate provides coshf() for <4 x float>: the Accelerate run expects the
; four scalar calls to be replaced by one @vcoshf call, while the run without a
; vector library expects all four scalar @coshf calls to remain.
755 define <4 x float> @cosh_4x(<4 x float>* %a) {
756 ; CHECK-LABEL: @cosh_4x(
758 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
759 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
760 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
762 ; NOACCELERATE-LABEL: @cosh_4x(
763 ; NOACCELERATE-NEXT: entry:
764 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
765 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
766 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @coshf(float [[VECEXT]])
767 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
768 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
769 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @coshf(float [[VECEXT_1]])
770 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
771 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
772 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @coshf(float [[VECEXT_2]])
773 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
774 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
775 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @coshf(float [[VECEXT_3]])
776 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
777 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: one fast-math scalar @coshf call per lane of the loaded vector,
; with each result inserted back into the same lane.
780 %0 = load <4 x float>, <4 x float>* %a, align 16
781 %vecext = extractelement <4 x float> %0, i32 0
782 %1 = tail call fast float @coshf(float %vecext)
783 %vecins = insertelement <4 x float> poison, float %1, i32 0
784 %vecext.1 = extractelement <4 x float> %0, i32 1
785 %2 = tail call fast float @coshf(float %vecext.1)
786 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
787 %vecext.2 = extractelement <4 x float> %0, i32 2
788 %3 = tail call fast float @coshf(float %vecext.2)
789 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
790 %vecext.3 = extractelement <4 x float> %0, i32 3
791 %4 = tail call fast float @coshf(float %vecext.3)
792 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
793 ret <4 x float> %vecins.3
795 declare float @tanhf(float) readonly
; Accelerate provides tanhf() for <4 x float>: the Accelerate run expects the
; four scalar calls to be replaced by one @vtanhf call, while the run without a
; vector library expects all four scalar @tanhf calls to remain.
796 define <4 x float> @tanh_4x(<4 x float>* %a) {
797 ; CHECK-LABEL: @tanh_4x(
799 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
800 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
801 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
803 ; NOACCELERATE-LABEL: @tanh_4x(
804 ; NOACCELERATE-NEXT: entry:
805 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
806 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
807 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @tanhf(float [[VECEXT]])
808 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
809 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
810 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]])
811 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
812 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
813 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]])
814 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
815 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
816 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]])
817 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
818 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: one fast-math scalar @tanhf call per lane of the loaded vector,
; with each result inserted back into the same lane.
821 %0 = load <4 x float>, <4 x float>* %a, align 16
822 %vecext = extractelement <4 x float> %0, i32 0
823 %1 = tail call fast float @tanhf(float %vecext)
824 %vecins = insertelement <4 x float> poison, float %1, i32 0
825 %vecext.1 = extractelement <4 x float> %0, i32 1
826 %2 = tail call fast float @tanhf(float %vecext.1)
827 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
828 %vecext.2 = extractelement <4 x float> %0, i32 2
829 %3 = tail call fast float @tanhf(float %vecext.2)
830 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
831 %vecext.3 = extractelement <4 x float> %0, i32 3
832 %4 = tail call fast float @tanhf(float %vecext.3)
833 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
834 ret <4 x float> %vecins.3
836 declare float @asinhf(float) readonly
; Accelerate provides asinhf() for <4 x float>: the Accelerate run expects the
; four scalar calls to be replaced by one @vasinhf call, while the run without
; a vector library expects all four scalar @asinhf calls to remain.
837 define <4 x float> @asinh_4x(<4 x float>* %a) {
838 ; CHECK-LABEL: @asinh_4x(
840 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
841 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinhf(<4 x float> [[TMP0]])
842 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
844 ; NOACCELERATE-LABEL: @asinh_4x(
845 ; NOACCELERATE-NEXT: entry:
846 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
847 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
848 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @asinhf(float [[VECEXT]])
849 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
850 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
851 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @asinhf(float [[VECEXT_1]])
852 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
853 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
854 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @asinhf(float [[VECEXT_2]])
855 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
856 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
857 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @asinhf(float [[VECEXT_3]])
858 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
859 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: one fast-math scalar @asinhf call per lane of the loaded vector,
; with each result inserted back into the same lane.
862 %0 = load <4 x float>, <4 x float>* %a, align 16
863 %vecext = extractelement <4 x float> %0, i32 0
864 %1 = tail call fast float @asinhf(float %vecext)
865 %vecins = insertelement <4 x float> poison, float %1, i32 0
866 %vecext.1 = extractelement <4 x float> %0, i32 1
867 %2 = tail call fast float @asinhf(float %vecext.1)
868 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
869 %vecext.2 = extractelement <4 x float> %0, i32 2
870 %3 = tail call fast float @asinhf(float %vecext.2)
871 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
872 %vecext.3 = extractelement <4 x float> %0, i32 3
873 %4 = tail call fast float @asinhf(float %vecext.3)
874 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
875 ret <4 x float> %vecins.3
877 declare float @acoshf(float) readonly
; Accelerate provides acoshf() for <4 x float>: the Accelerate run expects the
; four scalar calls to be replaced by one @vacoshf call, while the run without
; a vector library expects all four scalar @acoshf calls to remain.
878 define <4 x float> @acosh_4x(<4 x float>* %a) {
879 ; CHECK-LABEL: @acosh_4x(
881 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
882 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacoshf(<4 x float> [[TMP0]])
883 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
885 ; NOACCELERATE-LABEL: @acosh_4x(
886 ; NOACCELERATE-NEXT: entry:
887 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
888 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
889 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @acoshf(float [[VECEXT]])
890 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
891 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
892 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @acoshf(float [[VECEXT_1]])
893 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
894 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
895 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @acoshf(float [[VECEXT_2]])
896 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
897 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
898 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @acoshf(float [[VECEXT_3]])
899 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
900 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: one fast-math scalar @acoshf call per lane of the loaded vector,
; with each result inserted back into the same lane.
903 %0 = load <4 x float>, <4 x float>* %a, align 16
904 %vecext = extractelement <4 x float> %0, i32 0
905 %1 = tail call fast float @acoshf(float %vecext)
906 %vecins = insertelement <4 x float> poison, float %1, i32 0
907 %vecext.1 = extractelement <4 x float> %0, i32 1
908 %2 = tail call fast float @acoshf(float %vecext.1)
909 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
910 %vecext.2 = extractelement <4 x float> %0, i32 2
911 %3 = tail call fast float @acoshf(float %vecext.2)
912 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
913 %vecext.3 = extractelement <4 x float> %0, i32 3
914 %4 = tail call fast float @acoshf(float %vecext.3)
915 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
916 ret <4 x float> %vecins.3
918 declare float @atanhf(float) readonly
; Accelerate provides atanhf() for <4 x float>: the Accelerate run expects the
; four scalar calls to be replaced by one @vatanhf call, while the run without
; a vector library expects all four scalar @atanhf calls to remain.
919 define <4 x float> @atanh_4x(<4 x float>* %a) {
920 ; CHECK-LABEL: @atanh_4x(
922 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
923 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanhf(<4 x float> [[TMP0]])
924 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
926 ; NOACCELERATE-LABEL: @atanh_4x(
927 ; NOACCELERATE-NEXT: entry:
928 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
929 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
930 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atanhf(float [[VECEXT]])
931 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
932 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
933 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atanhf(float [[VECEXT_1]])
934 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
935 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
936 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atanhf(float [[VECEXT_2]])
937 ; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
938 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
939 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atanhf(float [[VECEXT_3]])
940 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
941 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: one fast-math scalar @atanhf call per lane of the loaded vector,
; with each result inserted back into the same lane.
944 %0 = load <4 x float>, <4 x float>* %a, align 16
945 %vecext = extractelement <4 x float> %0, i32 0
946 %1 = tail call fast float @atanhf(float %vecext)
947 %vecins = insertelement <4 x float> poison, float %1, i32 0
948 %vecext.1 = extractelement <4 x float> %0, i32 1
949 %2 = tail call fast float @atanhf(float %vecext.1)
950 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
951 %vecext.2 = extractelement <4 x float> %0, i32 2
952 %3 = tail call fast float @atanhf(float %vecext.2)
953 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
954 %vecext.3 = extractelement <4 x float> %0, i32 3
955 %4 = tail call fast float @atanhf(float %vecext.3)
956 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
957 ret <4 x float> %vecins.3
960 ; Accelerate *does not* provide sin() for <2 x float>.
; Both runs therefore expect the two scalar @llvm.sin.f32 calls to stay
; scalar; the Accelerate run additionally matches a call-site attribute group
; on each of them.
961 define <2 x float> @sin_2x(<2 x float>* %a) {
962 ; CHECK-LABEL: @sin_2x(
964 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
965 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
966 ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]]
967 ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
968 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
969 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]]
970 ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
971 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
973 ; NOACCELERATE-LABEL: @sin_2x(
974 ; NOACCELERATE-NEXT: entry:
975 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
976 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
977 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
978 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
979 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
980 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
981 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
982 ; NOACCELERATE-NEXT: ret <2 x float> [[VECINS_1]]
; Input IR: one fast-math scalar @llvm.sin.f32 call per lane of the loaded
; 2-element vector, with each result inserted back into the same lane.
985 %0 = load <2 x float>, <2 x float>* %a, align 16
986 %vecext = extractelement <2 x float> %0, i32 0
987 %1 = tail call fast float @llvm.sin.f32(float %vecext)
988 %vecins = insertelement <2 x float> poison, float %1, i32 0
989 %vecext.1 = extractelement <2 x float> %0, i32 1
990 %2 = tail call fast float @llvm.sin.f32(float %vecext.1)
991 %vecins.1 = insertelement <2 x float> %vecins, float %2, i32 1
992 ret <2 x float> %vecins.1
996 declare float @llvm.cos.f32(float)
998 ; Accelerate provides cos() for <4 x float>
; The Accelerate run expects the four scalar intrinsic calls to be replaced by
; one @vcosf call. The run without a vector library expects only lanes 1 and 2
; to be combined into a single @llvm.cos.v2f32 call, with lanes 0 and 3 left
; as scalar @llvm.cos.f32 calls.
999 define <4 x float> @int_cos_4x(<4 x float>* %a) {
1000 ; CHECK-LABEL: @int_cos_4x(
1001 ; CHECK-NEXT: entry:
1002 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
1003 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
1004 ; CHECK-NEXT: ret <4 x float> [[TMP1]]
1006 ; NOACCELERATE-LABEL: @int_cos_4x(
1007 ; NOACCELERATE-NEXT: entry:
1008 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
1009 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1010 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
1011 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
1012 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1013 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
1014 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
1015 ; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
1016 ; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
1017 ; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1018 ; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
1019 ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
1020 ; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
1021 ; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
1022 ; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
; Input IR: one fast-math scalar @llvm.cos.f32 call per lane of the loaded
; vector, with each result inserted back into the same lane.
1025 %0 = load <4 x float>, <4 x float>* %a, align 16
1026 %vecext = extractelement <4 x float> %0, i32 0
1027 %1 = tail call fast float @llvm.cos.f32(float %vecext)
1028 %vecins = insertelement <4 x float> poison, float %1, i32 0
1029 %vecext.1 = extractelement <4 x float> %0, i32 1
1030 %2 = tail call fast float @llvm.cos.f32(float %vecext.1)
1031 %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1032 %vecext.2 = extractelement <4 x float> %0, i32 2
1033 %3 = tail call fast float @llvm.cos.f32(float %vecext.2)
1034 %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1035 %vecext.3 = extractelement <4 x float> %0, i32 3
1036 %4 = tail call fast float @llvm.cos.f32(float %vecext.3)
1037 %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1038 ret <4 x float> %vecins.3
1041 ; Accelerate *does not* provide cos() for <2 x float>.
; Both runs therefore expect the two scalar @llvm.cos.f32 calls to stay
; scalar; the Accelerate run additionally matches a call-site attribute group
; on each of them.
1042 define <2 x float> @cos_2x(<2 x float>* %a) {
1043 ; CHECK-LABEL: @cos_2x(
1044 ; CHECK-NEXT: entry:
1045 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
1046 ; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1047 ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
1048 ; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
1049 ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1050 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]]
1051 ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1052 ; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
1054 ; NOACCELERATE-LABEL: @cos_2x(
1055 ; NOACCELERATE-NEXT: entry:
1056 ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
1057 ; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1058 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
1059 ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
1060 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1061 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
1062 ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1063 ; NOACCELERATE-NEXT: ret <2 x float> [[VECINS_1]]
; Input IR: one fast-math scalar @llvm.cos.f32 call per lane of the loaded
; 2-element vector, with each result inserted back into the same lane.
1066 %0 = load <2 x float>, <2 x float>* %a, align 16
1067 %vecext = extractelement <2 x float> %0, i32 0
1068 %1 = tail call fast float @llvm.cos.f32(float %vecext)
1069 %vecins = insertelement <2 x float> poison, float %1, i32 0
1070 %vecext.1 = extractelement <2 x float> %0, i32 1
1071 %2 = tail call fast float @llvm.cos.f32(float %vecext.1)
1072 %vecins.1 = insertelement <2 x float> %vecins, float %2, i32 1
1073 ret <2 x float> %vecins.1