// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s

typedef _Float16 half;

typedef half half2 __attribute__((ext_vector_type(2)));
typedef float float2 __attribute__((ext_vector_type(2)));
typedef float float4 __attribute__((ext_vector_type(4)));
typedef short int si8 __attribute__((ext_vector_type(8)));
typedef unsigned int u4 __attribute__((ext_vector_type(4)));
typedef double double2 __attribute__((ext_vector_type(2)));
typedef double double3 __attribute__((ext_vector_type(3)));

__attribute__((address_space(1))) int int_as_one;

int b;

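// The tests below check that Clang lowers each __builtin_elementwise_* call to
// the corresponding LLVM intrinsic, for scalar, ext_vector_type, and _BitInt
// operands. Integer abs is emitted with its int-min-is-poison flag set to
// false, and sub-int operands (e.g. short) are promoted to int and truncated
// back.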
void test_builtin_elementwise_abs(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, si8 vi1, si8 vi2,
                                  long long int i1, long long int i2, short si,
                                  _BitInt(31) bi1, _BitInt(31) bi2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_abs(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.fabs.f32(float [[F1]])
  f2 = __builtin_elementwise_abs(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.fabs.f64(double [[D1]])
  d2 = __builtin_elementwise_abs(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_abs(vf1);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.abs.i64(i64 [[I1]], i1 false)
  i2 = __builtin_elementwise_abs(i1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[VI1]], i1 false)
  vi2 = __builtin_elementwise_abs(vi1);

  // CHECK: [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[CVI2]], i1 false)
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_abs(cvi2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: call i31 @llvm.abs.i31(i31 [[BI1]], i1 false)
  bi2 = __builtin_elementwise_abs(bi1);

  // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.abs.i32(i32 [[IA1]], i1 false)
  b = __builtin_elementwise_abs(int_as_one);

  // CHECK: call i32 @llvm.abs.i32(i32 -10, i1 false)
  b = __builtin_elementwise_abs(-10);

  // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32
  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.abs.i32(i32 [[SI_EXT]], i1 false)
  // CHECK-NEXT: = trunc i32 [[RES]] to i16
  si = __builtin_elementwise_abs(si);
}

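// Saturating addition: signed operands (including signed _BitInt) select
// llvm.sadd.sat, unsigned operands select llvm.uadd.sat, with vector variants
// for the ext_vector_type cases.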
void test_builtin_elementwise_add_sat(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.sadd.sat.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_add_sat(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.sadd.sat.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_add_sat(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_add_sat(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.uadd.sat.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_add_sat(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_add_sat(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4
  // CHECK-NEXT: call i31 @llvm.sadd.sat.i31(i31 [[BI1]], i31 [[BI2]])
  bi1 = __builtin_elementwise_add_sat(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8
  // CHECK-NEXT: call i55 @llvm.uadd.sat.i55(i55 [[BU1]], i55 [[BU2]])
  bu1 = __builtin_elementwise_add_sat(bu1, bu2);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.sadd.sat.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_add_sat(int_as_one, b);

  // CHECK: call i32 @llvm.sadd.sat.i32(i32 1, i32 97)
  i1 = __builtin_elementwise_add_sat(1, 'a');
}

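// Saturating subtraction mirrors the addition case, using llvm.ssub.sat and
// llvm.usub.sat.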
void test_builtin_elementwise_sub_sat(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ssub.sat.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_sub_sat(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ssub.sat.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_sub_sat(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_sub_sat(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.usub.sat.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_sub_sat(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_sub_sat(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4
  // CHECK-NEXT: call i31 @llvm.ssub.sat.i31(i31 [[BI1]], i31 [[BI2]])
  bi1 = __builtin_elementwise_sub_sat(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8
  // CHECK-NEXT: call i55 @llvm.usub.sat.i55(i55 [[BU1]], i55 [[BU2]])
  bu1 = __builtin_elementwise_sub_sat(bu1, bu2);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.ssub.sat.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_sub_sat(int_as_one, b);

  // CHECK: call i32 @llvm.ssub.sat.i32(i32 1, i32 97)
  i1 = __builtin_elementwise_sub_sat(1, 'a');
}

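// Elementwise max: floating-point operands use llvm.maxnum, signed integers
// llvm.smax, and unsigned integers llvm.umax.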
void test_builtin_elementwise_max(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, long long int i1,
                                  long long int i2, si8 vi1, si8 vi2,
                                  unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                  _BitInt(31) bi1, _BitInt(31) bi2,
                                  unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_max(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.maxnum.f32(float %0, float %1)
  f1 = __builtin_elementwise_max(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maxnum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_max(d1, d2);

  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maxnum.f64(double 2.000000e+01, double [[D2]])
  d1 = __builtin_elementwise_max(20.0, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_max(vf1, vf2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smax.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_max(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smax.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_max(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_max(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.umax.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_max(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_max(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4
  // CHECK-NEXT: call i31 @llvm.smax.i31(i31 [[BI1]], i31 [[BI2]])
  bi1 = __builtin_elementwise_max(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8
  // CHECK-NEXT: call i55 @llvm.umax.i55(i55 [[BU1]], i55 [[BU2]])
  bu1 = __builtin_elementwise_max(bu1, bu2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_max(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_max(vf2, cvf1);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.smax.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_max(int_as_one, b);

  // CHECK: call i32 @llvm.smax.i32(i32 1, i32 97)
  i1 = __builtin_elementwise_max(1, 'a');
}

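// Elementwise min: llvm.minnum for floating point, llvm.smin / llvm.umin for
// integers. The mixed u1/i2 case below shows the unsigned int operand being
// zero-extended to the common type (long long) before llvm.smin is chosen.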
void test_builtin_elementwise_min(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, long long int i1,
                                  long long int i2, si8 vi1, si8 vi2,
                                  unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                  _BitInt(31) bi1, _BitInt(31) bi2,
                                  unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_min(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.minnum.f32(float %0, float %1)
  f1 = __builtin_elementwise_min(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.minnum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_min(d1, d2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.minnum.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_min(d1, 2.0);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_min(vf1, vf2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_min(i1, i2);

  // CHECK: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 -11, i64 [[I2]])
  i1 = __builtin_elementwise_min(-11, i2);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_min(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_min(u1, u2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[ZEXT_U1:%.+]] = zext i32 [[U1]] to i64
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 [[ZEXT_U1]], i64 [[I2]])
  u1 = __builtin_elementwise_min(u1, i2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_min(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4
  // CHECK-NEXT: call i31 @llvm.smin.i31(i31 [[BI1]], i31 [[BI2]])
  bi1 = __builtin_elementwise_min(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8
  // CHECK-NEXT: call i55 @llvm.umin.i55(i55 [[BU1]], i55 [[BU2]])
  bu1 = __builtin_elementwise_min(bu1, bu2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_min(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_min(vf2, cvf1);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.smin.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_min(int_as_one, b);
}

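// llvm.bitreverse only takes integer operands; short values are promoted to
// int, reversed as i32, and truncated back to i16.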
void test_builtin_elementwise_bitreverse(si8 vi1, si8 vi2,
                                         long long int i1, long long int i2, short si,
                                         _BitInt(31) bi1, _BitInt(31) bi2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.bitreverse.i64(i64 [[I1]])
  i2 = __builtin_elementwise_bitreverse(i1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[VI1]])
  vi2 = __builtin_elementwise_bitreverse(vi1);

  // CHECK: [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[CVI2]])
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_bitreverse(cvi2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: call i31 @llvm.bitreverse.i31(i31 [[BI1]])
  bi2 = __builtin_elementwise_bitreverse(bi1);

  // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.bitreverse.i32(i32 [[IA1]])
  b = __builtin_elementwise_bitreverse(int_as_one);

  // CHECK: call i32 @llvm.bitreverse.i32(i32 -10)
  b = __builtin_elementwise_bitreverse(-10);

  // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32
  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.bitreverse.i32(i32 [[SI_EXT]])
  // CHECK-NEXT: = trunc i32 [[RES]] to i16
  si = __builtin_elementwise_bitreverse(si);
}

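// The unary floating-point builtins below (ceil, cos, exp, exp2, floor, log,
// log10, log2, roundeven, round, rint, nearbyint, sin, sqrt, trunc,
// canonicalize) all follow the same pattern: float and double map to the f32
// and f64 intrinsics, float4 to the v4f32 intrinsic. pow is the one binary
// builtin in this group.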
void test_builtin_elementwise_ceil(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_ceil(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.ceil.f32(float [[F1]])
  f2 = __builtin_elementwise_ceil(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.ceil.f64(double [[D1]])
  d2 = __builtin_elementwise_ceil(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.ceil.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_ceil(vf1);
}

void test_builtin_elementwise_cos(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_cos(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.cos.f32(float [[F1]])
  f2 = __builtin_elementwise_cos(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.cos.f64(double [[D1]])
  d2 = __builtin_elementwise_cos(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.cos.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_cos(vf1);
}

void test_builtin_elementwise_exp(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_exp(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.exp.f32(float [[F1]])
  f2 = __builtin_elementwise_exp(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.exp.f64(double [[D1]])
  d2 = __builtin_elementwise_exp(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.exp.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_exp(vf1);
}

void test_builtin_elementwise_exp2(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_exp2(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.exp2.f32(float [[F1]])
  f2 = __builtin_elementwise_exp2(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.exp2.f64(double [[D1]])
  d2 = __builtin_elementwise_exp2(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.exp2.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_exp2(vf1);
}

void test_builtin_elementwise_floor(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_floor(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.floor.f32(float [[F1]])
  f2 = __builtin_elementwise_floor(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.floor.f64(double [[D1]])
  d2 = __builtin_elementwise_floor(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.floor.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_floor(vf1);
}

void test_builtin_elementwise_log(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log.f32(float [[F1]])
  f2 = __builtin_elementwise_log(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log.f64(double [[D1]])
  d2 = __builtin_elementwise_log(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log(vf1);
}

void test_builtin_elementwise_log10(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log10(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log10.f32(float [[F1]])
  f2 = __builtin_elementwise_log10(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log10.f64(double [[D1]])
  d2 = __builtin_elementwise_log10(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log10.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log10(vf1);
}

void test_builtin_elementwise_log2(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log2(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log2.f32(float [[F1]])
  f2 = __builtin_elementwise_log2(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log2.f64(double [[D1]])
  d2 = __builtin_elementwise_log2(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log2.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log2(vf1);
}

void test_builtin_elementwise_pow(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {

  // CHECK-LABEL: define void @test_builtin_elementwise_pow(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.pow.f32(float [[F1]], float [[F2]])
  f2 = __builtin_elementwise_pow(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.pow.f64(double [[D1]], double [[D2]])
  d2 = __builtin_elementwise_pow(d1, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.pow.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf2 = __builtin_elementwise_pow(vf1, vf2);
}

void test_builtin_elementwise_roundeven(float f1, float f2, double d1, double d2,
                                        float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_roundeven(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.roundeven.f32(float [[F1]])
  f2 = __builtin_elementwise_roundeven(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.roundeven.f64(double [[D1]])
  d2 = __builtin_elementwise_roundeven(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.roundeven.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_roundeven(vf1);
}

void test_builtin_elementwise_round(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_round(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.round.f32(float [[F1]])
  f2 = __builtin_elementwise_round(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.round.f64(double [[D1]])
  d2 = __builtin_elementwise_round(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.round.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_round(vf1);
}

void test_builtin_elementwise_rint(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_rint(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.rint.f32(float [[F1]])
  f2 = __builtin_elementwise_rint(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.rint.f64(double [[D1]])
  d2 = __builtin_elementwise_rint(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.rint.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_rint(vf1);
}

void test_builtin_elementwise_nearbyint(float f1, float f2, double d1, double d2,
                                        float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_nearbyint(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.nearbyint.f32(float [[F1]])
  f2 = __builtin_elementwise_nearbyint(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.nearbyint.f64(double [[D1]])
  d2 = __builtin_elementwise_nearbyint(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_nearbyint(vf1);
}

void test_builtin_elementwise_sin(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sin(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sin.f32(float [[F1]])
  f2 = __builtin_elementwise_sin(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sin.f64(double [[D1]])
  d2 = __builtin_elementwise_sin(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sin.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sin(vf1);
}

void test_builtin_elementwise_sqrt(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sqrt(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sqrt.f32(float [[F1]])
  f2 = __builtin_elementwise_sqrt(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sqrt.f64(double [[D1]])
  d2 = __builtin_elementwise_sqrt(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sqrt(vf1);
}

void test_builtin_elementwise_trunc(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_trunc(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.trunc.f32(float [[F1]])
  f2 = __builtin_elementwise_trunc(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.trunc.f64(double [[D1]])
  d2 = __builtin_elementwise_trunc(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.trunc.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_trunc(vf1);
}

void test_builtin_elementwise_canonicalize(float f1, float f2, double d1, double d2,
                                           float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_canonicalize(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.canonicalize.f32(float [[F1]])
  f2 = __builtin_elementwise_canonicalize(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.canonicalize.f64(double [[D1]])
  d2 = __builtin_elementwise_canonicalize(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.canonicalize.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_canonicalize(vf1);
}

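// copysign also accepts a scalar paired with a vector: the scalar is splatted
// to the vector type, as in the (double2)1.0 case, and scalar constants are
// folded directly into the intrinsic call.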
void test_builtin_elementwise_copysign(float f1, float f2, double d1, double d2,
                                       float4 vf1, float4 vf2, double2 v2f64) {
  // CHECK-LABEL: define void @test_builtin_elementwise_copysign(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.copysign.f32(float %0, float %1)
  f1 = __builtin_elementwise_copysign(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.copysign.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_copysign(d1, d2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.copysign.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_copysign(d1, 2.0);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_copysign(vf1, vf2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_copysign(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_copysign(vf2, cvf1);

  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr
  // CHECK-NEXT: call float @llvm.copysign.f32(float [[F1]], float 2.000000e+00)
  f1 = __builtin_elementwise_copysign(f1, 2.0f);

  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr
  // CHECK-NEXT: call float @llvm.copysign.f32(float 2.000000e+00, float [[F1]])
  f1 = __builtin_elementwise_copysign(2.0f, f1);

  // CHECK: [[V2F64:%.+]] = load <2 x double>, ptr %v2f64.addr, align 16
  // CHECK-NEXT: call <2 x double> @llvm.copysign.v2f64(<2 x double> <double 1.000000e+00, double 1.000000e+00>, <2 x double> [[V2F64]])
  v2f64 = __builtin_elementwise_copysign((double2)1.0, v2f64);
}

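// fma takes three operands. Note the three separate loads per call (see the
// FIXME about the 3-element vector load workaround) and the splat of a scalar
// half into <2 x half> for the (half2)f16 case.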
void test_builtin_elementwise_fma(float f32, double f64,
                                  float2 v2f32, float4 v4f32,
                                  double2 v2f64, double3 v3f64,
                                  const float4 c_v4f32,
                                  half f16, half2 v2f16) {
  // CHECK-LABEL: define void @test_builtin_elementwise_fma(
  // CHECK: [[F32_0:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: [[F32_1:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: [[F32_2:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: call float @llvm.fma.f32(float [[F32_0]], float [[F32_1]], float [[F32_2]])
  float f2 = __builtin_elementwise_fma(f32, f32, f32);

  // CHECK: [[F64_0:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_1:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_2:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: call double @llvm.fma.f64(double [[F64_0]], double [[F64_1]], double [[F64_2]])
  double d2 = __builtin_elementwise_fma(f64, f64, f64);

  // CHECK: [[V4F32_0:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: [[V4F32_1:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: [[V4F32_2:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4F32_0]], <4 x float> [[V4F32_1]], <4 x float> [[V4F32_2]])
  float4 tmp_v4f32 = __builtin_elementwise_fma(v4f32, v4f32, v4f32);

  // FIXME: Are we really still doing the 3 vector load workaround
  // CHECK: [[V3F64_LOAD_0:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_0:%.+]] = shufflevector
  // CHECK-NEXT: [[V3F64_LOAD_1:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_1:%.+]] = shufflevector
  // CHECK-NEXT: [[V3F64_LOAD_2:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_2:%.+]] = shufflevector
  // CHECK-NEXT: call <3 x double> @llvm.fma.v3f64(<3 x double> [[V3F64_0]], <3 x double> [[V3F64_1]], <3 x double> [[V3F64_2]])
  v3f64 = __builtin_elementwise_fma(v3f64, v3f64, v3f64);

  // CHECK: [[F64_0:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_1:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_2:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: call double @llvm.fma.f64(double [[F64_0]], double [[F64_1]], double [[F64_2]])
  v2f64 = __builtin_elementwise_fma(f64, f64, f64);

  // CHECK: [[V4F32_0:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: [[V4F32_1:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: [[V4F32_2:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4F32_0]], <4 x float> [[V4F32_1]], <4 x float> [[V4F32_2]])
  v4f32 = __builtin_elementwise_fma(c_v4f32, c_v4f32, c_v4f32);

  // CHECK: [[F16_0:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[F16_1:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[F16_2:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: call half @llvm.fma.f16(half [[F16_0]], half [[F16_1]], half [[F16_2]])
  half tmp_f16 = __builtin_elementwise_fma(f16, f16, f16);

  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_2:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> [[V2F16_2]])
  half2 tmp0_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, v2f16);

  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[F16_2:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[V2F16_2_INSERT:%.+]] = insertelement
  // CHECK-NEXT: [[V2F16_2:%.+]] = shufflevector <2 x half> [[V2F16_2_INSERT]], <2 x half> poison, <2 x i32> zeroinitializer
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> [[V2F16_2]])
  half2 tmp1_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, (half2)f16);

  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> <half 0xH4400, half 0xH4400>)
  half2 tmp2_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, (half2)4.0);
}