// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
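
// This file tests CodeGen for the __builtin_elementwise_* builtins: each
// call should lower to the matching LLVM intrinsic (or plain IR, such as
// frem for fmod) for scalar, vector, _BitInt, and address-space-qualified
// operands.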
typedef _Float16 half;

typedef half half2 __attribute__((ext_vector_type(2)));
typedef float float2 __attribute__((ext_vector_type(2)));
typedef float float4 __attribute__((ext_vector_type(4)));
typedef short int si8 __attribute__((ext_vector_type(8)));
typedef unsigned int u4 __attribute__((ext_vector_type(4)));
typedef double double2 __attribute__((ext_vector_type(2)));
typedef double double3 __attribute__((ext_vector_type(3)));

__attribute__((address_space(1))) int int_as_one;
typedef int bar;
bar b;

void test_builtin_elementwise_abs(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, si8 vi1, si8 vi2,
                                  long long int i1, long long int i2, short si,
                                  _BitInt(31) bi1, _BitInt(31) bi2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_abs(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.fabs.f32(float [[F1]])
  f2 = __builtin_elementwise_abs(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.fabs.f64(double [[D1]])
  d2 = __builtin_elementwise_abs(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_abs(vf1);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.abs.i64(i64 [[I1]], i1 false)
  i2 = __builtin_elementwise_abs(i1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[VI1]], i1 false)
  vi2 = __builtin_elementwise_abs(vi1);

  // CHECK: [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[CVI2]], i1 false)
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_abs(cvi2);
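
  // Note: _BitInt(31) occupies a 32-bit storage unit, so the operand is
  // loaded as i32 and truncated to i31 before the intrinsic call.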
  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: call i31 @llvm.abs.i31(i31 [[LOADEDV]], i1 false)
  bi2 = __builtin_elementwise_abs(bi1);

  // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.abs.i32(i32 [[IA1]], i1 false)
  b = __builtin_elementwise_abs(int_as_one);

  // CHECK: call i32 @llvm.abs.i32(i32 -10, i1 false)
  b = __builtin_elementwise_abs(-10);
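
  // The short operand is promoted to int, the operation is performed on i32,
  // and the result is truncated back to i16.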
  // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32
  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.abs.i32(i32 [[SI_EXT]], i1 false)
  // CHECK-NEXT: = trunc i32 [[RES]] to i16
  si = __builtin_elementwise_abs(si);
}
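
// The saturating add/sub builtins select the signed (sadd.sat/ssub.sat) or
// unsigned (uadd.sat/usub.sat) intrinsic based on the operand type.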
void test_builtin_elementwise_add_sat(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.sadd.sat.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_add_sat(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.sadd.sat.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_add_sat(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_add_sat(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.uadd.sat.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_add_sat(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_add_sat(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.sadd.sat.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_add_sat(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.uadd.sat.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_add_sat(bu1, bu2);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.sadd.sat.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_add_sat(int_as_one, b);
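
  // With two constant operands the result is folded at compile time:
  // 1 + 'a' (97) == 98.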
  // CHECK: store i64 98, ptr %i1.addr, align 8
  i1 = __builtin_elementwise_add_sat(1, 'a');
}

void test_builtin_elementwise_sub_sat(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ssub.sat.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_sub_sat(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ssub.sat.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_sub_sat(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_sub_sat(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.usub.sat.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_sub_sat(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_sub_sat(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.ssub.sat.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_sub_sat(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.usub.sat.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_sub_sat(bu1, bu2);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.ssub.sat.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_sub_sat(int_as_one, b);
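
  // Constant-folded: 1 - 'a' (97) == -96.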
  // CHECK: store i64 -96, ptr %i1.addr, align 8
  i1 = __builtin_elementwise_sub_sat(1, 'a');
}
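
// __builtin_elementwise_maximum/minimum lower to the NaN-propagating
// llvm.maximum/llvm.minimum intrinsics, while __builtin_elementwise_max/min
// below lower to llvm.maxnum/minnum (and smax/umax etc. for integers).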
void test_builtin_elementwise_maximum(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_maximum(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.maximum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_maximum(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maximum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_maximum(d1, d2);

  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maximum.f64(double 2.000000e+01, double [[D2]])
  d1 = __builtin_elementwise_maximum(20.0, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maximum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_maximum(vf1, vf2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maximum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_maximum(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maximum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_maximum(vf2, cvf1);
}

void test_builtin_elementwise_minimum(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_minimum(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.minimum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_minimum(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.minimum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_minimum(d1, d2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.minimum.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_minimum(d1, 2.0);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minimum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_minimum(vf1, vf2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minimum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_minimum(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minimum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_minimum(vf2, cvf1);
}

void test_builtin_elementwise_max(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, long long int i1,
                                  long long int i2, si8 vi1, si8 vi2,
                                  unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                  _BitInt(31) bi1, _BitInt(31) bi2,
                                  unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_max(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.maxnum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_max(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maxnum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_max(d1, d2);

  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maxnum.f64(double 2.000000e+01, double [[D2]])
  d1 = __builtin_elementwise_max(20.0, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_max(vf1, vf2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smax.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_max(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smax.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_max(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_max(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.umax.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_max(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_max(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.smax.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_max(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.umax.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_max(bu1, bu2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_max(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_max(vf2, cvf1);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.smax.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_max(int_as_one, b);

  // CHECK: call i32 @llvm.smax.i32(i32 1, i32 97)
  i1 = __builtin_elementwise_max(1, 'a');
}

void test_builtin_elementwise_min(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, long long int i1,
                                  long long int i2, si8 vi1, si8 vi2,
                                  unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                  _BitInt(31) bi1, _BitInt(31) bi2,
                                  unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_min(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.minnum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_min(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.minnum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_min(d1, d2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.minnum.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_min(d1, 2.0);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_min(vf1, vf2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_min(i1, i2);

  // CHECK: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 -11, i64 [[I2]])
  i1 = __builtin_elementwise_min(-11, i2);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_min(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_min(u1, u2);
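
  // Mixed signedness: unsigned int converts to long long (which can hold all
  // unsigned int values), so u1 is zero-extended and the signed smin is used.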
  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[ZEXT_U1:%.+]] = zext i32 [[U1]] to i64
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 [[ZEXT_U1]], i64 [[I2]])
  u1 = __builtin_elementwise_min(u1, i2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_min(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.smin.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_min(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.umin.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_min(bu1, bu2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_min(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_min(vf2, cvf1);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.smin.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_min(int_as_one, b);
}

void test_builtin_elementwise_bitreverse(si8 vi1, si8 vi2,
                                         long long int i1, long long int i2, short si,
                                         _BitInt(31) bi1, _BitInt(31) bi2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.bitreverse.i64(i64 [[I1]])
  i2 = __builtin_elementwise_bitreverse(i1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[VI1]])
  vi2 = __builtin_elementwise_bitreverse(vi1);

  // CHECK: [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[CVI2]])
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_bitreverse(cvi2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: call i31 @llvm.bitreverse.i31(i31 [[LOADEDV]])
  bi2 = __builtin_elementwise_bitreverse(bi1);

  // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.bitreverse.i32(i32 [[IA1]])
  b = __builtin_elementwise_bitreverse(int_as_one);
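
  // Constant-folded: bitreversing -10 (0xFFFFFFF6) yields 0x6FFFFFFF,
  // i.e. 1879048191.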
  // CHECK: store i32 1879048191, ptr @b, align 4
  b = __builtin_elementwise_bitreverse(-10);

  // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32
  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.bitreverse.i32(i32 [[SI_EXT]])
  // CHECK-NEXT: = trunc i32 [[RES]] to i16
  si = __builtin_elementwise_bitreverse(si);
}

void test_builtin_elementwise_ceil(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_ceil(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.ceil.f32(float [[F1]])
  f2 = __builtin_elementwise_ceil(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.ceil.f64(double [[D1]])
  d2 = __builtin_elementwise_ceil(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.ceil.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_ceil(vf1);
}

void test_builtin_elementwise_acos(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_acos(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.acos.f32(float [[F1]])
  f2 = __builtin_elementwise_acos(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.acos.f64(double [[D1]])
  d2 = __builtin_elementwise_acos(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.acos.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_acos(vf1);
}

void test_builtin_elementwise_asin(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_asin(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.asin.f32(float [[F1]])
  f2 = __builtin_elementwise_asin(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.asin.f64(double [[D1]])
  d2 = __builtin_elementwise_asin(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.asin.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_asin(vf1);
}

void test_builtin_elementwise_atan(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_atan(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.atan.f32(float [[F1]])
  f2 = __builtin_elementwise_atan(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.atan.f64(double [[D1]])
  d2 = __builtin_elementwise_atan(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.atan.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_atan(vf1);
}

void test_builtin_elementwise_atan2(float f1, float f2, float f3, double d1,
                                    double d2, double d3, float4 vf1,
                                    float4 vf2, float4 vf3) {
  // CHECK-LABEL: define void @test_builtin_elementwise_atan2(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.atan2.f32(float [[F1]], float [[F2]])
  f3 = __builtin_elementwise_atan2(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.atan2.f64(double [[D1]], double [[D2]])
  d3 = __builtin_elementwise_atan2(d1, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.atan2.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf3 = __builtin_elementwise_atan2(vf1, vf2);
}

void test_builtin_elementwise_cos(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_cos(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.cos.f32(float [[F1]])
  f2 = __builtin_elementwise_cos(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.cos.f64(double [[D1]])
  d2 = __builtin_elementwise_cos(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.cos.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_cos(vf1);
}

void test_builtin_elementwise_cosh(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_cosh(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.cosh.f32(float [[F1]])
  f2 = __builtin_elementwise_cosh(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.cosh.f64(double [[D1]])
  d2 = __builtin_elementwise_cosh(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.cosh.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_cosh(vf1);
}

void test_builtin_elementwise_exp(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_exp(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.exp.f32(float [[F1]])
  f2 = __builtin_elementwise_exp(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.exp.f64(double [[D1]])
  d2 = __builtin_elementwise_exp(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.exp.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_exp(vf1);
}

void test_builtin_elementwise_exp2(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_exp2(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.exp2.f32(float [[F1]])
  f2 = __builtin_elementwise_exp2(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.exp2.f64(double [[D1]])
  d2 = __builtin_elementwise_exp2(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.exp2.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_exp2(vf1);
}

void test_builtin_elementwise_floor(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_floor(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.floor.f32(float [[F1]])
  f2 = __builtin_elementwise_floor(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.floor.f64(double [[D1]])
  d2 = __builtin_elementwise_floor(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.floor.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_floor(vf1);
}

void test_builtin_elementwise_log(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log.f32(float [[F1]])
  f2 = __builtin_elementwise_log(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log.f64(double [[D1]])
  d2 = __builtin_elementwise_log(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log(vf1);
}

void test_builtin_elementwise_log10(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log10(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log10.f32(float [[F1]])
  f2 = __builtin_elementwise_log10(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log10.f64(double [[D1]])
  d2 = __builtin_elementwise_log10(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log10.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log10(vf1);
}

void test_builtin_elementwise_log2(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log2(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log2.f32(float [[F1]])
  f2 = __builtin_elementwise_log2(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log2.f64(double [[D1]])
  d2 = __builtin_elementwise_log2(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log2.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log2(vf1);
}

void test_builtin_elementwise_popcount(si8 vi1, si8 vi2, long long int i1,
                                       long long int i2, short si,
                                       _BitInt(31) bi1, _BitInt(31) bi2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ctpop.i64(i64 [[I1]])
  i2 = __builtin_elementwise_popcount(i1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[VI1]])
  vi2 = __builtin_elementwise_popcount(vi1);

  // CHECK: [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[CVI2]])
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_popcount(cvi2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: call i31 @llvm.ctpop.i31(i31 [[LOADEDV]])
  bi2 = __builtin_elementwise_popcount(bi1);

  // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.ctpop.i32(i32 [[IA1]])
  b = __builtin_elementwise_popcount(int_as_one);
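
  // Constant-folded: -10 is 0xFFFFFFF6, which has 30 set bits.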
  // CHECK: store i32 30, ptr @b, align 4
  b = __builtin_elementwise_popcount(-10);

  // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32
  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.ctpop.i32(i32 [[SI_EXT]])
  // CHECK-NEXT: = trunc i32 [[RES]] to i16
  si = __builtin_elementwise_popcount(si);
}

void test_builtin_elementwise_fmod(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_fmod(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: frem float [[F1]], [[F2]]
  f2 = __builtin_elementwise_fmod(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: frem double [[D1]], [[D2]]
  d2 = __builtin_elementwise_fmod(d1, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: frem <4 x float> [[VF1]], [[VF2]]
  vf2 = __builtin_elementwise_fmod(vf1, vf2);
}

void test_builtin_elementwise_pow(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_pow(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.pow.f32(float [[F1]], float [[F2]])
  f2 = __builtin_elementwise_pow(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.pow.f64(double [[D1]], double [[D2]])
  d2 = __builtin_elementwise_pow(d1, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.pow.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf2 = __builtin_elementwise_pow(vf1, vf2);
}

void test_builtin_elementwise_roundeven(float f1, float f2, double d1, double d2,
                                        float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_roundeven(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.roundeven.f32(float [[F1]])
  f2 = __builtin_elementwise_roundeven(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.roundeven.f64(double [[D1]])
  d2 = __builtin_elementwise_roundeven(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.roundeven.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_roundeven(vf1);
}

void test_builtin_elementwise_round(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_round(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.round.f32(float [[F1]])
  f2 = __builtin_elementwise_round(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.round.f64(double [[D1]])
  d2 = __builtin_elementwise_round(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.round.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_round(vf1);
}

void test_builtin_elementwise_rint(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_rint(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.rint.f32(float [[F1]])
  f2 = __builtin_elementwise_rint(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.rint.f64(double [[D1]])
  d2 = __builtin_elementwise_rint(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.rint.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_rint(vf1);
}

void test_builtin_elementwise_nearbyint(float f1, float f2, double d1, double d2,
                                        float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_nearbyint(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.nearbyint.f32(float [[F1]])
  f2 = __builtin_elementwise_nearbyint(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.nearbyint.f64(double [[D1]])
  d2 = __builtin_elementwise_nearbyint(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_nearbyint(vf1);
}

void test_builtin_elementwise_sin(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sin(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sin.f32(float [[F1]])
  f2 = __builtin_elementwise_sin(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sin.f64(double [[D1]])
  d2 = __builtin_elementwise_sin(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sin.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sin(vf1);
}

void test_builtin_elementwise_sinh(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sinh(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sinh.f32(float [[F1]])
  f2 = __builtin_elementwise_sinh(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sinh.f64(double [[D1]])
  d2 = __builtin_elementwise_sinh(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sinh.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sinh(vf1);
}

void test_builtin_elementwise_sqrt(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sqrt(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sqrt.f32(float [[F1]])
  f2 = __builtin_elementwise_sqrt(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sqrt.f64(double [[D1]])
  d2 = __builtin_elementwise_sqrt(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sqrt(vf1);
}

void test_builtin_elementwise_tan(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_tan(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.tan.f32(float [[F1]])
  f2 = __builtin_elementwise_tan(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.tan.f64(double [[D1]])
  d2 = __builtin_elementwise_tan(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.tan.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_tan(vf1);
}

void test_builtin_elementwise_tanh(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_tanh(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.tanh.f32(float [[F1]])
  f2 = __builtin_elementwise_tanh(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.tanh.f64(double [[D1]])
  d2 = __builtin_elementwise_tanh(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.tanh.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_tanh(vf1);
}

void test_builtin_elementwise_trunc(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_trunc(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.trunc.f32(float [[F1]])
  f2 = __builtin_elementwise_trunc(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.trunc.f64(double [[D1]])
  d2 = __builtin_elementwise_trunc(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.trunc.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_trunc(vf1);
}

void test_builtin_elementwise_canonicalize(float f1, float f2, double d1, double d2,
                                           float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_canonicalize(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.canonicalize.f32(float [[F1]])
  f2 = __builtin_elementwise_canonicalize(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.canonicalize.f64(double [[D1]])
  d2 = __builtin_elementwise_canonicalize(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.canonicalize.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_canonicalize(vf1);
}

void test_builtin_elementwise_copysign(float f1, float f2, double d1, double d2,
                                       float4 vf1, float4 vf2, double2 v2f64) {
  // CHECK-LABEL: define void @test_builtin_elementwise_copysign(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.copysign.f32(float %0, float %1)
  f1 = __builtin_elementwise_copysign(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.copysign.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_copysign(d1, d2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.copysign.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_copysign(d1, 2.0);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_copysign(vf1, vf2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_copysign(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_copysign(vf2, cvf1);

  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr
  // CHECK-NEXT: call float @llvm.copysign.f32(float [[F1]], float 2.000000e+00)
  f1 = __builtin_elementwise_copysign(f1, 2.0f);

  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr
  // CHECK-NEXT: call float @llvm.copysign.f32(float 2.000000e+00, float [[F1]])
  f1 = __builtin_elementwise_copysign(2.0f, f1);
  // CHECK: [[V2F64:%.+]] = load <2 x double>, ptr %v2f64.addr, align 16
  // CHECK-NEXT: call <2 x double> @llvm.copysign.v2f64(<2 x double> splat (double 1.000000e+00), <2 x double> [[V2F64]])
  v2f64 = __builtin_elementwise_copysign((double2)1.0, v2f64);
}

void test_builtin_elementwise_fma(float f32, double f64,
                                  float2 v2f32, float4 v4f32,
                                  double2 v2f64, double3 v3f64,
                                  const float4 c_v4f32,
                                  half f16, half2 v2f16) {
  // CHECK-LABEL: define void @test_builtin_elementwise_fma(
  // CHECK: [[F32_0:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: [[F32_1:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: [[F32_2:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: call float @llvm.fma.f32(float [[F32_0]], float [[F32_1]], float [[F32_2]])
  float f2 = __builtin_elementwise_fma(f32, f32, f32);

  // CHECK: [[F64_0:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_1:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_2:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: call double @llvm.fma.f64(double [[F64_0]], double [[F64_1]], double [[F64_2]])
  double d2 = __builtin_elementwise_fma(f64, f64, f64);

  // CHECK: [[V4F32_0:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: [[V4F32_1:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: [[V4F32_2:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4F32_0]], <4 x float> [[V4F32_1]], <4 x float> [[V4F32_2]])
  float4 tmp_v4f32 = __builtin_elementwise_fma(v4f32, v4f32, v4f32);

  // FIXME: Are we really still doing the 3 vector load workaround?
  // CHECK: [[V3F64_LOAD_0:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_0:%.+]] = shufflevector
  // CHECK-NEXT: [[V3F64_LOAD_1:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_1:%.+]] = shufflevector
  // CHECK-NEXT: [[V3F64_LOAD_2:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_2:%.+]] = shufflevector
  // CHECK-NEXT: call <3 x double> @llvm.fma.v3f64(<3 x double> [[V3F64_0]], <3 x double> [[V3F64_1]], <3 x double> [[V3F64_2]])
  v3f64 = __builtin_elementwise_fma(v3f64, v3f64, v3f64);

  // CHECK: [[F64_0:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_1:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_2:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: call double @llvm.fma.f64(double [[F64_0]], double [[F64_1]], double [[F64_2]])
  v2f64 = __builtin_elementwise_fma(f64, f64, f64);

  // CHECK: [[V4F32_0:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: [[V4F32_1:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: [[V4F32_2:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4F32_0]], <4 x float> [[V4F32_1]], <4 x float> [[V4F32_2]])
  v4f32 = __builtin_elementwise_fma(c_v4f32, c_v4f32, c_v4f32);

  // CHECK: [[F16_0:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[F16_1:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[F16_2:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: call half @llvm.fma.f16(half [[F16_0]], half [[F16_1]], half [[F16_2]])
  half tmp_f16 = __builtin_elementwise_fma(f16, f16, f16);

  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_2:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> [[V2F16_2]])
  half2 tmp0_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, v2f16);
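
  // Casting the scalar f16 to half2 broadcasts it via
  // insertelement + shufflevector.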
  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[F16_2:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[V2F16_2_INSERT:%.+]] = insertelement
  // CHECK-NEXT: [[V2F16_2:%.+]] = shufflevector <2 x half> [[V2F16_2_INSERT]], <2 x half> poison, <2 x i32> zeroinitializer
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> [[V2F16_2]])
  half2 tmp1_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, (half2)f16);

  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> splat (half 0xH4400))
  half2 tmp2_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, (half2)4.0);
}