// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s

typedef _Float16 half;

typedef half half2 __attribute__((ext_vector_type(2)));
typedef float float2 __attribute__((ext_vector_type(2)));
typedef float float4 __attribute__((ext_vector_type(4)));
typedef short int si8 __attribute__((ext_vector_type(8)));
typedef unsigned int u4 __attribute__((ext_vector_type(4)));
typedef double double2 __attribute__((ext_vector_type(2)));
typedef double double3 __attribute__((ext_vector_type(3)));

__attribute__((address_space(1))) int int_as_one;
typedef int bar;
bar b;
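
// The typedefs above define OpenCL-style vector types via ext_vector_type
// (e.g. float4 is a <4 x float> value); int_as_one is an int placed in
// address space 1 and bar is a plain typedef of int, used below to check
// that address spaces and typedef sugar do not change the lowering.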

void test_builtin_elementwise_abs(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, si8 vi1, si8 vi2,
                                  long long int i1, long long int i2, short si,
                                  _BitInt(31) bi1, _BitInt(31) bi2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_abs(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.fabs.f32(float [[F1]])
  f2 = __builtin_elementwise_abs(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.fabs.f64(double [[D1]])
  d2 = __builtin_elementwise_abs(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_abs(vf1);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.abs.i64(i64 [[I1]], i1 false)
  i2 = __builtin_elementwise_abs(i1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[VI1]], i1 false)
  vi2 = __builtin_elementwise_abs(vi1);

  // CHECK: [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[CVI2]], i1 false)
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_abs(cvi2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: call i31 @llvm.abs.i31(i31 [[LOADEDV]], i1 false)
  bi2 = __builtin_elementwise_abs(bi1);

  // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.abs.i32(i32 [[IA1]], i1 false)
  b = __builtin_elementwise_abs(int_as_one);

  // CHECK: call i32 @llvm.abs.i32(i32 -10, i1 false)
  b = __builtin_elementwise_abs(-10);

  // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32
  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.abs.i32(i32 [[SI_EXT]], i1 false)
  // CHECK-NEXT: = trunc i32 [[RES]] to i16
  si = __builtin_elementwise_abs(si);
}
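
// Note on the integer cases above (illustrative comments only, not matched by
// FileCheck): the i1 false operand of llvm.abs is the "is_int_min_poison"
// flag, so abs of the most negative value wraps instead of producing poison,
// and the short operand is promoted to int before the intrinsic call. A rough
// scalar sketch of the last case:
//
//   short abs_short(short x) {
//     int wide = x;                      // integer promotion to i32
//     int r = wide < 0 ? -wide : wide;   // llvm.abs.i32(wide, i1 false)
//     return (short)r;                   // trunc i32 -> i16 on the store
//   }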

void test_builtin_elementwise_add_sat(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.sadd.sat.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_add_sat(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.sadd.sat.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_add_sat(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_add_sat(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.uadd.sat.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_add_sat(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_add_sat(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.sadd.sat.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_add_sat(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.uadd.sat.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_add_sat(bu1, bu2);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.sadd.sat.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_add_sat(int_as_one, b);

  // CHECK: store i64 98, ptr %i1.addr, align 8
  i1 = __builtin_elementwise_add_sat(1, 'a');
}
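
// The saturating-add cases pick llvm.sadd.sat or llvm.uadd.sat based on the
// signedness of the promoted operands, and fully constant operands are folded
// by the frontend ('a' is 97, so 1 + 'a' is stored as 98 directly). A rough
// scalar sketch of the signed 64-bit behaviour, assuming <limits.h> is
// available (illustrative only, not part of the checked test):
//
//   long long sadd_sat_ll(long long a, long long b) {
//     long long r;
//     if (!__builtin_add_overflow(a, b, &r))
//       return r;                            // no overflow: plain sum
//     return a < 0 ? LLONG_MIN : LLONG_MAX;  // overflow: clamp to the edge
//   }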

void test_builtin_elementwise_sub_sat(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ssub.sat.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_sub_sat(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ssub.sat.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_sub_sat(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_sub_sat(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.usub.sat.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_sub_sat(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_sub_sat(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.ssub.sat.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_sub_sat(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.usub.sat.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_sub_sat(bu1, bu2);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.ssub.sat.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_sub_sat(int_as_one, b);

  // CHECK: store i64 -96, ptr %i1.addr, align 8
  i1 = __builtin_elementwise_sub_sat(1, 'a');
}
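
// Same pattern as add_sat, using llvm.ssub.sat/llvm.usub.sat; the constant
// case folds to 1 - 'a' == 1 - 97 == -96, hence the plain store checked above.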

void test_builtin_elementwise_maximum(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_maximum(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.maximum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_maximum(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maximum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_maximum(d1, d2);

  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maximum.f64(double 2.000000e+01, double [[D2]])
  d1 = __builtin_elementwise_maximum(20.0, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maximum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_maximum(vf1, vf2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maximum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_maximum(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maximum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_maximum(vf2, cvf1);
}

void test_builtin_elementwise_minimum(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_minimum(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.minimum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_minimum(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.minimum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_minimum(d1, d2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.minimum.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_minimum(d1, 2.0);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minimum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_minimum(vf1, vf2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minimum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_minimum(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minimum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_minimum(vf2, cvf1);
}
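
// __builtin_elementwise_maximum/minimum lower to llvm.maximum/llvm.minimum,
// which (unlike the maxnum/minnum intrinsics used by __builtin_elementwise_max
// and __builtin_elementwise_min below) propagate NaN operands and order -0.0
// before +0.0.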

void test_builtin_elementwise_max(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, long long int i1,
                                  long long int i2, si8 vi1, si8 vi2,
                                  unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                  _BitInt(31) bi1, _BitInt(31) bi2,
                                  unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_max(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.maxnum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_max(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maxnum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_max(d1, d2);

  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maxnum.f64(double 2.000000e+01, double [[D2]])
  d1 = __builtin_elementwise_max(20.0, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_max(vf1, vf2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smax.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_max(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smax.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_max(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_max(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.umax.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_max(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_max(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.smax.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_max(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.umax.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_max(bu1, bu2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_max(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_max(vf2, cvf1);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.smax.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_max(int_as_one, b);

  // CHECK: call i32 @llvm.smax.i32(i32 1, i32 97)
  i1 = __builtin_elementwise_max(1, 'a');
}
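
// For __builtin_elementwise_max the intrinsic is chosen from the operand type:
// llvm.maxnum for floating point, llvm.smax for signed and llvm.umax for
// unsigned integers (including the _BitInt cases, which are truncated to their
// exact bit width first). With -disable-llvm-passes the all-constant call is
// still emitted as llvm.smax.i32(i32 1, i32 97) rather than folded.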

void test_builtin_elementwise_min(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, long long int i1,
                                  long long int i2, si8 vi1, si8 vi2,
                                  unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                  _BitInt(31) bi1, _BitInt(31) bi2,
                                  unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_min(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.minnum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_min(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.minnum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_min(d1, d2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.minnum.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_min(d1, 2.0);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_min(vf1, vf2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_min(i1, i2);

  // CHECK: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 -11, i64 [[I2]])
  i1 = __builtin_elementwise_min(-11, i2);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_min(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_min(u1, u2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[ZEXT_U1:%.+]] = zext i32 [[U1]] to i64
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 [[ZEXT_U1]], i64 [[I2]])
  u1 = __builtin_elementwise_min(u1, i2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_min(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
  // CHECK-NEXT: [[LOADEDV1:%.+]] = trunc i32 [[BI2]] to i31
  // CHECK-NEXT: call i31 @llvm.smin.i31(i31 [[LOADEDV]], i31 [[LOADEDV1]])
  bi1 = __builtin_elementwise_min(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i64, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[LOADEDV2:%.+]] = trunc i64 [[BU1]] to i55
  // CHECK-NEXT: [[BU2:%.+]] = load i64, ptr %bu2.addr, align 8
  // CHECK-NEXT: [[LOADEDV3:%.+]] = trunc i64 [[BU2]] to i55
  // CHECK-NEXT: call i55 @llvm.umin.i55(i55 [[LOADEDV2]], i55 [[LOADEDV3]])
  bu1 = __builtin_elementwise_min(bu1, bu2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_min(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_min(vf2, cvf1);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.smin.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_min(int_as_one, b);
}
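
// The mixed-type case above follows the usual arithmetic conversions: the
// unsigned int operand is zero-extended to long long, so the comparison is a
// signed 64-bit llvm.smin even though one source operand was unsigned.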

void test_builtin_elementwise_bitreverse(si8 vi1, si8 vi2,
                                         long long int i1, long long int i2, short si,
                                         _BitInt(31) bi1, _BitInt(31) bi2) {

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.bitreverse.i64(i64 [[I1]])
  i2 = __builtin_elementwise_bitreverse(i1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[VI1]])
  vi2 = __builtin_elementwise_bitreverse(vi1);

  // CHECK: [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[CVI2]])
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_bitreverse(cvi2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: call i31 @llvm.bitreverse.i31(i31 [[LOADEDV]])
  bi2 = __builtin_elementwise_bitreverse(bi1);

  // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.bitreverse.i32(i32 [[IA1]])
  b = __builtin_elementwise_bitreverse(int_as_one);

  // CHECK: store i32 1879048191, ptr @b, align 4
  b = __builtin_elementwise_bitreverse(-10);

  // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32
  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.bitreverse.i32(i32 [[SI_EXT]])
  // CHECK-NEXT: = trunc i32 [[RES]] to i16
  si = __builtin_elementwise_bitreverse(si);
}
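
// llvm.bitreverse mirrors the bit pattern of each element. The folded constant
// case reads: -10 as a 32-bit int is 0xFFFFFFF6, whose bit reversal is
// 0x6FFFFFFF, i.e. the 1879048191 stored above.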

void test_builtin_elementwise_ceil(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_ceil(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.ceil.f32(float [[F1]])
  f2 = __builtin_elementwise_ceil(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.ceil.f64(double [[D1]])
  d2 = __builtin_elementwise_ceil(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.ceil.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_ceil(vf1);
}

void test_builtin_elementwise_acos(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_acos(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.acos.f32(float [[F1]])
  f2 = __builtin_elementwise_acos(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.acos.f64(double [[D1]])
  d2 = __builtin_elementwise_acos(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.acos.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_acos(vf1);
}

void test_builtin_elementwise_asin(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_asin(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.asin.f32(float [[F1]])
  f2 = __builtin_elementwise_asin(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.asin.f64(double [[D1]])
  d2 = __builtin_elementwise_asin(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.asin.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_asin(vf1);
}

void test_builtin_elementwise_atan(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_atan(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.atan.f32(float [[F1]])
  f2 = __builtin_elementwise_atan(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.atan.f64(double [[D1]])
  d2 = __builtin_elementwise_atan(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.atan.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_atan(vf1);
}

void test_builtin_elementwise_atan2(float f1, float f2, float f3, double d1,
                                    double d2, double d3, float4 vf1,
                                    float4 vf2, float4 vf3) {
  // CHECK-LABEL: define void @test_builtin_elementwise_atan2(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.atan2.f32(float [[F1]], float [[F2]])
  f3 = __builtin_elementwise_atan2(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.atan2.f64(double [[D1]], double [[D2]])
  d3 = __builtin_elementwise_atan2(d1, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.atan2.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf3 = __builtin_elementwise_atan2(vf1, vf2);
}

void test_builtin_elementwise_cos(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_cos(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.cos.f32(float [[F1]])
  f2 = __builtin_elementwise_cos(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.cos.f64(double [[D1]])
  d2 = __builtin_elementwise_cos(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.cos.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_cos(vf1);
}

void test_builtin_elementwise_cosh(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_cosh(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.cosh.f32(float [[F1]])
  f2 = __builtin_elementwise_cosh(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.cosh.f64(double [[D1]])
  d2 = __builtin_elementwise_cosh(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.cosh.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_cosh(vf1);
}

void test_builtin_elementwise_exp(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_exp(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.exp.f32(float [[F1]])
  f2 = __builtin_elementwise_exp(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.exp.f64(double [[D1]])
  d2 = __builtin_elementwise_exp(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.exp.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_exp(vf1);
}

void test_builtin_elementwise_exp2(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_exp2(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.exp2.f32(float [[F1]])
  f2 = __builtin_elementwise_exp2(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.exp2.f64(double [[D1]])
  d2 = __builtin_elementwise_exp2(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.exp2.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_exp2(vf1);
}

void test_builtin_elementwise_floor(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_floor(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.floor.f32(float [[F1]])
  f2 = __builtin_elementwise_floor(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.floor.f64(double [[D1]])
  d2 = __builtin_elementwise_floor(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.floor.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_floor(vf1);
}

void test_builtin_elementwise_log(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log.f32(float [[F1]])
  f2 = __builtin_elementwise_log(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log.f64(double [[D1]])
  d2 = __builtin_elementwise_log(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log(vf1);
}

void test_builtin_elementwise_log10(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log10(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log10.f32(float [[F1]])
  f2 = __builtin_elementwise_log10(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log10.f64(double [[D1]])
  d2 = __builtin_elementwise_log10(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log10.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log10(vf1);
}

void test_builtin_elementwise_log2(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log2(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log2.f32(float [[F1]])
  f2 = __builtin_elementwise_log2(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log2.f64(double [[D1]])
  d2 = __builtin_elementwise_log2(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log2.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log2(vf1);
}
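
// Each of the unary math builtins above (ceil through log2, and roundeven
// through canonicalize further down) follows the same shape: the scalar float
// and double operands and the float4 operand all lower directly to the
// matching llvm.* intrinsic at the corresponding scalar or vector type.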

void test_builtin_elementwise_popcount(si8 vi1, si8 vi2, long long int i1,
                                       long long int i2, short si,
                                       _BitInt(31) bi1, _BitInt(31) bi2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ctpop.i64(i64 [[I1]])
  i2 = __builtin_elementwise_popcount(i1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[VI1]])
  vi2 = __builtin_elementwise_popcount(vi1);

  // CHECK: [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[CVI2]])
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_popcount(cvi2);

  // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[LOADEDV:%.+]] = trunc i32 [[BI1]] to i31
  // CHECK-NEXT: call i31 @llvm.ctpop.i31(i31 [[LOADEDV]])
  bi2 = __builtin_elementwise_popcount(bi1);

  // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.ctpop.i32(i32 [[IA1]])
  b = __builtin_elementwise_popcount(int_as_one);

  // CHECK: store i32 30, ptr @b, align 4
  b = __builtin_elementwise_popcount(-10);

  // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32
  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.ctpop.i32(i32 [[SI_EXT]])
  // CHECK-NEXT: = trunc i32 [[RES]] to i16
  si = __builtin_elementwise_popcount(si);
}
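
// llvm.ctpop counts set bits per element. The constant case folds because
// -10 as a 32-bit int is 0xFFFFFFF6, which has 30 one bits, matching the
// store of 30 above; the short operand is promoted to int before the count
// and truncated on the way back.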

void test_builtin_elementwise_fmod(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {

  // CHECK-LABEL: define void @test_builtin_elementwise_fmod(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: frem float [[F1]], [[F2]]
  f2 = __builtin_elementwise_fmod(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: frem double [[D1]], [[D2]]
  d2 = __builtin_elementwise_fmod(d1, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: frem <4 x float> [[VF1]], [[VF2]]
  vf2 = __builtin_elementwise_fmod(vf1, vf2);
}
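
// __builtin_elementwise_fmod is the one case here that lowers to a plain IR
// instruction (frem) rather than an intrinsic call; frem has fmod semantics,
// so the result carries the sign of the dividend (for example, frem 7.5, 2.0
// is 1.5 and frem -7.5, 2.0 is -1.5).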

void test_builtin_elementwise_pow(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {

  // CHECK-LABEL: define void @test_builtin_elementwise_pow(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.pow.f32(float [[F1]], float [[F2]])
  f2 = __builtin_elementwise_pow(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.pow.f64(double [[D1]], double [[D2]])
  d2 = __builtin_elementwise_pow(d1, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.pow.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf2 = __builtin_elementwise_pow(vf1, vf2);
}

void test_builtin_elementwise_roundeven(float f1, float f2, double d1, double d2,
                                        float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_roundeven(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.roundeven.f32(float [[F1]])
  f2 = __builtin_elementwise_roundeven(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.roundeven.f64(double [[D1]])
  d2 = __builtin_elementwise_roundeven(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.roundeven.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_roundeven(vf1);
}

void test_builtin_elementwise_round(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_round(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.round.f32(float [[F1]])
  f2 = __builtin_elementwise_round(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.round.f64(double [[D1]])
  d2 = __builtin_elementwise_round(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.round.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_round(vf1);
}

void test_builtin_elementwise_rint(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_rint(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.rint.f32(float [[F1]])
  f2 = __builtin_elementwise_rint(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.rint.f64(double [[D1]])
  d2 = __builtin_elementwise_rint(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.rint.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_rint(vf1);
}

void test_builtin_elementwise_nearbyint(float f1, float f2, double d1, double d2,
                                        float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_nearbyint(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.nearbyint.f32(float [[F1]])
  f2 = __builtin_elementwise_nearbyint(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.nearbyint.f64(double [[D1]])
  d2 = __builtin_elementwise_nearbyint(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_nearbyint(vf1);
}

void test_builtin_elementwise_sin(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sin(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sin.f32(float [[F1]])
  f2 = __builtin_elementwise_sin(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sin.f64(double [[D1]])
  d2 = __builtin_elementwise_sin(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sin.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sin(vf1);
}

void test_builtin_elementwise_sinh(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sinh(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sinh.f32(float [[F1]])
  f2 = __builtin_elementwise_sinh(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sinh.f64(double [[D1]])
  d2 = __builtin_elementwise_sinh(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sinh.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sinh(vf1);
}

void test_builtin_elementwise_sqrt(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sqrt(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sqrt.f32(float [[F1]])
  f2 = __builtin_elementwise_sqrt(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sqrt.f64(double [[D1]])
  d2 = __builtin_elementwise_sqrt(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sqrt(vf1);
}

void test_builtin_elementwise_tan(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_tan(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.tan.f32(float [[F1]])
  f2 = __builtin_elementwise_tan(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.tan.f64(double [[D1]])
  d2 = __builtin_elementwise_tan(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.tan.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_tan(vf1);
}

void test_builtin_elementwise_tanh(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_tanh(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.tanh.f32(float [[F1]])
  f2 = __builtin_elementwise_tanh(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.tanh.f64(double [[D1]])
  d2 = __builtin_elementwise_tanh(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.tanh.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_tanh(vf1);
}

void test_builtin_elementwise_trunc(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_trunc(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.trunc.f32(float [[F1]])
  f2 = __builtin_elementwise_trunc(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.trunc.f64(double [[D1]])
  d2 = __builtin_elementwise_trunc(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.trunc.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_trunc(vf1);
}

void test_builtin_elementwise_canonicalize(float f1, float f2, double d1, double d2,
                                           float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_canonicalize(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.canonicalize.f32(float [[F1]])
  f2 = __builtin_elementwise_canonicalize(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.canonicalize.f64(double [[D1]])
  d2 = __builtin_elementwise_canonicalize(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.canonicalize.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_canonicalize(vf1);
}

void test_builtin_elementwise_copysign(float f1, float f2, double d1, double d2,
                                       float4 vf1, float4 vf2, double2 v2f64) {
  // CHECK-LABEL: define void @test_builtin_elementwise_copysign(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.copysign.f32(float %0, float %1)
  f1 = __builtin_elementwise_copysign(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.copysign.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_copysign(d1, d2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.copysign.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_copysign(d1, 2.0);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_copysign(vf1, vf2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_copysign(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_copysign(vf2, cvf1);

  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr
  // CHECK-NEXT: call float @llvm.copysign.f32(float [[F1]], float 2.000000e+00)
  f1 = __builtin_elementwise_copysign(f1, 2.0f);

  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr
  // CHECK-NEXT: call float @llvm.copysign.f32(float 2.000000e+00, float [[F1]])
  f1 = __builtin_elementwise_copysign(2.0f, f1);

  // CHECK: [[V2F64:%.+]] = load <2 x double>, ptr %v2f64.addr, align 16
  // CHECK-NEXT: call <2 x double> @llvm.copysign.v2f64(<2 x double> splat (double 1.000000e+00), <2 x double> [[V2F64]])
  v2f64 = __builtin_elementwise_copysign((double2)1.0, v2f64);
}
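
// llvm.copysign takes the magnitude from the first operand and the sign bit
// from the second, so e.g. copysign(1.0, -0.0) is -1.0. The (double2)1.0
// operand above is a vector splat, which the IR prints as
// splat (double 1.000000e+00).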

void test_builtin_elementwise_fma(float f32, double f64,
                                  float2 v2f32, float4 v4f32,
                                  double2 v2f64, double3 v3f64,
                                  const float4 c_v4f32,
                                  half f16, half2 v2f16) {
  // CHECK-LABEL: define void @test_builtin_elementwise_fma(
  // CHECK: [[F32_0:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: [[F32_1:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: [[F32_2:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: call float @llvm.fma.f32(float [[F32_0]], float [[F32_1]], float [[F32_2]])
  float f2 = __builtin_elementwise_fma(f32, f32, f32);

  // CHECK: [[F64_0:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_1:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_2:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: call double @llvm.fma.f64(double [[F64_0]], double [[F64_1]], double [[F64_2]])
  double d2 = __builtin_elementwise_fma(f64, f64, f64);

  // CHECK: [[V4F32_0:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: [[V4F32_1:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: [[V4F32_2:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4F32_0]], <4 x float> [[V4F32_1]], <4 x float> [[V4F32_2]])
  float4 tmp_v4f32 = __builtin_elementwise_fma(v4f32, v4f32, v4f32);

  // FIXME: Are we really still doing the 3 vector load workaround?
  // CHECK: [[V3F64_LOAD_0:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_0:%.+]] = shufflevector
  // CHECK-NEXT: [[V3F64_LOAD_1:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_1:%.+]] = shufflevector
  // CHECK-NEXT: [[V3F64_LOAD_2:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_2:%.+]] = shufflevector
  // CHECK-NEXT: call <3 x double> @llvm.fma.v3f64(<3 x double> [[V3F64_0]], <3 x double> [[V3F64_1]], <3 x double> [[V3F64_2]])
  v3f64 = __builtin_elementwise_fma(v3f64, v3f64, v3f64);

  // CHECK: [[F64_0:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_1:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_2:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: call double @llvm.fma.f64(double [[F64_0]], double [[F64_1]], double [[F64_2]])
  v2f64 = __builtin_elementwise_fma(f64, f64, f64);

  // CHECK: [[V4F32_0:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: [[V4F32_1:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: [[V4F32_2:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4F32_0]], <4 x float> [[V4F32_1]], <4 x float> [[V4F32_2]])
  v4f32 = __builtin_elementwise_fma(c_v4f32, c_v4f32, c_v4f32);

  // CHECK: [[F16_0:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[F16_1:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[F16_2:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: call half @llvm.fma.f16(half [[F16_0]], half [[F16_1]], half [[F16_2]])
  half tmp_f16 = __builtin_elementwise_fma(f16, f16, f16);

  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_2:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> [[V2F16_2]])
  half2 tmp0_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, v2f16);

  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[F16_2:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[V2F16_2_INSERT:%.+]] = insertelement
  // CHECK-NEXT: [[V2F16_2:%.+]] = shufflevector <2 x half> [[V2F16_2_INSERT]], <2 x half> poison, <2 x i32> zeroinitializer
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> [[V2F16_2]])
  half2 tmp1_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, (half2)f16);

  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> splat (half 0xH4400))
  half2 tmp2_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, (half2)4.0);
}
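
// llvm.fma is a fused multiply-add (a * b + c with a single rounding). In the
// last case the (half2)4.0 operand becomes a vector splat whose half encoding
// is 0xH4400, i.e. IEEE binary16 4.0, as checked above; the (half2)f16 case
// instead broadcasts the scalar via insertelement + shufflevector.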