// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
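
// This test verifies CodeGen for the __builtin_elementwise_* builtins: each
// call should lower to the matching llvm.* intrinsic on the scalar or vector
// operand type. LLVM passes are disabled so the unoptimized loads and
// intrinsic calls can be matched directly.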

typedef _Float16 half;

typedef half half2 __attribute__((ext_vector_type(2)));
typedef float float2 __attribute__((ext_vector_type(2)));
typedef float float4 __attribute__((ext_vector_type(4)));
typedef short int si8 __attribute__((ext_vector_type(8)));
typedef unsigned int u4 __attribute__((ext_vector_type(4)));
typedef double double2 __attribute__((ext_vector_type(2)));
typedef double double3 __attribute__((ext_vector_type(3)));

__attribute__((address_space(1))) int int_as_one;
typedef int bar;
bar b;
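
// int_as_one lives in a non-default address space and b is an int behind a
// typedef; they check that address-space-qualified loads and sugared types
// are handled by the builtins.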

void test_builtin_elementwise_abs(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, si8 vi1, si8 vi2,
                                  long long int i1, long long int i2, short si,
                                  _BitInt(31) bi1, _BitInt(31) bi2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_abs(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.fabs.f32(float [[F1]])
  f2 = __builtin_elementwise_abs(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.fabs.f64(double [[D1]])
  d2 = __builtin_elementwise_abs(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_abs(vf1);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.abs.i64(i64 [[I1]], i1 false)
  i2 = __builtin_elementwise_abs(i1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[VI1]], i1 false)
  vi2 = __builtin_elementwise_abs(vi1);

  // CHECK: [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[CVI2]], i1 false)
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_abs(cvi2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: call i31 @llvm.abs.i31(i31 [[BI1]], i1 false)
  bi2 = __builtin_elementwise_abs(bi1);

  // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.abs.i32(i32 [[IA1]], i1 false)
  b = __builtin_elementwise_abs(int_as_one);

  // CHECK: call i32 @llvm.abs.i32(i32 -10, i1 false)
  b = __builtin_elementwise_abs(-10);
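
  // 'short' operands are promoted to int before the elementwise operation, so
  // the intrinsic is emitted at i32 and the result truncated back to i16.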
  // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32
  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.abs.i32(i32 [[SI_EXT]], i1 false)
  // CHECK-NEXT: = trunc i32 [[RES]] to i16
  si = __builtin_elementwise_abs(si);
}

void test_builtin_elementwise_add_sat(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.sadd.sat.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_add_sat(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.sadd.sat.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_add_sat(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_add_sat(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.uadd.sat.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_add_sat(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_add_sat(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4
  // CHECK-NEXT: call i31 @llvm.sadd.sat.i31(i31 [[BI1]], i31 [[BI2]])
  bi1 = __builtin_elementwise_add_sat(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8
  // CHECK-NEXT: call i55 @llvm.uadd.sat.i55(i55 [[BU1]], i55 [[BU2]])
  bu1 = __builtin_elementwise_add_sat(bu1, bu2);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.sadd.sat.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_add_sat(int_as_one, b);

  // CHECK: call i32 @llvm.sadd.sat.i32(i32 1, i32 97)
  i1 = __builtin_elementwise_add_sat(1, 'a');
}

void test_builtin_elementwise_sub_sat(float f1, float f2, double d1, double d2,
                                      float4 vf1, float4 vf2, long long int i1,
                                      long long int i2, si8 vi1, si8 vi2,
                                      unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                      _BitInt(31) bi1, _BitInt(31) bi2,
                                      unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ssub.sat.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_sub_sat(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.ssub.sat.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_sub_sat(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_sub_sat(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.usub.sat.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_sub_sat(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_sub_sat(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4
  // CHECK-NEXT: call i31 @llvm.ssub.sat.i31(i31 [[BI1]], i31 [[BI2]])
  bi1 = __builtin_elementwise_sub_sat(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8
  // CHECK-NEXT: call i55 @llvm.usub.sat.i55(i55 [[BU1]], i55 [[BU2]])
  bu1 = __builtin_elementwise_sub_sat(bu1, bu2);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.ssub.sat.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_sub_sat(int_as_one, b);

  // CHECK: call i32 @llvm.ssub.sat.i32(i32 1, i32 97)
  i1 = __builtin_elementwise_sub_sat(1, 'a');
}

void test_builtin_elementwise_max(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, long long int i1,
                                  long long int i2, si8 vi1, si8 vi2,
                                  unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                  _BitInt(31) bi1, _BitInt(31) bi2,
                                  unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_max(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.maxnum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_max(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maxnum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_max(d1, d2);

  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.maxnum.f64(double 2.000000e+01, double [[D2]])
  d1 = __builtin_elementwise_max(20.0, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_max(vf1, vf2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smax.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_max(i1, i2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smax.i64(i64 [[I1]], i64 10)
  i1 = __builtin_elementwise_max(i1, 10);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_max(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.umax.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_max(u1, u2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_max(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4
  // CHECK-NEXT: call i31 @llvm.smax.i31(i31 [[BI1]], i31 [[BI2]])
  bi1 = __builtin_elementwise_max(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8
  // CHECK-NEXT: call i55 @llvm.umax.i55(i55 [[BU1]], i55 [[BU2]])
  bu1 = __builtin_elementwise_max(bu1, bu2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_max(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_max(vf2, cvf1);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.smax.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_max(int_as_one, b);

  // CHECK: call i32 @llvm.smax.i32(i32 1, i32 97)
  i1 = __builtin_elementwise_max(1, 'a');
}

void test_builtin_elementwise_min(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2, long long int i1,
                                  long long int i2, si8 vi1, si8 vi2,
                                  unsigned u1, unsigned u2, u4 vu1, u4 vu2,
                                  _BitInt(31) bi1, _BitInt(31) bi2,
                                  unsigned _BitInt(55) bu1, unsigned _BitInt(55) bu2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_min(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.minnum.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_min(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.minnum.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_min(d1, d2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.minnum.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_min(d1, 2.0);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_min(vf1, vf2);

  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 [[I1]], i64 [[I2]])
  i1 = __builtin_elementwise_min(i1, i2);

  // CHECK: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 -11, i64 [[I2]])
  i1 = __builtin_elementwise_min(-11, i2);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: [[VI2:%.+]] = load <8 x i16>, ptr %vi2.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[VI1]], <8 x i16> [[VI2]])
  vi1 = __builtin_elementwise_min(vi1, vi2);

  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[U2:%.+]] = load i32, ptr %u2.addr, align 4
  // CHECK-NEXT: call i32 @llvm.umin.i32(i32 [[U1]], i32 [[U2]])
  u1 = __builtin_elementwise_min(u1, u2);
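
  // Mixed unsigned/signed operands are converted to the common type first:
  // u1 is zero-extended to i64 and the signed llvm.smin intrinsic is used.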
  // CHECK: [[U1:%.+]] = load i32, ptr %u1.addr, align 4
  // CHECK-NEXT: [[ZEXT_U1:%.+]] = zext i32 [[U1]] to i64
  // CHECK-NEXT: [[I2:%.+]] = load i64, ptr %i2.addr, align 8
  // CHECK-NEXT: call i64 @llvm.smin.i64(i64 [[ZEXT_U1]], i64 [[I2]])
  u1 = __builtin_elementwise_min(u1, i2);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
  // CHECK-NEXT: call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]])
  vu1 = __builtin_elementwise_min(vu1, vu2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: [[BI2:%.+]] = load i31, ptr %bi2.addr, align 4
  // CHECK-NEXT: call i31 @llvm.smin.i31(i31 [[BI1]], i31 [[BI2]])
  bi1 = __builtin_elementwise_min(bi1, bi2);

  // CHECK: [[BU1:%.+]] = load i55, ptr %bu1.addr, align 8
  // CHECK-NEXT: [[BU2:%.+]] = load i55, ptr %bu2.addr, align 8
  // CHECK-NEXT: call i55 @llvm.umin.i55(i55 [[BU1]], i55 [[BU2]])
  bu1 = __builtin_elementwise_min(bu1, bu2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_min(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_min(vf2, cvf1);

  // CHECK: [[IAS1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: [[B:%.+]] = load i32, ptr @b, align 4
  // CHECK-NEXT: call i32 @llvm.smin.i32(i32 [[IAS1]], i32 [[B]])
  int_as_one = __builtin_elementwise_min(int_as_one, b);
}

void test_builtin_elementwise_bitreverse(si8 vi1, si8 vi2,
                                         long long int i1, long long int i2, short si,
                                         _BitInt(31) bi1, _BitInt(31) bi2) {
  // CHECK: [[I1:%.+]] = load i64, ptr %i1.addr, align 8
  // CHECK-NEXT: call i64 @llvm.bitreverse.i64(i64 [[I1]])
  i2 = __builtin_elementwise_bitreverse(i1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[VI1]])
  vi2 = __builtin_elementwise_bitreverse(vi1);

  // CHECK: [[CVI2:%.+]] = load <8 x i16>, ptr %cvi2, align 16
  // CHECK-NEXT: call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[CVI2]])
  const si8 cvi2 = vi2;
  vi2 = __builtin_elementwise_bitreverse(cvi2);

  // CHECK: [[BI1:%.+]] = load i31, ptr %bi1.addr, align 4
  // CHECK-NEXT: call i31 @llvm.bitreverse.i31(i31 [[BI1]])
  bi2 = __builtin_elementwise_bitreverse(bi1);

  // CHECK: [[IA1:%.+]] = load i32, ptr addrspace(1) @int_as_one, align 4
  // CHECK-NEXT: call i32 @llvm.bitreverse.i32(i32 [[IA1]])
  b = __builtin_elementwise_bitreverse(int_as_one);

  // CHECK: call i32 @llvm.bitreverse.i32(i32 -10)
  b = __builtin_elementwise_bitreverse(-10);

  // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2
  // CHECK-NEXT: [[SI_EXT:%.+]] = sext i16 [[SI]] to i32
  // CHECK-NEXT: [[RES:%.+]] = call i32 @llvm.bitreverse.i32(i32 [[SI_EXT]])
  // CHECK-NEXT: = trunc i32 [[RES]] to i16
  si = __builtin_elementwise_bitreverse(si);
}

void test_builtin_elementwise_ceil(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_ceil(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.ceil.f32(float [[F1]])
  f2 = __builtin_elementwise_ceil(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.ceil.f64(double [[D1]])
  d2 = __builtin_elementwise_ceil(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.ceil.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_ceil(vf1);
}

void test_builtin_elementwise_cos(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_cos(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.cos.f32(float [[F1]])
  f2 = __builtin_elementwise_cos(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.cos.f64(double [[D1]])
  d2 = __builtin_elementwise_cos(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.cos.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_cos(vf1);
}

void test_builtin_elementwise_exp(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_exp(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.exp.f32(float [[F1]])
  f2 = __builtin_elementwise_exp(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.exp.f64(double [[D1]])
  d2 = __builtin_elementwise_exp(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.exp.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_exp(vf1);
}

void test_builtin_elementwise_exp2(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_exp2(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.exp2.f32(float [[F1]])
  f2 = __builtin_elementwise_exp2(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.exp2.f64(double [[D1]])
  d2 = __builtin_elementwise_exp2(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.exp2.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_exp2(vf1);
}

void test_builtin_elementwise_floor(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_floor(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.floor.f32(float [[F1]])
  f2 = __builtin_elementwise_floor(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.floor.f64(double [[D1]])
  d2 = __builtin_elementwise_floor(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.floor.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_floor(vf1);
}

void test_builtin_elementwise_log(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log.f32(float [[F1]])
  f2 = __builtin_elementwise_log(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log.f64(double [[D1]])
  d2 = __builtin_elementwise_log(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log(vf1);
}

void test_builtin_elementwise_log10(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log10(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log10.f32(float [[F1]])
  f2 = __builtin_elementwise_log10(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log10.f64(double [[D1]])
  d2 = __builtin_elementwise_log10(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log10.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log10(vf1);
}

void test_builtin_elementwise_log2(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_log2(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.log2.f32(float [[F1]])
  f2 = __builtin_elementwise_log2(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.log2.f64(double [[D1]])
  d2 = __builtin_elementwise_log2(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.log2.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_log2(vf1);
}

void test_builtin_elementwise_pow(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {

  // CHECK-LABEL: define void @test_builtin_elementwise_pow(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.pow.f32(float [[F1]], float [[F2]])
  f2 = __builtin_elementwise_pow(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.pow.f64(double [[D1]], double [[D2]])
  d2 = __builtin_elementwise_pow(d1, d2);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.pow.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf2 = __builtin_elementwise_pow(vf1, vf2);
}

void test_builtin_elementwise_roundeven(float f1, float f2, double d1, double d2,
                                        float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_roundeven(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.roundeven.f32(float [[F1]])
  f2 = __builtin_elementwise_roundeven(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.roundeven.f64(double [[D1]])
  d2 = __builtin_elementwise_roundeven(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.roundeven.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_roundeven(vf1);
}

void test_builtin_elementwise_round(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_round(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.round.f32(float [[F1]])
  f2 = __builtin_elementwise_round(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.round.f64(double [[D1]])
  d2 = __builtin_elementwise_round(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.round.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_round(vf1);
}

void test_builtin_elementwise_rint(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_rint(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.rint.f32(float [[F1]])
  f2 = __builtin_elementwise_rint(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.rint.f64(double [[D1]])
  d2 = __builtin_elementwise_rint(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.rint.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_rint(vf1);
}

void test_builtin_elementwise_nearbyint(float f1, float f2, double d1, double d2,
                                        float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_nearbyint(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.nearbyint.f32(float [[F1]])
  f2 = __builtin_elementwise_nearbyint(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.nearbyint.f64(double [[D1]])
  d2 = __builtin_elementwise_nearbyint(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_nearbyint(vf1);
}

void test_builtin_elementwise_sin(float f1, float f2, double d1, double d2,
                                  float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sin(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sin.f32(float [[F1]])
  f2 = __builtin_elementwise_sin(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sin.f64(double [[D1]])
  d2 = __builtin_elementwise_sin(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sin.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sin(vf1);
}

void test_builtin_elementwise_sqrt(float f1, float f2, double d1, double d2,
                                   float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_sqrt(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.sqrt.f32(float [[F1]])
  f2 = __builtin_elementwise_sqrt(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.sqrt.f64(double [[D1]])
  d2 = __builtin_elementwise_sqrt(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_sqrt(vf1);
}

void test_builtin_elementwise_trunc(float f1, float f2, double d1, double d2,
                                    float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_trunc(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.trunc.f32(float [[F1]])
  f2 = __builtin_elementwise_trunc(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.trunc.f64(double [[D1]])
  d2 = __builtin_elementwise_trunc(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.trunc.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_trunc(vf1);
}

void test_builtin_elementwise_canonicalize(float f1, float f2, double d1, double d2,
                                           float4 vf1, float4 vf2) {
  // CHECK-LABEL: define void @test_builtin_elementwise_canonicalize(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: call float @llvm.canonicalize.f32(float [[F1]])
  f2 = __builtin_elementwise_canonicalize(f1);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.canonicalize.f64(double [[D1]])
  d2 = __builtin_elementwise_canonicalize(d1);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.canonicalize.v4f32(<4 x float> [[VF1]])
  vf2 = __builtin_elementwise_canonicalize(vf1);
}

void test_builtin_elementwise_copysign(float f1, float f2, double d1, double d2,
                                       float4 vf1, float4 vf2, double2 v2f64) {
  // CHECK-LABEL: define void @test_builtin_elementwise_copysign(
  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr, align 4
  // CHECK-NEXT: [[F2:%.+]] = load float, ptr %f2.addr, align 4
  // CHECK-NEXT: call float @llvm.copysign.f32(float [[F1]], float [[F2]])
  f1 = __builtin_elementwise_copysign(f1, f2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: [[D2:%.+]] = load double, ptr %d2.addr, align 8
  // CHECK-NEXT: call double @llvm.copysign.f64(double [[D1]], double [[D2]])
  d1 = __builtin_elementwise_copysign(d1, d2);

  // CHECK: [[D1:%.+]] = load double, ptr %d1.addr, align 8
  // CHECK-NEXT: call double @llvm.copysign.f64(double [[D1]], double 2.000000e+00)
  d1 = __builtin_elementwise_copysign(d1, 2.0);

  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[VF1]], <4 x float> [[VF2]])
  vf1 = __builtin_elementwise_copysign(vf1, vf2);

  // CHECK: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[CVF1]], <4 x float> [[VF2]])
  const float4 cvf1 = vf1;
  vf1 = __builtin_elementwise_copysign(cvf1, vf2);

  // CHECK: [[VF2:%.+]] = load <4 x float>, ptr %vf2.addr, align 16
  // CHECK-NEXT: [[CVF1:%.+]] = load <4 x float>, ptr %cvf1, align 16
  // CHECK-NEXT: call <4 x float> @llvm.copysign.v4f32(<4 x float> [[VF2]], <4 x float> [[CVF1]])
  vf1 = __builtin_elementwise_copysign(vf2, cvf1);

  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr
  // CHECK-NEXT: call float @llvm.copysign.f32(float [[F1]], float 2.000000e+00)
  f1 = __builtin_elementwise_copysign(f1, 2.0f);

  // CHECK: [[F1:%.+]] = load float, ptr %f1.addr
  // CHECK-NEXT: call float @llvm.copysign.f32(float 2.000000e+00, float [[F1]])
  f1 = __builtin_elementwise_copysign(2.0f, f1);

  // CHECK: [[V2F64:%.+]] = load <2 x double>, ptr %v2f64.addr, align 16
  // CHECK-NEXT: call <2 x double> @llvm.copysign.v2f64(<2 x double> <double 1.000000e+00, double 1.000000e+00>, <2 x double> [[V2F64]])
  v2f64 = __builtin_elementwise_copysign((double2)1.0, v2f64);
}

void test_builtin_elementwise_fma(float f32, double f64,
                                  float2 v2f32, float4 v4f32,
                                  double2 v2f64, double3 v3f64,
                                  const float4 c_v4f32,
                                  half f16, half2 v2f16) {
  // CHECK-LABEL: define void @test_builtin_elementwise_fma(
  // CHECK: [[F32_0:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: [[F32_1:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: [[F32_2:%.+]] = load float, ptr %f32.addr
  // CHECK-NEXT: call float @llvm.fma.f32(float [[F32_0]], float [[F32_1]], float [[F32_2]])
  float f2 = __builtin_elementwise_fma(f32, f32, f32);

  // CHECK: [[F64_0:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_1:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_2:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: call double @llvm.fma.f64(double [[F64_0]], double [[F64_1]], double [[F64_2]])
  double d2 = __builtin_elementwise_fma(f64, f64, f64);

  // CHECK: [[V4F32_0:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: [[V4F32_1:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: [[V4F32_2:%.+]] = load <4 x float>, ptr %v4f32.addr
  // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4F32_0]], <4 x float> [[V4F32_1]], <4 x float> [[V4F32_2]])
  float4 tmp_v4f32 = __builtin_elementwise_fma(v4f32, v4f32, v4f32);

  // FIXME: Are we really still doing the 3 vector load workaround?
  // CHECK: [[V3F64_LOAD_0:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_0:%.+]] = shufflevector
  // CHECK-NEXT: [[V3F64_LOAD_1:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_1:%.+]] = shufflevector
  // CHECK-NEXT: [[V3F64_LOAD_2:%.+]] = load <4 x double>, ptr %v3f64.addr
  // CHECK-NEXT: [[V3F64_2:%.+]] = shufflevector
  // CHECK-NEXT: call <3 x double> @llvm.fma.v3f64(<3 x double> [[V3F64_0]], <3 x double> [[V3F64_1]], <3 x double> [[V3F64_2]])
  v3f64 = __builtin_elementwise_fma(v3f64, v3f64, v3f64);

  // CHECK: [[F64_0:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_1:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: [[F64_2:%.+]] = load double, ptr %f64.addr
  // CHECK-NEXT: call double @llvm.fma.f64(double [[F64_0]], double [[F64_1]], double [[F64_2]])
  v2f64 = __builtin_elementwise_fma(f64, f64, f64);

  // CHECK: [[V4F32_0:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: [[V4F32_1:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: [[V4F32_2:%.+]] = load <4 x float>, ptr %c_v4f32.addr
  // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4F32_0]], <4 x float> [[V4F32_1]], <4 x float> [[V4F32_2]])
  v4f32 = __builtin_elementwise_fma(c_v4f32, c_v4f32, c_v4f32);

  // CHECK: [[F16_0:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[F16_1:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[F16_2:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: call half @llvm.fma.f16(half [[F16_0]], half [[F16_1]], half [[F16_2]])
  half tmp_f16 = __builtin_elementwise_fma(f16, f16, f16);

  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_2:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> [[V2F16_2]])
  half2 tmp0_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, v2f16);
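
  // Broadcasting a scalar operand to the vector type shows up as an
  // insertelement plus a zero-index shufflevector splat feeding the call.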
  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[F16_2:%.+]] = load half, ptr %f16.addr
  // CHECK-NEXT: [[V2F16_2_INSERT:%.+]] = insertelement
  // CHECK-NEXT: [[V2F16_2:%.+]] = shufflevector <2 x half> [[V2F16_2_INSERT]], <2 x half> poison, <2 x i32> zeroinitializer
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> [[V2F16_2]])
  half2 tmp1_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, (half2)f16);

  // CHECK: [[V2F16_0:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: [[V2F16_1:%.+]] = load <2 x half>, ptr %v2f16.addr
  // CHECK-NEXT: call <2 x half> @llvm.fma.v2f16(<2 x half> [[V2F16_0]], <2 x half> [[V2F16_1]], <2 x half> <half 0xH4400, half 0xH4400>)
  half2 tmp2_v2f16 = __builtin_elementwise_fma(v2f16, v2f16, (half2)4.0);
}