// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
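
// The tests below check that each __builtin_reduce_* call lowers to the
// matching llvm.vector.reduce.* intrinsic for its operand's element type.
// Constant vector operands use arbitrary initializer values; only the
// generated IR is checked.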

typedef float float4 __attribute__((ext_vector_type(4)));
typedef short int si8 __attribute__((ext_vector_type(8)));
typedef unsigned int u4 __attribute__((ext_vector_type(4)));

__attribute__((address_space(1))) float4 vf1_as_one;
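
// Floating-point operands select the fmax intrinsic, signed integer operands
// smax, and unsigned integer operands umax; when the reduced scalar is
// narrower than the destination type it is widened (fpext/sext) afterwards.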

void test_builtin_reduce_max(float4 vf1, si8 vi1, u4 vu1) {
  // CHECK-LABEL: define void @test_builtin_reduce_max(
  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[VF1]])
  float r1 = __builtin_reduce_max(vf1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_max(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_max(vu1);

  // CHECK: [[VF1_AS1:%.+]] = load <4 x float>, ptr addrspace(1) @vf1_as_one, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[VF1_AS1]])
  // CHECK-NEXT: fpext float [[RDX1]] to double
  const double r4 = __builtin_reduce_max(vf1_as_one);

  // CHECK: [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX2]] to i64
  const si8 cvi1 = {1, 2, 3, 4, 5, 6, 7, 8};
  unsigned long long r5 = __builtin_reduce_max(cvi1);
}
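
// __builtin_reduce_min mirrors the max cases, selecting the fmin/smin/umin
// intrinsics for float, signed, and unsigned element types respectively.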

void test_builtin_reduce_min(float4 vf1, si8 vi1, u4 vu1) {
  // CHECK-LABEL: define void @test_builtin_reduce_min(
  // CHECK: [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[VF1]])
  float r1 = __builtin_reduce_min(vf1);

  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_min(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_min(vu1);

  // CHECK: [[VF1_AS1:%.+]] = load <4 x float>, ptr addrspace(1) @vf1_as_one, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[VF1_AS1]])
  // CHECK-NEXT: fpext float [[RDX1]] to double
  const double r4 = __builtin_reduce_min(vf1_as_one);

  // CHECK: [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX2]] to i64
  const si8 cvi1 = {1, 2, 3, 4, 5, 6, 7, 8};
  unsigned long long r5 = __builtin_reduce_min(cvi1);
}
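
// __builtin_reduce_add lowers to llvm.vector.reduce.add for both signed and
// unsigned element types; the reduced scalar is sign- or zero-extended when
// assigned to a wider destination type.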

void test_builtin_reduce_add(si8 vi1, u4 vu1) {
  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_add(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_add(vu1);

  // CHECK: [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX1]] to i32
  const si8 cvi1 = {1, 2, 3, 4, 5, 6, 7, 8};
  int r4 = __builtin_reduce_add(cvi1);

  // CHECK: [[CVU1:%.+]] = load <4 x i32>, ptr %cvu1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[CVU1]])
  // CHECK-NEXT: zext i32 [[RDX2]] to i64
  const u4 cvu1 = {1, 2, 3, 4};
  unsigned long long r5 = __builtin_reduce_add(cvu1);
}
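
// __builtin_reduce_mul follows the same pattern, lowering to
// llvm.vector.reduce.mul.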

void test_builtin_reduce_mul(si8 vi1, u4 vu1) {
  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_mul(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_mul(vu1);

  // CHECK: [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX1]] to i32
  const si8 cvi1 = {1, 2, 3, 4, 5, 6, 7, 8};
  int r4 = __builtin_reduce_mul(cvi1);

  // CHECK: [[CVU1:%.+]] = load <4 x i32>, ptr %cvu1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[CVU1]])
  // CHECK-NEXT: zext i32 [[RDX2]] to i64
  const u4 cvu1 = {1, 2, 3, 4};
  unsigned long long r5 = __builtin_reduce_mul(cvu1);
}
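
// The bitwise reductions below (xor, or, and) are exercised only on integer
// vectors and lower to llvm.vector.reduce.xor/or/and.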

void test_builtin_reduce_xor(si8 vi1, u4 vu1) {
  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_xor(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_xor(vu1);
}

void test_builtin_reduce_or(si8 vi1, u4 vu1) {
  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_or(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_or(vu1);
}

void test_builtin_reduce_and(si8 vi1, u4 vu1) {
  // CHECK: [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_and(vi1);

  // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_and(vu1);
}