// clang/test/CodeGen/builtins-reduction-math.c
// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s

typedef float float4 __attribute__((ext_vector_type(4)));
typedef short int si8 __attribute__((ext_vector_type(8)));
typedef unsigned int u4 __attribute__((ext_vector_type(4)));

__attribute__((address_space(1))) float4 vf1_as_one;
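// Note: with -disable-llvm-passes, each __builtin_reduce_* call below should
// show up in the IR as a load of the vector argument followed by a call to the
// matching llvm.vector.reduce.* intrinsic, which is what the CHECK lines match.
// The ext_vector_type typedefs correspond to <4 x float>, <8 x i16> and
// <4 x i32>; the vf1_as_one global additionally exercises a load from a
// non-default address space (addrspace(1)).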
void test_builtin_reduce_max(float4 vf1, si8 vi1, u4 vu1) {
  // CHECK-LABEL: define void @test_builtin_reduce_max(
  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[VF1]])
  float r1 = __builtin_reduce_max(vf1);

  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_max(vi1);

  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_max(vu1);

  // CHECK:      [[VF1_AS1:%.+]] = load <4 x float>, ptr addrspace(1) @vf1_as_one, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[VF1_AS1]])
  // CHECK-NEXT: fpext float [[RDX1]] to double
  const double r4 = __builtin_reduce_max(vf1_as_one);

  // CHECK:      [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX2]] to i64
  const si8 cvi1 = vi1;
  unsigned long long r5 = __builtin_reduce_max(cvi1);
}
void test_builtin_reduce_min(float4 vf1, si8 vi1, u4 vu1) {
  // CHECK-LABEL: define void @test_builtin_reduce_min(
  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
  // CHECK-NEXT: call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[VF1]])
  float r1 = __builtin_reduce_min(vf1);

  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_min(vi1);

  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_min(vu1);

  // CHECK:      [[VF1_AS1:%.+]] = load <4 x float>, ptr addrspace(1) @vf1_as_one, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[VF1_AS1]])
  // CHECK-NEXT: fpext float [[RDX1]] to double
  const double r4 = __builtin_reduce_min(vf1_as_one);

  // CHECK:      [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX2]] to i64
  const si8 cvi1 = vi1;
  unsigned long long r5 = __builtin_reduce_min(cvi1);
}
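// As the checks above show, __builtin_reduce_max/min pick the reduction
// intrinsic from the element type: fmax/fmin for floating-point vectors,
// smax/smin for signed integer vectors, umax/umin for unsigned ones. The
// reduction always yields the element type; when the result is stored into a
// wider variable (double, unsigned long long above), the scalar result is
// widened afterwards with fpext/sext rather than by widening the vector.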
void test_builtin_reduce_add(si8 vi1, u4 vu1) {
  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_add(vi1);

  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_add(vu1);

  // CHECK:      [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX1]] to i32
  const si8 cvi1 = vi1;
  int r4 = __builtin_reduce_add(cvi1);

  // CHECK:      [[CVU1:%.+]] = load <4 x i32>, ptr %cvu1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[CVU1]])
  // CHECK-NEXT: zext i32 [[RDX2]] to i64
  const u4 cvu1 = vu1;
  unsigned long long r5 = __builtin_reduce_add(cvu1);
}
void test_builtin_reduce_mul(si8 vi1, u4 vu1) {
  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_mul(vi1);

  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_mul(vu1);

  // CHECK:      [[CVI1:%.+]] = load <8 x i16>, ptr %cvi1, align 16
  // CHECK-NEXT: [[RDX1:%.+]] = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[CVI1]])
  // CHECK-NEXT: sext i16 [[RDX1]] to i32
  const si8 cvi1 = vi1;
  int r4 = __builtin_reduce_mul(cvi1);

  // CHECK:      [[CVU1:%.+]] = load <4 x i32>, ptr %cvu1, align 16
  // CHECK-NEXT: [[RDX2:%.+]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[CVU1]])
  // CHECK-NEXT: zext i32 [[RDX2]] to i64
  const u4 cvu1 = vu1;
  unsigned long long r5 = __builtin_reduce_mul(cvu1);
}
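// Likewise, __builtin_reduce_add and __builtin_reduce_mul map directly to
// llvm.vector.reduce.add/mul for both signed and unsigned element types; only
// the widening of the scalar result differs (sext for the signed si8 source,
// zext for the unsigned u4 source).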
void test_builtin_reduce_xor(si8 vi1, u4 vu1) {
  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_xor(vi1);

  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_xor(vu1);
}
void test_builtin_reduce_or(si8 vi1, u4 vu1) {
  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_or(vi1);

  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_or(vu1);
}
void test_builtin_reduce_and(si8 vi1, u4 vu1) {
  // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
  // CHECK-NEXT: call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> [[VI1]])
  short r2 = __builtin_reduce_and(vi1);

  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
  // CHECK-NEXT: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[VU1]])
  unsigned r3 = __builtin_reduce_and(vu1);
}
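// Illustrative usage sketch only: the helpers below are not part of the
// checked test (they are unused static inline functions, so no IR is emitted
// for them, and their names are made up for this example). They show the call
// pattern the checks above exercise: each builtin reduces a whole vector to a
// single scalar of the element type.
static inline unsigned horizontal_sum_u4(u4 v) {
  return __builtin_reduce_add(v);   // lowers to llvm.vector.reduce.add.v4i32
}

static inline short horizontal_max_si8(si8 v) {
  return __builtin_reduce_max(v);   // lowers to llvm.vector.reduce.smax.v8i16
}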