Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / clang / test / CodeGen / builtins-x86-reduce.c
blob9e5b479df65849ba4335365117e5287183b0ffb1
1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2 // RUN: %clang_cc1 %s -triple x86_64-unknown-unknown -target-feature +avx512f -target-feature +avx512vl -target-feature +avx512fp16 -emit-llvm -o - | FileCheck %s
// double8: 8-element vector of double (clang ext_vector_type); this is the
// <8 x double> type that appears in the expected IR for fadd1 and fadd2.
4 typedef double double8 __attribute__((ext_vector_type(8)));
6 // CHECK-LABEL: @fadd1(
7 // CHECK-NEXT: entry:
8 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x double>, align 64
9 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca double, align 8
10 // CHECK-NEXT: store <8 x double> [[A:%.*]], ptr [[A_ADDR]], align 64
11 // CHECK-NEXT: store double [[B:%.*]], ptr [[B_ADDR]], align 8
12 // CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[A_ADDR]], align 64
13 // CHECK-NEXT: [[TMP1:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP0]])
14 // CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[B_ADDR]], align 8
15 // CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], [[TMP2]]
16 // CHECK-NEXT: ret double [[ADD]]
// fadd1: reduces the 8-element double vector `a` with
// __builtin_ia32_reduce_fadd_pd512 (start value 0.0) and adds scalar `b`.
// Per the assertions above, the reduction call is expected to carry the
// `reassoc` flag while the trailing scalar fadd carries no fast-math flag.
// Keep the body as a single return expression: the autogenerated assertions
// match the exact IR emitted for this form.
18 double fadd1(double8 a, double b) {
19 return __builtin_ia32_reduce_fadd_pd512(0.0, a) + b;
// Turn on floating-point reassociation for the code that follows. The
// expected IR for fadd2 below shows the effect: its scalar fadd gains the
// `reassoc` flag, unlike the otherwise identical fadd1.
22 #pragma clang fp reassociate(on)
23 // CHECK-LABEL: @fadd2(
24 // CHECK-NEXT: entry:
25 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x double>, align 64
26 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca double, align 8
27 // CHECK-NEXT: store <8 x double> [[A:%.*]], ptr [[A_ADDR]], align 64
28 // CHECK-NEXT: store double [[B:%.*]], ptr [[B_ADDR]], align 8
29 // CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[A_ADDR]], align 64
30 // CHECK-NEXT: [[TMP1:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP0]])
31 // CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[B_ADDR]], align 8
32 // CHECK-NEXT: [[ADD:%.*]] = fadd reassoc double [[TMP1]], [[TMP2]]
33 // CHECK-NEXT: ret double [[ADD]]
// fadd2: same expression as fadd1, but compiled after
// `#pragma clang fp reassociate(on)`. Per the assertions above, both the
// reduction call and the scalar fadd are expected to carry `reassoc`.
// Keep the body as a single return expression: the autogenerated assertions
// match the exact IR emitted for this form.
35 double fadd2(double8 a, double b) {
36 return __builtin_ia32_reduce_fadd_pd512(0.0, a) + b;
// float16: 16-element vector of float (clang ext_vector_type), i.e. the
// <16 x float> type in the expected IR for fmul1. Despite the name, this is
// not a 16-bit floating-point type.
39 typedef float float16 __attribute__((ext_vector_type(16)));
// Turn floating-point reassociation back off for the code that follows. The
// expected IR for fmul1 below has a scalar fadd without `reassoc`, while the
// reduction call itself still carries `reassoc`.
41 #pragma clang fp reassociate(off)
42 // CHECK-LABEL: @fmul1(
43 // CHECK-NEXT: entry:
44 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <16 x float>, align 64
45 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4
46 // CHECK-NEXT: store <16 x float> [[A:%.*]], ptr [[A_ADDR]], align 64
47 // CHECK-NEXT: store float [[B:%.*]], ptr [[B_ADDR]], align 4
48 // CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 64
49 // CHECK-NEXT: [[TMP1:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP0]])
50 // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[B_ADDR]], align 4
51 // CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]]
52 // CHECK-NEXT: ret float [[ADD]]
// fmul1: multiplies together the 16 float elements of `a` with
// __builtin_ia32_reduce_fmul_ps512 (start value 1.0f) and adds scalar `b`.
// Compiled after `#pragma clang fp reassociate(off)`: per the assertions
// above, the reduction call is still expected to be `reassoc`, while the
// scalar fadd carries no fast-math flag.
// Keep the body as a single return expression: the autogenerated assertions
// match the exact IR emitted for this form.
54 float fmul1(float16 a, float b) {
55 return __builtin_ia32_reduce_fmul_ps512(1.0f, a) + b;
// half8: 8-element vector of _Float16 (clang ext_vector_type); this is the
// <8 x half> type that appears in the expected IR for fmax1.
58 typedef _Float16 half8 __attribute__((ext_vector_type(8)));
60 // CHECK-LABEL: @fmax1(
61 // CHECK-NEXT: entry:
62 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16
63 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca half, align 2
64 // CHECK-NEXT: store <8 x half> [[A:%.*]], ptr [[A_ADDR]], align 16
65 // CHECK-NEXT: store half [[B:%.*]], ptr [[B_ADDR]], align 2
66 // CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
67 // CHECK-NEXT: [[TMP1:%.*]] = call nnan half @llvm.vector.reduce.fmax.v8f16(<8 x half> [[TMP0]])
68 // CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[B_ADDR]], align 2
69 // CHECK-NEXT: [[ADD:%.*]] = fadd half [[TMP1]], [[TMP2]]
70 // CHECK-NEXT: ret half [[ADD]]
// fmax1: takes the maximum element of the 8-element half vector `a` with
// __builtin_ia32_reduce_fmax_ph128 and adds scalar `b`.
// Per the assertions above, the reduction is expected to lower to
// llvm.vector.reduce.fmax.v8f16 tagged `nnan`, and the scalar fadd carries
// no fast-math flag.
// Keep the body as a single return expression: the autogenerated assertions
// match the exact IR emitted for this form.
72 _Float16 fmax1(half8 a, _Float16 b) {
73 return __builtin_ia32_reduce_fmax_ph128(a) + b;
// half16: 16-element vector of _Float16 (clang ext_vector_type); this is the
// <16 x half> type that appears in the expected IR for fmin1.
76 typedef _Float16 half16 __attribute__((ext_vector_type(16)));
78 // CHECK-LABEL: @fmin1(
79 // CHECK-NEXT: entry:
80 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <16 x half>, align 32
81 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca half, align 2
82 // CHECK-NEXT: store <16 x half> [[A:%.*]], ptr [[A_ADDR]], align 32
83 // CHECK-NEXT: store half [[B:%.*]], ptr [[B_ADDR]], align 2
84 // CHECK-NEXT: [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
85 // CHECK-NEXT: [[TMP1:%.*]] = call nnan half @llvm.vector.reduce.fmin.v16f16(<16 x half> [[TMP0]])
86 // CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[B_ADDR]], align 2
87 // CHECK-NEXT: [[ADD:%.*]] = fadd half [[TMP1]], [[TMP2]]
88 // CHECK-NEXT: ret half [[ADD]]
// fmin1: takes the minimum element of the 16-element half vector `a` with
// __builtin_ia32_reduce_fmin_ph256 and adds scalar `b`.
// Per the assertions above, the reduction is expected to lower to
// llvm.vector.reduce.fmin.v16f16 tagged `nnan`, and the scalar fadd carries
// no fast-math flag.
// Keep the body as a single return expression: the autogenerated assertions
// match the exact IR emitted for this form.
90 _Float16 fmin1(half16 a, _Float16 b) {
91 return __builtin_ia32_reduce_fmin_ph256(a) + b;