1 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
2 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
3 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
4 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
5 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
6 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
7 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
8 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
9 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512
10 ; RUN: opt < %s -passes="print<cost-model>" -mtriple=x86_64-apple-darwin 2>&1 -disable-output -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512
12 define void @reduce_f64(double %arg) {
13 ; SSE2-LABEL: 'reduce_f64'
14 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
15 ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
16 ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
17 ; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
18 ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
19 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
21 ; SSSE3-LABEL: 'reduce_f64'
22 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
23 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
24 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
25 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
26 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
27 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
29 ; SSE41-LABEL: 'reduce_f64'
30 ; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
31 ; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
32 ; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
33 ; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
34 ; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
35 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
37 ; SSE42-LABEL: 'reduce_f64'
38 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
39 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
40 ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
41 ; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
42 ; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
43 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
45 ; AVX1-LABEL: 'reduce_f64'
46 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
47 ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
48 ; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
49 ; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
50 ; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
51 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
53 ; AVX2-LABEL: 'reduce_f64'
54 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
55 ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
56 ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
57 ; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
58 ; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
59 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
61 ; AVX512-LABEL: 'reduce_f64'
62 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
63 ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
64 ; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
65 ; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
66 ; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
67 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
69 %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
70 %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
71 %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
72 %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
73 %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
77 define void @reduce_f32(float %arg) {
78 ; SSE2-LABEL: 'reduce_f32'
79 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
80 ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
81 ; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
82 ; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
83 ; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
84 ; SSE2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
85 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
87 ; SSSE3-LABEL: 'reduce_f32'
88 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
89 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
90 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
91 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
92 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
93 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
94 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
96 ; SSE41-LABEL: 'reduce_f32'
97 ; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
98 ; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
99 ; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
100 ; SSE41-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
101 ; SSE41-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
102 ; SSE41-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
103 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
105 ; SSE42-LABEL: 'reduce_f32'
106 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
107 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
108 ; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
109 ; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
110 ; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
111 ; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
112 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
114 ; AVX1-LABEL: 'reduce_f32'
115 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
116 ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
117 ; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
118 ; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
119 ; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
120 ; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
121 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
123 ; AVX2-LABEL: 'reduce_f32'
124 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
125 ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
126 ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
127 ; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
128 ; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
129 ; AVX2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
130 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
132 ; AVX512-LABEL: 'reduce_f32'
133 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
134 ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
135 ; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
136 ; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
137 ; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
138 ; AVX512-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
139 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
141 %V1 = call float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
142 %V2 = call float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
143 %V4 = call float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
144 %V8 = call float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
145 %V16 = call float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
146 %V32 = call float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
152 define void @reduce_f64_fast(double %arg) {
153 ; SSE2-LABEL: 'reduce_f64_fast'
154 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
155 ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
156 ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
157 ; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
158 ; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
159 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
161 ; SSSE3-LABEL: 'reduce_f64_fast'
162 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
163 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
164 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
165 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
166 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
167 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
169 ; SSE41-LABEL: 'reduce_f64_fast'
170 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
171 ; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
172 ; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
173 ; SSE41-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
174 ; SSE41-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
175 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
177 ; SSE42-LABEL: 'reduce_f64_fast'
178 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
179 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
180 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
181 ; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
182 ; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
183 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
185 ; AVX1-LABEL: 'reduce_f64_fast'
186 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
187 ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
188 ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
189 ; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
190 ; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
191 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
193 ; AVX2-LABEL: 'reduce_f64_fast'
194 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
195 ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
196 ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
197 ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
198 ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
199 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
201 ; AVX512-LABEL: 'reduce_f64_fast'
202 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
203 ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
204 ; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
205 ; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
206 ; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
207 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
209 %V1 = call fast double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
210 %V2 = call fast double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
211 %V4 = call fast double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
212 %V8 = call fast double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
213 %V16 = call fast double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
217 define void @reduce_f32_fast(float %arg) {
218 ; SSE2-LABEL: 'reduce_f32_fast'
219 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
220 ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
221 ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
222 ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
223 ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
224 ; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
225 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
227 ; SSSE3-LABEL: 'reduce_f32_fast'
228 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
229 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
230 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
231 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
232 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
233 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
234 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
236 ; SSE41-LABEL: 'reduce_f32_fast'
237 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
238 ; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
239 ; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
240 ; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
241 ; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
242 ; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
243 ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
245 ; SSE42-LABEL: 'reduce_f32_fast'
246 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
247 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
248 ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
249 ; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
250 ; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
251 ; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
252 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
254 ; AVX1-LABEL: 'reduce_f32_fast'
255 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
256 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
257 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
258 ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
259 ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
260 ; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
261 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
263 ; AVX2-LABEL: 'reduce_f32_fast'
264 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
265 ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
266 ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
267 ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
268 ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
269 ; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
270 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
272 ; AVX512-LABEL: 'reduce_f32_fast'
273 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
274 ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
275 ; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
276 ; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
277 ; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
278 ; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
279 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
281 %V1 = call fast float @llvm.vector.reduce.fmul.v1f32(float %arg, <1 x float> undef)
282 %V2 = call fast float @llvm.vector.reduce.fmul.v2f32(float %arg, <2 x float> undef)
283 %V4 = call fast float @llvm.vector.reduce.fmul.v4f32(float %arg, <4 x float> undef)
284 %V8 = call fast float @llvm.vector.reduce.fmul.v8f32(float %arg, <8 x float> undef)
285 %V16 = call fast float @llvm.vector.reduce.fmul.v16f32(float %arg, <16 x float> undef)
286 %V32 = call fast float @llvm.vector.reduce.fmul.v32f32(float %arg, <32 x float> undef)
290 declare double @llvm.vector.reduce.fmul.v1f64(double, <1 x double>)
291 declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
292 declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
293 declare double @llvm.vector.reduce.fmul.v8f64(double, <8 x double>)
294 declare double @llvm.vector.reduce.fmul.v16f64(double, <16 x double>)
296 declare float @llvm.vector.reduce.fmul.v1f32(float, <1 x float>)
297 declare float @llvm.vector.reduce.fmul.v2f32(float, <2 x float>)
298 declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
299 declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
300 declare float @llvm.vector.reduce.fmul.v16f32(float, <16 x float>)
301 declare float @llvm.vector.reduce.fmul.v32f32(float, <32 x float>)