1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3-SLOW
3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
6 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
7 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST
9 ; Vectorized Pairwise Sum Reductions
11 ; inline STYPE sum(VTYPE x) {
12 ;   return (x[0] + x[1]) + (x[2] + x[3]);
13 ; }
15 ; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
16 ;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
17 ; }
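; Each sum() above appears in the IR below as shufflevector + fadd/add pairs
; per input vector. With the fast-hops attribute the *-FAST check prefixes
; expect these to fold into chains of haddps/phaddd (vhaddps/vphaddd)
; instructions, while the *-SLOW prefixes allow a mix of horizontal adds,
; shuffles and plain vector adds.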
19 define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
20 ; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32:
21 ; SSSE3-SLOW: # %bb.0:
22 ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
23 ; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3
24 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm0
25 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
26 ; SSSE3-SLOW-NEXT: retq
28 ; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
29 ; SSSE3-FAST: # %bb.0:
30 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
31 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
32 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0
33 ; SSSE3-FAST-NEXT: retq
35 ; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32:
36 ; AVX-SLOW: # %bb.0:
37 ; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
38 ; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
39 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
40 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
41 ; AVX-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
42 ; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
43 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
44 ; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
45 ; AVX-SLOW-NEXT: retq
47 ; AVX-FAST-LABEL: pair_sum_v4f32_v4f32:
48 ; AVX-FAST: # %bb.0:
49 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
50 ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1
51 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
52 ; AVX-FAST-NEXT: retq
53 %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
54 %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
55 %7 = fadd <2 x float> %5, %6
56 %8 = shufflevector <2 x float> %7, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
57 %9 = fadd <2 x float> %7, %8
58 %10 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
59 %11 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
60 %12 = fadd <2 x float> %10, %11
61 %13 = shufflevector <2 x float> %12, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
62 %14 = fadd <2 x float> %12, %13
63 %15 = shufflevector <2 x float> %9, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
64 %16 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
65 %17 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
66 %18 = fadd <2 x float> %16, %17
67 %19 = shufflevector <2 x float> %18, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
68 %20 = fadd <2 x float> %18, %19
69 %21 = shufflevector <2 x float> %20, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
70 %22 = shufflevector <4 x float> %15, <4 x float> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
71 %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
72 %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
73 %25 = fadd <2 x float> %23, %24
74 %26 = shufflevector <2 x float> %25, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
75 %27 = fadd <2 x float> %25, %26
76 %28 = shufflevector <2 x float> %27, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
77 %29 = shufflevector <4 x float> %22, <4 x float> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
78 ret <4 x float> %29
79 }
81 define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
82 ; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32:
83 ; SSSE3-SLOW: # %bb.0:
84 ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
85 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
86 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
87 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
88 ; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm3
89 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
90 ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
91 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
92 ; SSSE3-SLOW-NEXT: retq
94 ; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
95 ; SSSE3-FAST: # %bb.0:
96 ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
97 ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
98 ; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0
99 ; SSSE3-FAST-NEXT: retq
101 ; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32:
102 ; AVX1-SLOW: # %bb.0:
103 ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
104 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
105 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
106 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
107 ; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
108 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
109 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
110 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
111 ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
112 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
113 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
114 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
115 ; AVX1-SLOW-NEXT: retq
117 ; AVX-FAST-LABEL: pair_sum_v4i32_v4i32:
118 ; AVX-FAST: # %bb.0:
119 ; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
120 ; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1
121 ; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
122 ; AVX-FAST-NEXT: retq
124 ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
125 ; AVX2-SLOW: # %bb.0:
126 ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
127 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
128 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
129 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
130 ; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
131 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
132 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
133 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
134 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
135 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2
136 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
137 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
138 ; AVX2-SLOW-NEXT: retq
139 %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
140 %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
141 %7 = add <2 x i32> %5, %6
142 %8 = shufflevector <2 x i32> %7, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
143 %9 = add <2 x i32> %7, %8
144 %10 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
145 %11 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
146 %12 = add <2 x i32> %10, %11
147 %13 = shufflevector <2 x i32> %12, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
148 %14 = add <2 x i32> %12, %13
149 %15 = shufflevector <2 x i32> %9, <2 x i32> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
150 %16 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
151 %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
152 %18 = add <2 x i32> %16, %17
153 %19 = shufflevector <2 x i32> %18, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
154 %20 = add <2 x i32> %18, %19
155 %21 = shufflevector <2 x i32> %20, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
156 %22 = shufflevector <4 x i32> %15, <4 x i32> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
157 %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
158 %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
159 %25 = add <2 x i32> %23, %24
160 %26 = shufflevector <2 x i32> %25, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
161 %27 = add <2 x i32> %25, %26
162 %28 = shufflevector <2 x i32> %27, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
163 %29 = shufflevector <4 x i32> %22, <4 x i32> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
164 ret <4 x i32> %29
165 }
167 define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7) {
168 ; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32:
169 ; SSSE3-SLOW: # %bb.0:
170 ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
171 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
172 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
173 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
174 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
175 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2
176 ; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5
177 ; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2
178 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
179 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
180 ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
181 ; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
182 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
183 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
184 ; SSSE3-SLOW-NEXT: retq
186 ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
187 ; SSSE3-FAST: # %bb.0:
188 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
189 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
190 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
191 ; SSSE3-FAST-NEXT: haddps %xmm5, %xmm4
192 ; SSSE3-FAST-NEXT: haddps %xmm4, %xmm2
193 ; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
194 ; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6
195 ; SSSE3-FAST-NEXT: haddps %xmm6, %xmm4
196 ; SSSE3-FAST-NEXT: movaps %xmm4, %xmm1
197 ; SSSE3-FAST-NEXT: retq
199 ; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
200 ; AVX1-SLOW: # %bb.0:
201 ; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
202 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
203 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
204 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
205 ; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
206 ; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
207 ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
208 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
209 ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
210 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
211 ; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
212 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
213 ; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
214 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
215 ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
216 ; AVX1-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
217 ; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
218 ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
219 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
220 ; AVX1-SLOW-NEXT: retq
222 ; AVX1-FAST-LABEL: pair_sum_v8f32_v4f32:
223 ; AVX1-FAST: # %bb.0:
224 ; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
225 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
226 ; AVX1-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
227 ; AVX1-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
228 ; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
229 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
230 ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
231 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
232 ; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
233 ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
234 ; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
235 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
236 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
237 ; AVX1-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
238 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
239 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
240 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
241 ; AVX1-FAST-NEXT: retq
243 ; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
244 ; AVX2-SLOW: # %bb.0:
245 ; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
246 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
247 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
248 ; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
249 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
250 ; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8
251 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
252 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
253 ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
254 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3
255 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
256 ; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
257 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
258 ; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
259 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
260 ; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
261 ; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
262 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
263 ; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
264 ; AVX2-SLOW-NEXT: retq
266 ; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
267 ; AVX2-FAST: # %bb.0:
268 ; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
269 ; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
270 ; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
271 ; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8
272 ; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
273 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
274 ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
275 ; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3
276 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
277 ; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
278 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
279 ; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
280 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
281 ; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
282 ; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
283 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
284 ; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
285 ; AVX2-FAST-NEXT: retq
286 %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
287 %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
288 %11 = fadd <2 x float> %9, %10
289 %12 = shufflevector <2 x float> %11, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
290 %13 = fadd <2 x float> %11, %12
291 %14 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
292 %15 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
293 %16 = fadd <2 x float> %14, %15
294 %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
295 %18 = fadd <2 x float> %16, %17
296 %19 = shufflevector <2 x float> %13, <2 x float> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
297 %20 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
298 %21 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
299 %22 = fadd <2 x float> %20, %21
300 %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
301 %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
302 %25 = fadd <2 x float> %23, %24
303 %26 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 0, i32 2>
304 %27 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 1, i32 3>
305 %28 = fadd <2 x float> %26, %27
306 %29 = shufflevector <2 x float> %28, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
307 %30 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 0, i32 2>
308 %31 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 1, i32 3>
309 %32 = fadd <2 x float> %30, %31
310 %33 = shufflevector <2 x float> %32, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
311 %34 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
312 %35 = shufflevector <4 x float> %34, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
313 %36 = shufflevector <4 x float> %35, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
314 %37 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
315 %38 = shufflevector <4 x float> %37, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
316 %39 = shufflevector <4 x float> %38, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
317 %40 = fadd <4 x float> %36, %39
318 %41 = shufflevector <4 x float> %40, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
319 %42 = shufflevector <8 x float> %19, <8 x float> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
320 %43 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 0, i32 2>
321 %44 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 1, i32 3>
322 %45 = fadd <2 x float> %43, %44
323 %46 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 0, i32 2>
324 %47 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 1, i32 3>
325 %48 = fadd <2 x float> %46, %47
326 %49 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 0, i32 2>
327 %50 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 1, i32 3>
328 %51 = fadd <2 x float> %49, %50
329 %52 = shufflevector <2 x float> %51, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
330 %53 = shufflevector <8 x float> %42, <8 x float> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
331 ret <8 x float> %53
332 }
334 define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, <4 x i32> %6, <4 x i32> %7) {
335 ; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32:
336 ; SSSE3-SLOW: # %bb.0:
337 ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
338 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
339 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
340 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
341 ; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2
342 ; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5
343 ; SSSE3-SLOW-NEXT: phaddd %xmm5, %xmm2
344 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,3,2]
345 ; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
346 ; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
347 ; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
348 ; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
349 ; SSSE3-SLOW-NEXT: movdqa %xmm6, %xmm1
350 ; SSSE3-SLOW-NEXT: retq
352 ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
353 ; SSSE3-FAST: # %bb.0:
354 ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
355 ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
356 ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
357 ; SSSE3-FAST-NEXT: phaddd %xmm5, %xmm4
358 ; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm2
359 ; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
360 ; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm6
361 ; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm7
362 ; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm6
363 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2]
364 ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
365 ; SSSE3-FAST-NEXT: retq
367 ; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32:
368 ; AVX1-SLOW: # %bb.0:
369 ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
370 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
371 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
372 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
373 ; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
374 ; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
375 ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
376 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
377 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
378 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
379 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
380 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
381 ; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
382 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
383 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
384 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
385 ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
386 ; AVX1-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm2
387 ; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2
388 ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
389 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
390 ; AVX1-SLOW-NEXT: retq
392 ; AVX1-FAST-LABEL: pair_sum_v8i32_v4i32:
393 ; AVX1-FAST: # %bb.0:
394 ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
395 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
396 ; AVX1-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
397 ; AVX1-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
398 ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
399 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
400 ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
401 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
402 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
403 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
404 ; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
405 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
406 ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
407 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
408 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
409 ; AVX1-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2
410 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm2, %xmm2
411 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
412 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
413 ; AVX1-FAST-NEXT: retq
415 ; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32:
416 ; AVX2-SLOW: # %bb.0:
417 ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
418 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
419 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
420 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
421 ; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
422 ; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
423 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
424 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
425 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
426 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
427 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
428 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
429 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
430 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
431 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
432 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
433 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
434 ; AVX2-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm1
435 ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
436 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
437 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
438 ; AVX2-SLOW-NEXT: retq
440 ; AVX2-FAST-LABEL: pair_sum_v8i32_v4i32:
441 ; AVX2-FAST: # %bb.0:
442 ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
443 ; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
444 ; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
445 ; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
446 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
447 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
448 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
449 ; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5
450 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
451 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
452 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
453 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
454 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
455 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
456 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
457 ; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm1
458 ; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm1
459 ; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1
460 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
461 ; AVX2-FAST-NEXT: retq
462 %9 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
463 %10 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
464 %11 = add <2 x i32> %9, %10
465 %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
466 %13 = add <2 x i32> %11, %12
467 %14 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
468 %15 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
469 %16 = add <2 x i32> %14, %15
470 %17 = shufflevector <2 x i32> %16, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
471 %18 = add <2 x i32> %16, %17
472 %19 = shufflevector <2 x i32> %13, <2 x i32> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
473 %20 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
474 %21 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
475 %22 = add <2 x i32> %20, %21
476 %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
477 %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
478 %25 = add <2 x i32> %23, %24
479 %26 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
480 %27 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
481 %28 = add <2 x i32> %26, %27
482 %29 = shufflevector <2 x i32> %28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
483 %30 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
484 %31 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
485 %32 = add <2 x i32> %30, %31
486 %33 = shufflevector <2 x i32> %32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
487 %34 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
488 %35 = shufflevector <4 x i32> %34, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
489 %36 = shufflevector <4 x i32> %35, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
490 %37 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
491 %38 = shufflevector <4 x i32> %37, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
492 %39 = shufflevector <4 x i32> %38, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
493 %40 = add <4 x i32> %36, %39
494 %41 = shufflevector <4 x i32> %40, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
495 %42 = shufflevector <8 x i32> %19, <8 x i32> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
496 %43 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
497 %44 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
498 %45 = add <2 x i32> %43, %44
499 %46 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
500 %47 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
501 %48 = add <2 x i32> %46, %47
502 %49 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 0, i32 2>
503 %50 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 1, i32 3>
504 %51 = add <2 x i32> %49, %50
505 %52 = shufflevector <2 x i32> %51, <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
506 %53 = shufflevector <8 x i32> %42, <8 x i32> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
507 ret <8 x i32> %53
508 }
510 ; Vectorized Sequential Sum Reductions
512 ; inline STYPE sum(VTYPE x) {
513 ;   return ((x[0] + x[1]) + x[2]) + x[3];
514 ; }
516 ; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
517 ;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
518 ; }
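; Unlike the pairwise variant, each lane is accumulated strictly in source
; order here, so even with fast-hops only the leading x[0]+x[1] add of each
; input becomes a horizontal add; the later adds remain plain vector adds
; plus shuffles in the *-FAST checks below.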
520 define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
521 ; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32:
522 ; SSSE3-SLOW: # %bb.0:
523 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
524 ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5
525 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
526 ; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
527 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
528 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
529 ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0
530 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1]
531 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
532 ; SSSE3-SLOW-NEXT: addps %xmm5, %xmm4
533 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
534 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
535 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
536 ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0
537 ; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
538 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
539 ; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
540 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
541 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
542 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
543 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
544 ; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0
545 ; SSSE3-SLOW-NEXT: retq
547 ; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
548 ; SSSE3-FAST: # %bb.0:
549 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
550 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
551 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5
552 ; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
553 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
554 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
555 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
556 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
557 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1]
558 ; SSSE3-FAST-NEXT: addps %xmm5, %xmm0
559 ; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
560 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
561 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
562 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
563 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
564 ; SSSE3-FAST-NEXT: addps %xmm1, %xmm2
565 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
566 ; SSSE3-FAST-NEXT: addps %xmm2, %xmm3
567 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
568 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
569 ; SSSE3-FAST-NEXT: retq
571 ; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
572 ; AVX-SLOW: # %bb.0:
573 ; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm4
574 ; AVX-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
575 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
576 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
577 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
578 ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
579 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
580 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
581 ; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
582 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
583 ; AVX-SLOW-NEXT: vaddps %xmm3, %xmm4, %xmm4
584 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
585 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
586 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
587 ; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
588 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
589 ; AVX-SLOW-NEXT: retq
591 ; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32:
592 ; AVX-FAST: # %bb.0:
593 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm4
594 ; AVX-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
595 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
596 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
597 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
598 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
599 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
600 ; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
601 ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
602 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
603 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
604 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
605 ; AVX-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
606 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
607 ; AVX-FAST-NEXT: retq
608 %5 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 0, i32 4>
609 %6 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 1, i32 5>
610 %7 = fadd <2 x float> %5, %6
611 %8 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 6>
612 %9 = fadd <2 x float> %8, %7
613 %10 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 7>
614 %11 = fadd <2 x float> %10, %9
615 %12 = shufflevector <2 x float> %11, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
616 %13 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
617 %14 = fadd <4 x float> %13, %2
618 %15 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
619 %16 = fadd <4 x float> %15, %14
620 %17 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
621 %18 = fadd <4 x float> %17, %16
622 %19 = shufflevector <4 x float> %12, <4 x float> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
623 %20 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
624 %21 = fadd <4 x float> %20, %3
625 %22 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
626 %23 = fadd <4 x float> %22, %21
627 %24 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
628 %25 = fadd <4 x float> %24, %23
629 %26 = shufflevector <4 x float> %19, <4 x float> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
630 ret <4 x float> %26
631 }
633 define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
634 ; SSSE3-SLOW-LABEL: sequential_sum_v4i32_v4i32:
635 ; SSSE3-SLOW: # %bb.0:
636 ; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm4
637 ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm4
638 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
639 ; SSSE3-SLOW-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
640 ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
641 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
642 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
643 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
644 ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5
645 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
646 ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
647 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
648 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6
649 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3]
650 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
651 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
652 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
653 ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
654 ; SSSE3-SLOW-NEXT: retq
656 ; SSSE3-FAST-LABEL: sequential_sum_v4i32_v4i32:
657 ; SSSE3-FAST: # %bb.0:
658 ; SSSE3-FAST-NEXT: movdqa %xmm0, %xmm4
659 ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm4
660 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
661 ; SSSE3-FAST-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
662 ; SSSE3-FAST-NEXT: paddd %xmm0, %xmm4
663 ; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1
664 ; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
665 ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
666 ; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm5
667 ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm5
668 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
669 ; SSSE3-FAST-NEXT: paddd %xmm5, %xmm6
670 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3]
671 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
672 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
673 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
674 ; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
675 ; SSSE3-FAST-NEXT: retq
677 ; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32:
678 ; AVX1-SLOW: # %bb.0:
679 ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
680 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
681 ; AVX1-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
682 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
683 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
684 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
685 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
686 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
687 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
688 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
689 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
690 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
691 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
692 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
693 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
694 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
695 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
696 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
697 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
698 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
699 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
700 ; AVX1-SLOW-NEXT: retq
702 ; AVX1-FAST-LABEL: sequential_sum_v4i32_v4i32:
703 ; AVX1-FAST: # %bb.0:
704 ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
705 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
706 ; AVX1-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
707 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
708 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
709 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
710 ; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
711 ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
712 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
713 ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
714 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
715 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
716 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
717 ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
718 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
719 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
720 ; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
721 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
722 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
723 ; AVX1-FAST-NEXT: retq
725 ; AVX2-SLOW-LABEL: sequential_sum_v4i32_v4i32:
726 ; AVX2-SLOW: # %bb.0:
727 ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
728 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
729 ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
730 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
731 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
732 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
733 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
734 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
735 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
736 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
737 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
738 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
739 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
740 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
741 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1
742 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2
743 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
744 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
745 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
746 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
747 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
748 ; AVX2-SLOW-NEXT: retq
750 ; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32:
751 ; AVX2-FAST: # %bb.0:
752 ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
753 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
754 ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
755 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
756 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
757 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
758 ; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
759 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
760 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
761 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
762 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
763 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
764 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
765 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
766 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
767 ; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
768 ; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
769 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
770 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
771 ; AVX2-FAST-NEXT: retq
772 %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
773 %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 1, i32 5>
774 %7 = add <2 x i32> %5, %6
775 %8 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 2, i32 6>
776 %9 = add <2 x i32> %8, %7
777 %10 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 3, i32 7>
778 %11 = add <2 x i32> %10, %9
779 %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
780 %13 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
781 %14 = add <4 x i32> %13, %2
782 %15 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
783 %16 = add <4 x i32> %15, %14
784 %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
785 %18 = add <4 x i32> %17, %16
786 %19 = shufflevector <4 x i32> %12, <4 x i32> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
787 %20 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
788 %21 = add <4 x i32> %20, %3
789 %22 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
790 %23 = add <4 x i32> %22, %21
791 %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
792 %25 = add <4 x i32> %24, %23
793 %26 = shufflevector <4 x i32> %19, <4 x i32> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
794 ret <4 x i32> %26
795 }
797 ; Vectorized Reductions
799 ; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
800 ;   return (VTYPE) { reduce( A0 ), reduce( A1 ), reduce( A2 ), reduce( A3 ) };
801 ; }
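; Here reduce() stands for the @llvm.vector.reduce.fadd / @llvm.vector.reduce.add
; intrinsics declared further down; the fadd reduction is exercised both in
; strict (in-order) form and with the reassoc fast-math flag.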
803 define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
804 ; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32:
805 ; SSSE3-SLOW: # %bb.0:
806 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
807 ; SSSE3-SLOW-NEXT: addss %xmm0, %xmm4
808 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
809 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
810 ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
811 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
812 ; SSSE3-SLOW-NEXT: addss %xmm5, %xmm0
813 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
814 ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
815 ; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
816 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
817 ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
818 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
819 ; SSSE3-SLOW-NEXT: addss %xmm5, %xmm1
820 ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
821 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
822 ; SSSE3-SLOW-NEXT: addss %xmm2, %xmm1
823 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm4
824 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
825 ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
826 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
827 ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm2
828 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
829 ; SSSE3-SLOW-NEXT: addss %xmm3, %xmm1
830 ; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm4
831 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
832 ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
833 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
834 ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm3
835 ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
836 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
837 ; SSSE3-SLOW-NEXT: retq
839 ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
840 ; SSSE3-FAST: # %bb.0:
841 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
842 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
843 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
844 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
845 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
846 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
847 ; SSSE3-FAST-NEXT: addss %xmm5, %xmm0
848 ; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
849 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
850 ; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5
851 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
852 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
853 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
854 ; SSSE3-FAST-NEXT: addss %xmm5, %xmm1
855 ; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
856 ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
857 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
858 ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
859 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
860 ; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
861 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
862 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm2
863 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
864 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
865 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
866 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
867 ; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
868 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
869 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm3
870 ; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
871 ; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
872 ; SSSE3-FAST-NEXT: retq
874 ; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32:
875 ; AVX-SLOW: # %bb.0:
876 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
877 ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm4
878 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0]
879 ; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
880 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
881 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm4, %xmm0
882 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
883 ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm4
884 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
885 ; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
886 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
887 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm4, %xmm1
888 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
889 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
890 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1
891 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
892 ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1
893 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
894 ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
895 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
896 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
897 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1
898 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
899 ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
900 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
901 ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
902 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
903 ; AVX-SLOW-NEXT: retq
905 ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
906 ; AVX-FAST: # %bb.0:
907 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4
908 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0]
909 ; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
910 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
911 ; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0
912 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4
913 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
914 ; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
915 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
916 ; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
917 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
918 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
919 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
920 ; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
921 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
922 ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
923 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
924 ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
925 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
926 ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
927 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
928 ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
929 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
930 ; AVX-FAST-NEXT: retq
931 %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
932 %6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
933 %7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
934 %8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
935 %9 = insertelement <4 x float> undef, float %5, i32 0
936 %10 = insertelement <4 x float> %9, float %6, i32 1
937 %11 = insertelement <4 x float> %10, float %7, i32 2
938 %12 = insertelement <4 x float> %11, float %8, i32 3
939 ret <4 x float> %12
940 }
941 declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
943 define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
944 ; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
945 ; SSSE3-SLOW: # %bb.0:
946 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
947 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
948 ; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
949 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
950 ; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
951 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
952 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5
953 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
954 ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
955 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
956 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
957 ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
958 ; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2
959 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
960 ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
961 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3
962 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
963 ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
964 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
965 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
966 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
967 ; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
968 ; SSSE3-SLOW-NEXT: retq
970 ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
971 ; SSSE3-FAST: # %bb.0:
972 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
973 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
974 ; SSSE3-FAST-NEXT: addps %xmm4, %xmm0
975 ; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
976 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
977 ; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
978 ; SSSE3-FAST-NEXT: haddps %xmm4, %xmm0
979 ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
980 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
981 ; SSSE3-FAST-NEXT: addps %xmm2, %xmm1
982 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
983 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
984 ; SSSE3-FAST-NEXT: addps %xmm3, %xmm2
985 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
986 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
987 ; SSSE3-FAST-NEXT: retq
989 ; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
990 ; AVX-SLOW: # %bb.0:
991 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
992 ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
993 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
994 ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
995 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
996 ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2
997 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0]
998 ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3
999 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1]
1000 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
1001 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
1002 ; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1003 ; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1004 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
1005 ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
1006 ; AVX-SLOW-NEXT: retq
1008 ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
1009 ; AVX-FAST: # %bb.0:
1010 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
1011 ; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0
1012 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
1013 ; AVX-FAST-NEXT: vaddps %xmm4, %xmm1, %xmm1
1014 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1015 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
1016 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
1017 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
1018 ; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2
1019 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1
1020 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1021 ; AVX-FAST-NEXT: retq
1022 %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
1023 %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
1024 %7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
1025 %8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
1026 %9 = insertelement <4 x float> undef, float %5, i32 0
1027 %10 = insertelement <4 x float> %9, float %6, i32 1
1028 %11 = insertelement <4 x float> %10, float %7, i32 2
1029 %12 = insertelement <4 x float> %11, float %8, i32 3
1030 ret <4 x float> %12
1031 }
1033 define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
1034 ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
1035 ; SSSE3-SLOW: # %bb.0:
1036 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
1037 ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
1038 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
1039 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
1040 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
1041 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
1042 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1043 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
1044 ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
1045 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
1046 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
1047 ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6
1048 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
1049 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1050 ; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
1051 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
1052 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
1053 ; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1054 ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
1055 ; SSSE3-SLOW-NEXT: retq
1057 ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
1058 ; SSSE3-FAST: # %bb.0:
1059 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
1060 ; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
1061 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
1062 ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4
1063 ; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm0
1064 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
1065 ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
1066 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
1067 ; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2
1068 ; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
1069 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1070 ; SSSE3-FAST-NEXT: retq
1072 ; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32:
1073 ; AVX1-SLOW: # %bb.0:
1074 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
1075 ; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1076 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
1077 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
1078 ; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1
1079 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
1080 ; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1081 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
1082 ; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
1083 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
1084 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
1085 ; AVX1-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3
1086 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
1087 ; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
1088 ; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1089 ; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
1090 ; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1091 ; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1092 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1093 ; AVX1-SLOW-NEXT: retq
1095 ; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
1096 ; AVX-FAST: # %bb.0:
1097 ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
1098 ; AVX-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1099 ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
1100 ; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
1101 ; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
1102 ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
1103 ; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
1104 ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
1105 ; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
1106 ; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
1107 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1108 ; AVX-FAST-NEXT: retq
1110 ; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32:
1111 ; AVX2-SLOW: # %bb.0:
1112 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
1113 ; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1114 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
1115 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
1116 ; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1
1117 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
1118 ; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
1119 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
1120 ; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm3, %xmm3
1121 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1122 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
1123 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
1124 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1125 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm1
1126 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2
1127 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1128 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1129 ; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1130 ; AVX2-SLOW-NEXT: retq
1131 %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0)
1132 %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1)
1133 %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2)
1134 %8 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %3)
1135 %9 = insertelement <4 x i32> undef, i32 %5, i32 0
1136 %10 = insertelement <4 x i32> %9, i32 %6, i32 1
1137 %11 = insertelement <4 x i32> %10, i32 %7, i32 2
1138 %12 = insertelement <4 x i32> %11, i32 %8, i32 3
1139 ret <4 x i32> %12
1140 }
1141 declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)