; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST

; Vectorized Pairwise Sum Reductions
;
; inline STYPE sum(VTYPE x) {
;   return (x[0] + x[1]) + (x[2] + x[3]);
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }
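;
; A minimal scalar C sketch of the pattern above, assuming VTYPE is a 4 x float
; vector and STYPE is float (the concrete types and names below are an
; illustrative assumption, not part of the test):
;
;   typedef float v4sf __attribute__((vector_size(16)));   // assumed VTYPE
;   static inline float sum(v4sf x) {
;     return (x[0] + x[1]) + (x[2] + x[3]);                 // pairwise association
;   }
;   v4sf sum4(v4sf a0, v4sf a1, v4sf a2, v4sf a3) {
;     return (v4sf){ sum(a0), sum(a1), sum(a2), sum(a3) };
;   }
;
; The pairwise association maps naturally onto horizontal adds, which is what
; the *-FAST check prefixes below expect (haddps / phaddd chains).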
define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
20 ; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32:
21 ; SSSE3-SLOW: # %bb.0:
22 ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
23 ; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3
24 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm0
25 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
26 ; SSSE3-SLOW-NEXT: retq
28 ; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
29 ; SSSE3-FAST: # %bb.0:
30 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
31 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
32 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0
33 ; SSSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
38 ; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
39 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
40 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
41 ; AVX-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
42 ; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
43 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: pair_sum_v4f32_v4f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm2, %xmm1
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
53 %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
54 %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
55 %7 = fadd <2 x float> %5, %6
56 %8 = shufflevector <2 x float> %7, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
57 %9 = fadd <2 x float> %7, %8
58 %10 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
59 %11 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
60 %12 = fadd <2 x float> %10, %11
61 %13 = shufflevector <2 x float> %12, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
62 %14 = fadd <2 x float> %12, %13
63 %15 = shufflevector <2 x float> %9, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
64 %16 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
65 %17 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
66 %18 = fadd <2 x float> %16, %17
67 %19 = shufflevector <2 x float> %18, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
68 %20 = fadd <2 x float> %18, %19
69 %21 = shufflevector <2 x float> %20, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
70 %22 = shufflevector <4 x float> %15, <4 x float> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
71 %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
72 %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
73 %25 = fadd <2 x float> %23, %24
74 %26 = shufflevector <2 x float> %25, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
75 %27 = fadd <2 x float> %25, %26
76 %28 = shufflevector <2 x float> %27, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x float> %22, <4 x float> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %29
}
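
; Integer analogue of the test above: the same pairwise pattern over <4 x i32>
; values, which the *-FAST runs lower to a phaddd/vphaddd chain instead of
; haddps.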
define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
82 ; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32:
83 ; SSSE3-SLOW: # %bb.0:
84 ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
85 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
86 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
87 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
88 ; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm3
89 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
90 ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
91 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
92 ; SSSE3-SLOW-NEXT: retq
94 ; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
95 ; SSSE3-FAST: # %bb.0:
96 ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
97 ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
98 ; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0
99 ; SSSE3-FAST-NEXT: retq
101 ; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32:
102 ; AVX1-SLOW: # %bb.0:
103 ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
104 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
105 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
106 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
107 ; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
108 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
109 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
110 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
111 ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
112 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
113 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
114 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
115 ; AVX1-SLOW-NEXT: retq
; AVX-FAST-LABEL: pair_sum_v4i32_v4i32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
120 ; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1
121 ; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
122 ; AVX-FAST-NEXT: retq
124 ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
125 ; AVX2-SLOW: # %bb.0:
126 ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
127 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
128 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
129 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
130 ; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
131 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
132 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
133 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
134 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
135 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2
136 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
137 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
138 ; AVX2-SLOW-NEXT: retq
139 %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
140 %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
141 %7 = add <2 x i32> %5, %6
142 %8 = shufflevector <2 x i32> %7, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
143 %9 = add <2 x i32> %7, %8
144 %10 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
145 %11 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
146 %12 = add <2 x i32> %10, %11
147 %13 = shufflevector <2 x i32> %12, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
148 %14 = add <2 x i32> %12, %13
149 %15 = shufflevector <2 x i32> %9, <2 x i32> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
150 %16 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
151 %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
152 %18 = add <2 x i32> %16, %17
153 %19 = shufflevector <2 x i32> %18, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
154 %20 = add <2 x i32> %18, %19
155 %21 = shufflevector <2 x i32> %20, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
156 %22 = shufflevector <4 x i32> %15, <4 x i32> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
157 %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
158 %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
159 %25 = add <2 x i32> %23, %24
160 %26 = shufflevector <2 x i32> %25, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
161 %27 = add <2 x i32> %25, %26
162 %28 = shufflevector <2 x i32> %27, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x i32> %22, <4 x i32> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %29
}
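
; Same pairwise reduction, but eight <4 x float> inputs are reduced into a
; single <8 x float> result, so on the AVX targets the lowering also has to
; assemble a 256-bit value (vinsertf128 in the checks below).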
define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7) {
168 ; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32:
169 ; SSSE3-SLOW: # %bb.0:
170 ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
171 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
172 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
173 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
174 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
175 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2
176 ; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5
177 ; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2
178 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
179 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
180 ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
181 ; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
182 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
183 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
184 ; SSSE3-SLOW-NEXT: retq
186 ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
187 ; SSSE3-FAST: # %bb.0:
188 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
189 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
190 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
191 ; SSSE3-FAST-NEXT: haddps %xmm5, %xmm4
192 ; SSSE3-FAST-NEXT: haddps %xmm4, %xmm2
193 ; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
194 ; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6
195 ; SSSE3-FAST-NEXT: haddps %xmm6, %xmm4
196 ; SSSE3-FAST-NEXT: movaps %xmm4, %xmm1
197 ; SSSE3-FAST-NEXT: retq
199 ; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
200 ; AVX1-SLOW: # %bb.0:
201 ; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
202 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
203 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
204 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
205 ; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
206 ; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
207 ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
208 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
209 ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
210 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
211 ; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
212 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
213 ; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
214 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
215 ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
216 ; AVX1-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
217 ; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
218 ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
219 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
220 ; AVX1-SLOW-NEXT: retq
222 ; AVX1-FAST-LABEL: pair_sum_v8f32_v4f32:
223 ; AVX1-FAST: # %bb.0:
224 ; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
225 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
226 ; AVX1-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
227 ; AVX1-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
228 ; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
229 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
230 ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
231 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
232 ; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
233 ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
234 ; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
235 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
236 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
237 ; AVX1-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
238 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
239 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
240 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
241 ; AVX1-FAST-NEXT: retq
243 ; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
244 ; AVX2-SLOW: # %bb.0:
245 ; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
246 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
247 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
248 ; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
249 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
250 ; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8
251 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
252 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
253 ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
254 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3
255 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
256 ; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
257 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
258 ; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
259 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
260 ; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
261 ; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
262 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
263 ; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
264 ; AVX2-SLOW-NEXT: retq
266 ; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
267 ; AVX2-FAST: # %bb.0:
268 ; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
269 ; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
270 ; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
271 ; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8
272 ; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
273 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
274 ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
275 ; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3
276 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
277 ; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
278 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
279 ; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
280 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
281 ; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
282 ; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
283 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
284 ; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
285 ; AVX2-FAST-NEXT: retq
286 %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
287 %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
288 %11 = fadd <2 x float> %9, %10
289 %12 = shufflevector <2 x float> %11, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
290 %13 = fadd <2 x float> %11, %12
291 %14 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
292 %15 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
293 %16 = fadd <2 x float> %14, %15
294 %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
295 %18 = fadd <2 x float> %16, %17
296 %19 = shufflevector <2 x float> %13, <2 x float> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
297 %20 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
298 %21 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
299 %22 = fadd <2 x float> %20, %21
300 %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
301 %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
302 %25 = fadd <2 x float> %23, %24
303 %26 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 0, i32 2>
304 %27 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 1, i32 3>
305 %28 = fadd <2 x float> %26, %27
306 %29 = shufflevector <2 x float> %28, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
307 %30 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 0, i32 2>
308 %31 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 1, i32 3>
309 %32 = fadd <2 x float> %30, %31
310 %33 = shufflevector <2 x float> %32, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
311 %34 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
312 %35 = shufflevector <4 x float> %34, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
313 %36 = shufflevector <4 x float> %35, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
314 %37 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
315 %38 = shufflevector <4 x float> %37, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
316 %39 = shufflevector <4 x float> %38, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
317 %40 = fadd <4 x float> %36, %39
318 %41 = shufflevector <4 x float> %40, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
319 %42 = shufflevector <8 x float> %19, <8 x float> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
320 %43 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 0, i32 2>
321 %44 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 1, i32 3>
322 %45 = fadd <2 x float> %43, %44
323 %46 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 0, i32 2>
324 %47 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 1, i32 3>
325 %48 = fadd <2 x float> %46, %47
326 %49 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 0, i32 2>
327 %50 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 1, i32 3>
328 %51 = fadd <2 x float> %49, %50
329 %52 = shufflevector <2 x float> %51, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x float> %42, <8 x float> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x float> %53
}
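
; Integer version of the 8-wide pairwise reduction: an <8 x i32> built from
; eight <4 x i32> inputs, using phaddd/vphaddd plus 128-bit insertion
; (vinsertf128 / vinserti128) on the AVX targets.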
define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, <4 x i32> %6, <4 x i32> %7) {
335 ; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32:
336 ; SSSE3-SLOW: # %bb.0:
337 ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
338 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
339 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
340 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
341 ; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2
342 ; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5
343 ; SSSE3-SLOW-NEXT: phaddd %xmm5, %xmm2
344 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,3,2]
345 ; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
346 ; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
347 ; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
348 ; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
349 ; SSSE3-SLOW-NEXT: movdqa %xmm6, %xmm1
350 ; SSSE3-SLOW-NEXT: retq
352 ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
353 ; SSSE3-FAST: # %bb.0:
354 ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
355 ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
356 ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
357 ; SSSE3-FAST-NEXT: phaddd %xmm5, %xmm4
358 ; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm2
359 ; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
360 ; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm6
361 ; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm7
362 ; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm6
363 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2]
364 ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
365 ; SSSE3-FAST-NEXT: retq
367 ; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32:
368 ; AVX1-SLOW: # %bb.0:
369 ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
370 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
371 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
372 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
373 ; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
374 ; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
375 ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
376 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
377 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
378 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
379 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
380 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
381 ; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
382 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
383 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
384 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
385 ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
386 ; AVX1-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm2
387 ; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2
388 ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
389 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
390 ; AVX1-SLOW-NEXT: retq
392 ; AVX1-FAST-LABEL: pair_sum_v8i32_v4i32:
393 ; AVX1-FAST: # %bb.0:
394 ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
395 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
396 ; AVX1-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
397 ; AVX1-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
398 ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
399 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
400 ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
401 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
402 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
403 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
404 ; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
405 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
406 ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
407 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
408 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
409 ; AVX1-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2
410 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm2, %xmm2
411 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
412 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
413 ; AVX1-FAST-NEXT: retq
415 ; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32:
416 ; AVX2-SLOW: # %bb.0:
417 ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
418 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
419 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
420 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
421 ; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
422 ; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
423 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
424 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
425 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
426 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
427 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
428 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
429 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
430 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
431 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
432 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
433 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
434 ; AVX2-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm1
435 ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
436 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
437 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
438 ; AVX2-SLOW-NEXT: retq
440 ; AVX2-FAST-LABEL: pair_sum_v8i32_v4i32:
441 ; AVX2-FAST: # %bb.0:
442 ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
443 ; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
444 ; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
445 ; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
446 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
447 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
448 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
449 ; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5
450 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
451 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
452 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
453 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
454 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
455 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
456 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
457 ; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm1
458 ; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm1
459 ; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1
460 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
461 ; AVX2-FAST-NEXT: retq
462 %9 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
463 %10 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
464 %11 = add <2 x i32> %9, %10
465 %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
466 %13 = add <2 x i32> %11, %12
467 %14 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
468 %15 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
469 %16 = add <2 x i32> %14, %15
470 %17 = shufflevector <2 x i32> %16, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
471 %18 = add <2 x i32> %16, %17
472 %19 = shufflevector <2 x i32> %13, <2 x i32> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
473 %20 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
474 %21 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
475 %22 = add <2 x i32> %20, %21
476 %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
477 %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
478 %25 = add <2 x i32> %23, %24
479 %26 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
480 %27 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
481 %28 = add <2 x i32> %26, %27
482 %29 = shufflevector <2 x i32> %28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
483 %30 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
484 %31 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
485 %32 = add <2 x i32> %30, %31
486 %33 = shufflevector <2 x i32> %32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
487 %34 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
488 %35 = shufflevector <4 x i32> %34, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
489 %36 = shufflevector <4 x i32> %35, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
490 %37 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
491 %38 = shufflevector <4 x i32> %37, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
492 %39 = shufflevector <4 x i32> %38, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
493 %40 = add <4 x i32> %36, %39
494 %41 = shufflevector <4 x i32> %40, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
495 %42 = shufflevector <8 x i32> %19, <8 x i32> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
496 %43 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
497 %44 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
498 %45 = add <2 x i32> %43, %44
499 %46 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
500 %47 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
501 %48 = add <2 x i32> %46, %47
502 %49 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 0, i32 2>
503 %50 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 1, i32 3>
504 %51 = add <2 x i32> %49, %50
505 %52 = shufflevector <2 x i32> %51, <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x i32> %42, <8 x i32> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x i32> %53
}
; Vectorized Sequential Sum Reductions
;
; inline STYPE sum(VTYPE x) {
;   return ((x[0] + x[1]) + x[2]) + x[3];
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }
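;
; A scalar sketch of how this differs from the pairwise section above, using
; the same illustrative v4sf/float types assumed earlier:
;
;   static inline float sum(v4sf x) {
;     return ((x[0] + x[1]) + x[2]) + x[3];   // strictly left-to-right
;   }
;
; Without reassociation this floating-point ordering is not interchangeable
; with the pairwise one, so the f32 checks below keep an explicit chain of
; adds rather than folding everything into haddps.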
define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
521 ; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32:
522 ; SSSE3-SLOW: # %bb.0:
523 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
524 ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5
525 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
526 ; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
527 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
528 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
529 ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0
530 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1]
531 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
532 ; SSSE3-SLOW-NEXT: addps %xmm5, %xmm4
533 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
534 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
535 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
536 ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0
537 ; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
538 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
539 ; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
540 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
541 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
542 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
543 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
544 ; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0
545 ; SSSE3-SLOW-NEXT: retq
547 ; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
548 ; SSSE3-FAST: # %bb.0:
549 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
550 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
551 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5
552 ; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
553 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
554 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
555 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
556 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
557 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1]
558 ; SSSE3-FAST-NEXT: addps %xmm5, %xmm0
559 ; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
560 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
561 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
562 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
563 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
564 ; SSSE3-FAST-NEXT: addps %xmm1, %xmm2
565 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
566 ; SSSE3-FAST-NEXT: addps %xmm2, %xmm3
567 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
568 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
569 ; SSSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm4
574 ; AVX-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
575 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
576 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
577 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
578 ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
579 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
580 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
581 ; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
582 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
583 ; AVX-SLOW-NEXT: vaddps %xmm3, %xmm4, %xmm4
584 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
585 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
586 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
587 ; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
588 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
589 ; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm4
594 ; AVX-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
595 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
596 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
597 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
598 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
599 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
600 ; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
601 ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
602 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
603 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
604 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
605 ; AVX-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
606 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
607 ; AVX-FAST-NEXT: retq
608 %5 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 0, i32 4>
609 %6 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 1, i32 5>
610 %7 = fadd <2 x float> %5, %6
611 %8 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 6>
612 %9 = fadd <2 x float> %8, %7
613 %10 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 7>
614 %11 = fadd <2 x float> %10, %9
615 %12 = shufflevector <2 x float> %11, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
616 %13 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
617 %14 = fadd <4 x float> %13, %2
618 %15 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
619 %16 = fadd <4 x float> %15, %14
620 %17 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
621 %18 = fadd <4 x float> %17, %16
622 %19 = shufflevector <4 x float> %12, <4 x float> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
623 %20 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
624 %21 = fadd <4 x float> %20, %3
625 %22 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
626 %23 = fadd <4 x float> %22, %21
627 %24 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
628 %25 = fadd <4 x float> %24, %23
  %26 = shufflevector <4 x float> %19, <4 x float> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %26
}
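
; Integer variant of the sequential reduction; integer addition is associative,
; so unlike the f32 case above the result does not depend on the association
; order.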
define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
634 ; SSSE3-SLOW-LABEL: sequential_sum_v4i32_v4i32:
635 ; SSSE3-SLOW: # %bb.0:
636 ; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm4
637 ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm4
638 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
639 ; SSSE3-SLOW-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
640 ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
641 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
642 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
643 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
644 ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5
645 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
646 ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
647 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
648 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6
649 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3]
650 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
651 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
652 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
653 ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
654 ; SSSE3-SLOW-NEXT: retq
656 ; SSSE3-FAST-LABEL: sequential_sum_v4i32_v4i32:
657 ; SSSE3-FAST: # %bb.0:
658 ; SSSE3-FAST-NEXT: movdqa %xmm0, %xmm4
659 ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm4
660 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
661 ; SSSE3-FAST-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
662 ; SSSE3-FAST-NEXT: paddd %xmm0, %xmm4
663 ; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1
664 ; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
665 ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
666 ; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm5
667 ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm5
668 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
669 ; SSSE3-FAST-NEXT: paddd %xmm5, %xmm6
670 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3]
671 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
672 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
673 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
674 ; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
675 ; SSSE3-FAST-NEXT: retq
677 ; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32:
678 ; AVX1-SLOW: # %bb.0:
679 ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
680 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
681 ; AVX1-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
682 ; AVX1-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
683 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
684 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
685 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
686 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
687 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
688 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
689 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
690 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
691 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
692 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
693 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
694 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
695 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
696 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
697 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
698 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
699 ; AVX1-SLOW-NEXT: retq
701 ; AVX1-FAST-LABEL: sequential_sum_v4i32_v4i32:
702 ; AVX1-FAST: # %bb.0:
703 ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
704 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
705 ; AVX1-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
706 ; AVX1-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
707 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
708 ; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
709 ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
710 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
711 ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
712 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
713 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
714 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
715 ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
716 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
717 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
718 ; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
719 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
720 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
721 ; AVX1-FAST-NEXT: retq
723 ; AVX2-SLOW-LABEL: sequential_sum_v4i32_v4i32:
724 ; AVX2-SLOW: # %bb.0:
725 ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
726 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
727 ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
728 ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
729 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
730 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
731 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
732 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
733 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
734 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
735 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
736 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
737 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
738 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1
739 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2
740 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
741 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
742 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
743 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
744 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
745 ; AVX2-SLOW-NEXT: retq
747 ; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32:
748 ; AVX2-FAST: # %bb.0:
749 ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
750 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
751 ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
752 ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
753 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
754 ; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
755 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
756 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
757 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
758 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
759 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
760 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
761 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
762 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
763 ; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
764 ; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
765 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
766 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
767 ; AVX2-FAST-NEXT: retq
768 %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
769 %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 1, i32 5>
770 %7 = add <2 x i32> %5, %6
771 %8 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 2, i32 6>
772 %9 = add <2 x i32> %8, %7
773 %10 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 3, i32 7>
774 %11 = add <2 x i32> %10, %9
775 %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
776 %13 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
777 %14 = add <4 x i32> %13, %2
778 %15 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
779 %16 = add <4 x i32> %15, %14
780 %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
781 %18 = add <4 x i32> %17, %16
782 %19 = shufflevector <4 x i32> %12, <4 x i32> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
783 %20 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
784 %21 = add <4 x i32> %20, %3
785 %22 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
786 %23 = add <4 x i32> %22, %21
787 %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
788 %25 = add <4 x i32> %24, %23
  %26 = shufflevector <4 x i32> %19, <4 x i32> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %26
}
; Vectorized Reductions
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { reduce( A0 ), reduce( A1 ), reduce( A2 ), reduce( A3 ) };
; }
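;
; Here each lane is produced by a generic reduction intrinsic rather than by
; explicit shuffles. As a scalar sketch (same illustrative types as before),
; the non-reassociative float case below behaves like:
;
;   float reduce(v4sf x) {
;     float r = -0.0f;                 // matches the -0.0 start value in the IR
;     for (int i = 0; i != 4; ++i)
;       r += x[i];                     // strictly ordered fadd reduction
;     return r;
;   }
;
; The tests express this with llvm.vector.reduce.fadd / llvm.vector.reduce.add,
; declared later in this file.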
define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
800 ; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32:
801 ; SSSE3-SLOW: # %bb.0:
802 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
803 ; SSSE3-SLOW-NEXT: addss %xmm0, %xmm4
804 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
805 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
806 ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
807 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
808 ; SSSE3-SLOW-NEXT: addss %xmm5, %xmm0
809 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
810 ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
811 ; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
812 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
813 ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
814 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
815 ; SSSE3-SLOW-NEXT: addss %xmm5, %xmm1
816 ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
817 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
818 ; SSSE3-SLOW-NEXT: addss %xmm2, %xmm1
819 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm4
820 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
821 ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
822 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
823 ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm2
824 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
825 ; SSSE3-SLOW-NEXT: addss %xmm3, %xmm1
826 ; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm4
827 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
828 ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
829 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
830 ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm3
831 ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
832 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
833 ; SSSE3-SLOW-NEXT: retq
835 ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
836 ; SSSE3-FAST: # %bb.0:
837 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
838 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
839 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
840 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
841 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
842 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
843 ; SSSE3-FAST-NEXT: addss %xmm5, %xmm0
844 ; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
845 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
846 ; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5
847 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
848 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
849 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
850 ; SSSE3-FAST-NEXT: addss %xmm5, %xmm1
851 ; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
852 ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
853 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
854 ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
855 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
856 ; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
857 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
858 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm2
859 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
860 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
861 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
862 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
863 ; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
864 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
865 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm3
866 ; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
867 ; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
868 ; SSSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
873 ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm4
874 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0]
875 ; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
876 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
877 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm4, %xmm0
878 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
879 ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm4
880 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
881 ; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
882 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
883 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm4, %xmm1
884 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
885 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
886 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1
887 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
888 ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1
889 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
890 ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
891 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
892 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
893 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1
894 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
895 ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
896 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
897 ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
898 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
899 ; AVX-SLOW-NEXT: retq
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm4
904 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0]
905 ; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
906 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
907 ; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0
908 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4
909 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
910 ; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
911 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
912 ; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
913 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
914 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
915 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
916 ; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
917 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
918 ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
919 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
920 ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
921 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
922 ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
923 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
924 ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
925 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
926 ; AVX-FAST-NEXT: retq
927 %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
928 %6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
929 %7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
930 %8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
931 %9 = insertelement <4 x float> undef, float %5, i32 0
932 %10 = insertelement <4 x float> %9, float %6, i32 1
933 %11 = insertelement <4 x float> %10, float %7, i32 2
  %12 = insertelement <4 x float> %11, float %8, i32 3
  ret <4 x float> %12
}
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
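
; The next test repeats the f32 reduction with the 'reassoc' fast-math flag on
; the intrinsic calls, which permits the backend to reassociate the adds (see
; the haddps-based *-FAST lowerings below).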
define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
940 ; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
941 ; SSSE3-SLOW: # %bb.0:
942 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
943 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
944 ; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
945 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
946 ; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
947 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
948 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5
949 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
950 ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
951 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
952 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
953 ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
954 ; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2
955 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
956 ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
957 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3
958 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
959 ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
960 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
961 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
962 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
963 ; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
964 ; SSSE3-SLOW-NEXT: retq
966 ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
967 ; SSSE3-FAST: # %bb.0:
968 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
969 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
970 ; SSSE3-FAST-NEXT: addps %xmm4, %xmm0
971 ; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
972 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
973 ; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
974 ; SSSE3-FAST-NEXT: haddps %xmm4, %xmm0
975 ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
976 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
977 ; SSSE3-FAST-NEXT: addps %xmm2, %xmm1
978 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
979 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
980 ; SSSE3-FAST-NEXT: addps %xmm3, %xmm2
981 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
982 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
983 ; SSSE3-FAST-NEXT: retq
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
988 ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
989 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
990 ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
991 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
992 ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2
993 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0]
994 ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3
995 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1]
996 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
997 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
998 ; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
999 ; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1000 ; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
1001 ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
1002 ; AVX-SLOW-NEXT: retq
1004 ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
1005 ; AVX-FAST: # %bb.0:
1006 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
1007 ; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0
1008 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
1009 ; AVX-FAST-NEXT: vaddps %xmm4, %xmm1, %xmm1
1010 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1011 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
1012 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
1013 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
1014 ; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2
1015 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1
1016 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1017 ; AVX-FAST-NEXT: retq
1018 %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
1019 %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
1020 %7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
1021 %8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
1022 %9 = insertelement <4 x float> undef, float %5, i32 0
1023 %10 = insertelement <4 x float> %9, float %6, i32 1
1024 %11 = insertelement <4 x float> %10, float %7, i32 2
  %12 = insertelement <4 x float> %11, float %8, i32 3
  ret <4 x float> %12
}
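
; Integer reduction variant: each lane is an llvm.vector.reduce.add over one of
; the <4 x i32> inputs.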
define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
1030 ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
1031 ; SSSE3-SLOW: # %bb.0:
1032 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
1033 ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
1034 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
1035 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
1036 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
1037 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
1038 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1039 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
1040 ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
1041 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
1042 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
1043 ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6
1044 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
1045 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1046 ; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
1047 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
1048 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
1049 ; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1050 ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
1051 ; SSSE3-SLOW-NEXT: retq
1053 ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
1054 ; SSSE3-FAST: # %bb.0:
1055 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
1056 ; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
1057 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
1058 ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4
1059 ; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm0
1060 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
1061 ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
1062 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
1063 ; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2
1064 ; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
1065 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1066 ; SSSE3-FAST-NEXT: retq
1068 ; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32:
1069 ; AVX1-SLOW: # %bb.0:
1070 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
1071 ; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1072 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
1073 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
1074 ; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1
1075 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
1076 ; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1077 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
1078 ; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
1079 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
1080 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
1081 ; AVX1-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3
1082 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
1083 ; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
1084 ; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1085 ; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
1086 ; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1087 ; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1088 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1089 ; AVX1-SLOW-NEXT: retq
1091 ; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
1092 ; AVX-FAST: # %bb.0:
1093 ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
1094 ; AVX-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1095 ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
1096 ; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
1097 ; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
1098 ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
1099 ; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
1100 ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
1101 ; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
1102 ; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
1103 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1104 ; AVX-FAST-NEXT: retq
1106 ; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32:
1107 ; AVX2-SLOW: # %bb.0:
1108 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
1109 ; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1110 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
1111 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
1112 ; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1
1113 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
1114 ; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
1115 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
1116 ; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm3, %xmm3
1117 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1118 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
1119 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
1120 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1121 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm1
1122 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2
1123 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1124 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1125 ; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1126 ; AVX2-SLOW-NEXT: retq
1127 %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0)
1128 %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1)
1129 %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2)
1130 %8 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %3)
1131 %9 = insertelement <4 x i32> undef, i32 %5, i32 0
1132 %10 = insertelement <4 x i32> %9, i32 %6, i32 1
1133 %11 = insertelement <4 x i32> %10, i32 %7, i32 2
  %12 = insertelement <4 x i32> %11, i32 %8, i32 3
  ret <4 x i32> %12
}
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)