1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSE3
3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSE3
4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSSE3,SSSE3_SLOW
5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSSE3,SSSE3_FAST
6 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
7 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
8 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
9 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST
11 ; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
12 ; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
14 define <4 x float> @hadd_v4f32(<4 x float> %a) {
15 ; SSE-LABEL: hadd_v4f32:
17 ; SSE-NEXT: haddps %xmm0, %xmm0
20 ; AVX-LABEL: hadd_v4f32:
22 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
24 %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
25 %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
26 %hop = fadd <2 x float> %a02, %a13
27 %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
31 define <8 x float> @hadd_v8f32a(<8 x float> %a) {
32 ; SSE_SLOW-LABEL: hadd_v8f32a:
34 ; SSE_SLOW-NEXT: movaps %xmm0, %xmm2
35 ; SSE_SLOW-NEXT: haddps %xmm1, %xmm2
36 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
37 ; SSE_SLOW-NEXT: movaps %xmm2, %xmm1
40 ; SSE_FAST-LABEL: hadd_v8f32a:
42 ; SSE_FAST-NEXT: movaps %xmm0, %xmm2
43 ; SSE_FAST-NEXT: haddps %xmm1, %xmm2
44 ; SSE_FAST-NEXT: haddps %xmm0, %xmm0
45 ; SSE_FAST-NEXT: movaps %xmm2, %xmm1
48 ; AVX1_SLOW-LABEL: hadd_v8f32a:
50 ; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
51 ; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
52 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
53 ; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
54 ; AVX1_SLOW-NEXT: retq
56 ; AVX1_FAST-LABEL: hadd_v8f32a:
58 ; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
59 ; AVX1_FAST-NEXT: vhaddps %ymm0, %ymm1, %ymm0
60 ; AVX1_FAST-NEXT: retq
62 ; AVX2-LABEL: hadd_v8f32a:
64 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
65 ; AVX2-NEXT: vhaddps %xmm1, %xmm0, %xmm0
66 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
68 %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
69 %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
70 %hop = fadd <4 x float> %a0, %a1
71 %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
75 define <8 x float> @hadd_v8f32b(<8 x float> %a) {
76 ; SSE-LABEL: hadd_v8f32b:
78 ; SSE-NEXT: haddps %xmm0, %xmm0
79 ; SSE-NEXT: haddps %xmm1, %xmm1
82 ; AVX-LABEL: hadd_v8f32b:
84 ; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
86 %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
87 %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
88 %hop = fadd <8 x float> %a0, %a1
89 %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
93 define <4 x float> @hsub_v4f32(<4 x float> %a) {
94 ; SSE-LABEL: hsub_v4f32:
96 ; SSE-NEXT: hsubps %xmm0, %xmm0
99 ; AVX-LABEL: hsub_v4f32:
101 ; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
103 %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
104 %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
105 %hop = fsub <2 x float> %a02, %a13
106 %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
107 ret <4 x float> %shuf
110 define <8 x float> @hsub_v8f32a(<8 x float> %a) {
111 ; SSE_SLOW-LABEL: hsub_v8f32a:
113 ; SSE_SLOW-NEXT: movaps %xmm0, %xmm2
114 ; SSE_SLOW-NEXT: hsubps %xmm1, %xmm2
115 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
116 ; SSE_SLOW-NEXT: movaps %xmm2, %xmm1
117 ; SSE_SLOW-NEXT: retq
119 ; SSE_FAST-LABEL: hsub_v8f32a:
121 ; SSE_FAST-NEXT: movaps %xmm0, %xmm2
122 ; SSE_FAST-NEXT: hsubps %xmm1, %xmm2
123 ; SSE_FAST-NEXT: hsubps %xmm0, %xmm0
124 ; SSE_FAST-NEXT: movaps %xmm2, %xmm1
125 ; SSE_FAST-NEXT: retq
127 ; AVX1_SLOW-LABEL: hsub_v8f32a:
128 ; AVX1_SLOW: # %bb.0:
129 ; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
130 ; AVX1_SLOW-NEXT: vhsubps %xmm1, %xmm0, %xmm0
131 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
132 ; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
133 ; AVX1_SLOW-NEXT: retq
135 ; AVX1_FAST-LABEL: hsub_v8f32a:
136 ; AVX1_FAST: # %bb.0:
137 ; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
138 ; AVX1_FAST-NEXT: vhsubps %ymm0, %ymm1, %ymm0
139 ; AVX1_FAST-NEXT: retq
141 ; AVX2-LABEL: hsub_v8f32a:
143 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
144 ; AVX2-NEXT: vhsubps %xmm1, %xmm0, %xmm0
145 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
147 %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
148 %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
149 %hop = fsub <4 x float> %a0, %a1
150 %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
151 ret <8 x float> %shuf
154 define <8 x float> @hsub_v8f32b(<8 x float> %a) {
155 ; SSE-LABEL: hsub_v8f32b:
157 ; SSE-NEXT: hsubps %xmm0, %xmm0
158 ; SSE-NEXT: hsubps %xmm1, %xmm1
161 ; AVX-LABEL: hsub_v8f32b:
163 ; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
165 %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
166 %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
167 %hop = fsub <8 x float> %a0, %a1
168 %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
169 ret <8 x float> %shuf
172 define <2 x double> @hadd_v2f64(<2 x double> %a) {
173 ; SSE_SLOW-LABEL: hadd_v2f64:
175 ; SSE_SLOW-NEXT: movapd %xmm0, %xmm1
176 ; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
177 ; SSE_SLOW-NEXT: addsd %xmm0, %xmm1
178 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
179 ; SSE_SLOW-NEXT: retq
181 ; SSE_FAST-LABEL: hadd_v2f64:
183 ; SSE_FAST-NEXT: haddpd %xmm0, %xmm0
184 ; SSE_FAST-NEXT: retq
186 ; AVX1_SLOW-LABEL: hadd_v2f64:
187 ; AVX1_SLOW: # %bb.0:
188 ; AVX1_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
189 ; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
190 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
191 ; AVX1_SLOW-NEXT: retq
193 ; AVX1_FAST-LABEL: hadd_v2f64:
194 ; AVX1_FAST: # %bb.0:
195 ; AVX1_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
196 ; AVX1_FAST-NEXT: retq
198 ; AVX2_SLOW-LABEL: hadd_v2f64:
199 ; AVX2_SLOW: # %bb.0:
200 ; AVX2_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
201 ; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
202 ; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
203 ; AVX2_SLOW-NEXT: retq
205 ; AVX2_FAST-LABEL: hadd_v2f64:
206 ; AVX2_FAST: # %bb.0:
207 ; AVX2_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
208 ; AVX2_FAST-NEXT: retq
209 %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
210 %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
211 %hop = fadd <2 x double> %a0, %a1
212 %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 0, i32 0>
213 ret <2 x double> %shuf
216 define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) {
217 ; SSE_SLOW-LABEL: hadd_v2f64_scalar_splat:
219 ; SSE_SLOW-NEXT: movapd %xmm0, %xmm1
220 ; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
221 ; SSE_SLOW-NEXT: addsd %xmm0, %xmm1
222 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
223 ; SSE_SLOW-NEXT: retq
225 ; SSE_FAST-LABEL: hadd_v2f64_scalar_splat:
227 ; SSE_FAST-NEXT: haddpd %xmm0, %xmm0
228 ; SSE_FAST-NEXT: retq
230 ; AVX1_SLOW-LABEL: hadd_v2f64_scalar_splat:
231 ; AVX1_SLOW: # %bb.0:
232 ; AVX1_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
233 ; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
234 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
235 ; AVX1_SLOW-NEXT: retq
237 ; AVX1_FAST-LABEL: hadd_v2f64_scalar_splat:
238 ; AVX1_FAST: # %bb.0:
239 ; AVX1_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
240 ; AVX1_FAST-NEXT: retq
242 ; AVX2_SLOW-LABEL: hadd_v2f64_scalar_splat:
243 ; AVX2_SLOW: # %bb.0:
244 ; AVX2_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
245 ; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
246 ; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
247 ; AVX2_SLOW-NEXT: retq
249 ; AVX2_FAST-LABEL: hadd_v2f64_scalar_splat:
250 ; AVX2_FAST: # %bb.0:
251 ; AVX2_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
252 ; AVX2_FAST-NEXT: retq
253 %a0 = extractelement <2 x double> %a, i32 0
254 %a1 = extractelement <2 x double> %a, i32 1
255 %hop = fadd double %a0, %a1
256 %ins = insertelement <2 x double> undef, double %hop, i32 0
257 %shuf = shufflevector <2 x double> %ins, <2 x double> undef, <2 x i32> <i32 0, i32 0>
258 ret <2 x double> %shuf
261 define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) {
262 ; SSE_SLOW-LABEL: hadd_v4f64_scalar_splat:
264 ; SSE_SLOW-NEXT: movapd %xmm0, %xmm2
265 ; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
266 ; SSE_SLOW-NEXT: addsd %xmm0, %xmm2
267 ; SSE_SLOW-NEXT: movapd %xmm1, %xmm3
268 ; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
269 ; SSE_SLOW-NEXT: addsd %xmm1, %xmm3
270 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
271 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
272 ; SSE_SLOW-NEXT: retq
274 ; SSE_FAST-LABEL: hadd_v4f64_scalar_splat:
276 ; SSE_FAST-NEXT: haddpd %xmm0, %xmm0
277 ; SSE_FAST-NEXT: haddpd %xmm1, %xmm1
278 ; SSE_FAST-NEXT: retq
280 ; AVX-LABEL: hadd_v4f64_scalar_splat:
282 ; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
284 %a0 = extractelement <4 x double> %a, i32 0
285 %a1 = extractelement <4 x double> %a, i32 1
286 %hop0 = fadd double %a0, %a1
287 %a2 = extractelement <4 x double> %a, i32 2
288 %a3 = extractelement <4 x double> %a, i32 3
289 %hop1 = fadd double %a2, %a3
290 %ins = insertelement <4 x double> undef, double %hop0, i32 0
291 %ins2 = insertelement <4 x double> %ins, double %hop1, i32 2
292 %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
293 ret <4 x double> %shuf
296 define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) {
297 ; SSE_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
299 ; SSE_SLOW-NEXT: movapd %xmm0, %xmm1
300 ; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
301 ; SSE_SLOW-NEXT: addsd %xmm0, %xmm1
302 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
303 ; SSE_SLOW-NEXT: movapd %xmm0, %xmm1
304 ; SSE_SLOW-NEXT: retq
306 ; SSE_FAST-LABEL: hadd_v4f64_scalar_broadcast:
308 ; SSE_FAST-NEXT: haddpd %xmm0, %xmm0
309 ; SSE_FAST-NEXT: movapd %xmm0, %xmm1
310 ; SSE_FAST-NEXT: retq
312 ; AVX1_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
313 ; AVX1_SLOW: # %bb.0:
314 ; AVX1_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
315 ; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
316 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
317 ; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
318 ; AVX1_SLOW-NEXT: retq
320 ; AVX1_FAST-LABEL: hadd_v4f64_scalar_broadcast:
321 ; AVX1_FAST: # %bb.0:
322 ; AVX1_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
323 ; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
324 ; AVX1_FAST-NEXT: retq
326 ; AVX2_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
327 ; AVX2_SLOW: # %bb.0:
328 ; AVX2_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
329 ; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
330 ; AVX2_SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
331 ; AVX2_SLOW-NEXT: retq
333 ; AVX2_FAST-LABEL: hadd_v4f64_scalar_broadcast:
334 ; AVX2_FAST: # %bb.0:
335 ; AVX2_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
336 ; AVX2_FAST-NEXT: vbroadcastsd %xmm0, %ymm0
337 ; AVX2_FAST-NEXT: retq
338 %a0 = extractelement <4 x double> %a, i32 0
339 %a1 = extractelement <4 x double> %a, i32 1
340 %hop0 = fadd double %a0, %a1
341 %a2 = extractelement <4 x double> %a, i32 2
342 %a3 = extractelement <4 x double> %a, i32 3
343 %hop1 = fadd double %a2, %a3
344 %ins = insertelement <4 x double> undef, double %hop0, i32 0
345 %ins2 = insertelement <4 x double> %ins, double %hop1, i32 2
346 %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
347 ret <4 x double> %shuf
350 define <4 x double> @hadd_v4f64(<4 x double> %a) {
351 ; SSE_SLOW-LABEL: hadd_v4f64:
353 ; SSE_SLOW-NEXT: movapd %xmm0, %xmm2
354 ; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
355 ; SSE_SLOW-NEXT: addsd %xmm0, %xmm2
356 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
357 ; SSE_SLOW-NEXT: movapd %xmm1, %xmm2
358 ; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
359 ; SSE_SLOW-NEXT: addsd %xmm1, %xmm2
360 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0]
361 ; SSE_SLOW-NEXT: retq
363 ; SSE_FAST-LABEL: hadd_v4f64:
365 ; SSE_FAST-NEXT: haddpd %xmm0, %xmm0
366 ; SSE_FAST-NEXT: haddpd %xmm1, %xmm1
367 ; SSE_FAST-NEXT: retq
369 ; AVX-LABEL: hadd_v4f64:
371 ; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
373 %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
374 %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
375 %hop = fadd <4 x double> %a0, %a1
376 %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
377 ret <4 x double> %shuf
380 define <2 x double> @hsub_v2f64(<2 x double> %a) {
381 ; SSE_SLOW-LABEL: hsub_v2f64:
383 ; SSE_SLOW-NEXT: movapd %xmm0, %xmm1
384 ; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
385 ; SSE_SLOW-NEXT: subsd %xmm1, %xmm0
386 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
387 ; SSE_SLOW-NEXT: retq
389 ; SSE_FAST-LABEL: hsub_v2f64:
391 ; SSE_FAST-NEXT: hsubpd %xmm0, %xmm0
392 ; SSE_FAST-NEXT: retq
394 ; AVX1_SLOW-LABEL: hsub_v2f64:
395 ; AVX1_SLOW: # %bb.0:
396 ; AVX1_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
397 ; AVX1_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
398 ; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
399 ; AVX1_SLOW-NEXT: retq
401 ; AVX1_FAST-LABEL: hsub_v2f64:
402 ; AVX1_FAST: # %bb.0:
403 ; AVX1_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
404 ; AVX1_FAST-NEXT: retq
406 ; AVX2_SLOW-LABEL: hsub_v2f64:
407 ; AVX2_SLOW: # %bb.0:
408 ; AVX2_SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
409 ; AVX2_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
410 ; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
411 ; AVX2_SLOW-NEXT: retq
413 ; AVX2_FAST-LABEL: hsub_v2f64:
414 ; AVX2_FAST: # %bb.0:
415 ; AVX2_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
416 ; AVX2_FAST-NEXT: retq
417 %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
418 %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
419 %hop = fsub <2 x double> %a0, %a1
420 %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
421 ret <2 x double> %shuf
424 define <4 x double> @hsub_v4f64(<4 x double> %a) {
425 ; SSE_SLOW-LABEL: hsub_v4f64:
427 ; SSE_SLOW-NEXT: movapd %xmm0, %xmm2
428 ; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
429 ; SSE_SLOW-NEXT: subsd %xmm2, %xmm0
430 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
431 ; SSE_SLOW-NEXT: movapd %xmm1, %xmm2
432 ; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
433 ; SSE_SLOW-NEXT: subsd %xmm2, %xmm1
434 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
435 ; SSE_SLOW-NEXT: retq
437 ; SSE_FAST-LABEL: hsub_v4f64:
439 ; SSE_FAST-NEXT: hsubpd %xmm0, %xmm0
440 ; SSE_FAST-NEXT: hsubpd %xmm1, %xmm1
441 ; SSE_FAST-NEXT: retq
443 ; AVX-LABEL: hsub_v4f64:
445 ; AVX-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
447 %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
448 %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
449 %hop = fsub <4 x double> %a0, %a1
450 %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
451 ret <4 x double> %shuf
454 define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
455 ; SSE3-LABEL: hadd_v4i32:
457 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
458 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
459 ; SSE3-NEXT: paddd %xmm1, %xmm0
462 ; SSSE3-LABEL: hadd_v4i32:
464 ; SSSE3-NEXT: phaddd %xmm0, %xmm0
467 ; AVX-LABEL: hadd_v4i32:
469 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
471 %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
472 %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
473 %hop = add <4 x i32> %a02, %a13
474 %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 1>
478 define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
479 ; SSE3-LABEL: hadd_v8i32a:
481 ; SSE3-NEXT: movaps %xmm0, %xmm2
482 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
483 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
484 ; SSE3-NEXT: paddd %xmm0, %xmm2
485 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
486 ; SSE3-NEXT: movdqa %xmm2, %xmm1
489 ; SSSE3_SLOW-LABEL: hadd_v8i32a:
490 ; SSSE3_SLOW: # %bb.0:
491 ; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
492 ; SSSE3_SLOW-NEXT: phaddd %xmm1, %xmm2
493 ; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
494 ; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1
495 ; SSSE3_SLOW-NEXT: retq
497 ; SSSE3_FAST-LABEL: hadd_v8i32a:
498 ; SSSE3_FAST: # %bb.0:
499 ; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2
500 ; SSSE3_FAST-NEXT: phaddd %xmm1, %xmm2
501 ; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0
502 ; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1
503 ; SSSE3_FAST-NEXT: retq
505 ; AVX1_SLOW-LABEL: hadd_v8i32a:
506 ; AVX1_SLOW: # %bb.0:
507 ; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
508 ; AVX1_SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
509 ; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
510 ; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
511 ; AVX1_SLOW-NEXT: retq
513 ; AVX1_FAST-LABEL: hadd_v8i32a:
514 ; AVX1_FAST: # %bb.0:
515 ; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
516 ; AVX1_FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm1
517 ; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
518 ; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
519 ; AVX1_FAST-NEXT: retq
521 ; AVX2-LABEL: hadd_v8i32a:
523 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
524 ; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
525 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
527 %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
528 %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
529 %hop = add <4 x i32> %a0, %a1
530 %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
534 define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
535 ; SSE3-LABEL: hadd_v8i32b:
537 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,1,3]
538 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
539 ; SSE3-NEXT: paddd %xmm2, %xmm0
540 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,1,3]
541 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
542 ; SSE3-NEXT: paddd %xmm2, %xmm1
545 ; SSSE3-LABEL: hadd_v8i32b:
547 ; SSSE3-NEXT: phaddd %xmm0, %xmm0
548 ; SSSE3-NEXT: phaddd %xmm1, %xmm1
551 ; AVX1-LABEL: hadd_v8i32b:
553 ; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm1
554 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
555 ; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
556 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
559 ; AVX2-LABEL: hadd_v8i32b:
561 ; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
563 %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
564 %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
565 %hop = add <8 x i32> %a0, %a1
566 %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
570 define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
571 ; SSE3-LABEL: hsub_v4i32:
573 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,1,3]
574 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
575 ; SSE3-NEXT: psubd %xmm1, %xmm0
578 ; SSSE3-LABEL: hsub_v4i32:
580 ; SSSE3-NEXT: phsubd %xmm0, %xmm0
583 ; AVX-LABEL: hsub_v4i32:
585 ; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
587 %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
588 %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
589 %hop = sub <4 x i32> %a02, %a13
590 %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 undef>
594 define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
595 ; SSE3-LABEL: hsub_v8i32a:
597 ; SSE3-NEXT: movaps %xmm0, %xmm2
598 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
599 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
600 ; SSE3-NEXT: psubd %xmm0, %xmm2
601 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
602 ; SSE3-NEXT: movdqa %xmm2, %xmm1
605 ; SSSE3_SLOW-LABEL: hsub_v8i32a:
606 ; SSSE3_SLOW: # %bb.0:
607 ; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
608 ; SSSE3_SLOW-NEXT: phsubd %xmm1, %xmm2
609 ; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
610 ; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1
611 ; SSSE3_SLOW-NEXT: retq
613 ; SSSE3_FAST-LABEL: hsub_v8i32a:
614 ; SSSE3_FAST: # %bb.0:
615 ; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2
616 ; SSSE3_FAST-NEXT: phsubd %xmm1, %xmm2
617 ; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0
618 ; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1
619 ; SSSE3_FAST-NEXT: retq
621 ; AVX1_SLOW-LABEL: hsub_v8i32a:
622 ; AVX1_SLOW: # %bb.0:
623 ; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
624 ; AVX1_SLOW-NEXT: vphsubd %xmm1, %xmm0, %xmm0
625 ; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
626 ; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
627 ; AVX1_SLOW-NEXT: retq
629 ; AVX1_FAST-LABEL: hsub_v8i32a:
630 ; AVX1_FAST: # %bb.0:
631 ; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
632 ; AVX1_FAST-NEXT: vphsubd %xmm1, %xmm0, %xmm1
633 ; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
634 ; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
635 ; AVX1_FAST-NEXT: retq
637 ; AVX2-LABEL: hsub_v8i32a:
639 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
640 ; AVX2-NEXT: vphsubd %xmm1, %xmm0, %xmm0
641 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
643 %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
644 %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
645 %hop = sub <4 x i32> %a0, %a1
646 %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
650 define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
651 ; SSE3-LABEL: hsub_v8i32b:
653 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,1,3]
654 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
655 ; SSE3-NEXT: psubd %xmm2, %xmm0
656 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,1,3]
657 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
658 ; SSE3-NEXT: psubd %xmm2, %xmm1
661 ; SSSE3-LABEL: hsub_v8i32b:
663 ; SSSE3-NEXT: phsubd %xmm0, %xmm0
664 ; SSSE3-NEXT: phsubd %xmm1, %xmm1
667 ; AVX1-LABEL: hsub_v8i32b:
669 ; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm1
670 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
671 ; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0
672 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
675 ; AVX2-LABEL: hsub_v8i32b:
677 ; AVX2-NEXT: vphsubd %ymm0, %ymm0, %ymm0
679 %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
680 %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
681 %hop = sub <8 x i32> %a0, %a1
682 %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
686 define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
687 ; SSE3-LABEL: hadd_v8i16:
689 ; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
690 ; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
691 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
692 ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
693 ; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
694 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
695 ; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
696 ; SSE3-NEXT: paddw %xmm1, %xmm0
699 ; SSSE3-LABEL: hadd_v8i16:
701 ; SSSE3-NEXT: phaddw %xmm0, %xmm0
704 ; AVX-LABEL: hadd_v8i16:
706 ; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
708 %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
709 %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
710 %hop = add <8 x i16> %a0246, %a1357
711 %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
715 define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
716 ; SSE3-LABEL: hadd_v16i16a:
718 ; SSE3-NEXT: movdqa %xmm1, %xmm3
719 ; SSE3-NEXT: pslld $16, %xmm3
720 ; SSE3-NEXT: psrad $16, %xmm3
721 ; SSE3-NEXT: movdqa %xmm0, %xmm2
722 ; SSE3-NEXT: pslld $16, %xmm2
723 ; SSE3-NEXT: psrad $16, %xmm2
724 ; SSE3-NEXT: packssdw %xmm3, %xmm2
725 ; SSE3-NEXT: psrad $16, %xmm1
726 ; SSE3-NEXT: psrad $16, %xmm0
727 ; SSE3-NEXT: packssdw %xmm1, %xmm0
728 ; SSE3-NEXT: paddw %xmm0, %xmm2
729 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
730 ; SSE3-NEXT: movdqa %xmm2, %xmm1
733 ; SSSE3_SLOW-LABEL: hadd_v16i16a:
734 ; SSSE3_SLOW: # %bb.0:
735 ; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
736 ; SSSE3_SLOW-NEXT: phaddw %xmm1, %xmm2
737 ; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
738 ; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1
739 ; SSSE3_SLOW-NEXT: retq
741 ; SSSE3_FAST-LABEL: hadd_v16i16a:
742 ; SSSE3_FAST: # %bb.0:
743 ; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2
744 ; SSSE3_FAST-NEXT: phaddw %xmm1, %xmm2
745 ; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0
746 ; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1
747 ; SSSE3_FAST-NEXT: retq
749 ; AVX1_SLOW-LABEL: hadd_v16i16a:
750 ; AVX1_SLOW: # %bb.0:
751 ; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
752 ; AVX1_SLOW-NEXT: vphaddw %xmm1, %xmm0, %xmm0
753 ; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
754 ; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
755 ; AVX1_SLOW-NEXT: retq
757 ; AVX1_FAST-LABEL: hadd_v16i16a:
758 ; AVX1_FAST: # %bb.0:
759 ; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
760 ; AVX1_FAST-NEXT: vphaddw %xmm1, %xmm0, %xmm1
761 ; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
762 ; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
763 ; AVX1_FAST-NEXT: retq
765 ; AVX2-LABEL: hadd_v16i16a:
767 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
768 ; AVX2-NEXT: vphaddw %xmm1, %xmm0, %xmm0
769 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
771 %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
772 %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
773 %hop = add <8 x i16> %a0, %a1
774 %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
778 define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
779 ; SSE3-LABEL: hadd_v16i16b:
781 ; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,1,3,4,5,6,7]
782 ; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
783 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
784 ; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
785 ; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
786 ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
787 ; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
788 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
789 ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
790 ; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
791 ; SSE3-NEXT: paddw %xmm2, %xmm0
792 ; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[3,1,1,3,4,5,6,7]
793 ; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
794 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
795 ; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
796 ; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
797 ; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
798 ; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,4]
799 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
800 ; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
801 ; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,4,5]
802 ; SSE3-NEXT: paddw %xmm2, %xmm1
805 ; SSSE3-LABEL: hadd_v16i16b:
807 ; SSSE3-NEXT: phaddw %xmm0, %xmm0
808 ; SSSE3-NEXT: phaddw %xmm1, %xmm1
811 ; AVX1-LABEL: hadd_v16i16b:
813 ; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm1
814 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
815 ; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
816 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
819 ; AVX2-LABEL: hadd_v16i16b:
821 ; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0
823 %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
824 %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
825 %hop = add <16 x i16> %a0, %a1
826 %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
830 define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
831 ; SSE3-LABEL: hsub_v8i16:
833 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
834 ; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7]
835 ; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
836 ; SSE3-NEXT: psubw %xmm1, %xmm0
839 ; SSSE3-LABEL: hsub_v8i16:
841 ; SSSE3-NEXT: phsubw %xmm0, %xmm0
844 ; AVX-LABEL: hsub_v8i16:
846 ; AVX-NEXT: vphsubw %xmm0, %xmm0, %xmm0
848 %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
849 %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
850 %hop = sub <8 x i16> %a0246, %a1357
851 %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 1, i32 undef, i32 3>
; v16i16 input, v8i16 hsub, result re-widened by a duplicating shuffle.  The
; shuffle after the sub should fold into the horizontal op (single phsubw /
; vphsubw plus a lane shuffle); fast-hops variants may instead emit two hops.
855 define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
856 ; SSE3-LABEL: hsub_v16i16a:
858 ; SSE3-NEXT: movdqa %xmm1, %xmm3
859 ; SSE3-NEXT: pslld $16, %xmm3
860 ; SSE3-NEXT: psrad $16, %xmm3
861 ; SSE3-NEXT: movdqa %xmm0, %xmm2
862 ; SSE3-NEXT: pslld $16, %xmm2
863 ; SSE3-NEXT: psrad $16, %xmm2
864 ; SSE3-NEXT: packssdw %xmm3, %xmm2
865 ; SSE3-NEXT: psrad $16, %xmm1
866 ; SSE3-NEXT: psrad $16, %xmm0
867 ; SSE3-NEXT: packssdw %xmm1, %xmm0
868 ; SSE3-NEXT: psubw %xmm0, %xmm2
869 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
870 ; SSE3-NEXT: movdqa %xmm2, %xmm1
873 ; SSSE3_SLOW-LABEL: hsub_v16i16a:
874 ; SSSE3_SLOW: # %bb.0:
875 ; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
876 ; SSSE3_SLOW-NEXT: phsubw %xmm1, %xmm2
877 ; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
878 ; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1
879 ; SSSE3_SLOW-NEXT: retq
881 ; SSSE3_FAST-LABEL: hsub_v16i16a:
882 ; SSSE3_FAST: # %bb.0:
883 ; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2
884 ; SSSE3_FAST-NEXT: phsubw %xmm1, %xmm2
885 ; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0
886 ; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1
887 ; SSSE3_FAST-NEXT: retq
889 ; AVX1_SLOW-LABEL: hsub_v16i16a:
890 ; AVX1_SLOW: # %bb.0:
891 ; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
892 ; AVX1_SLOW-NEXT: vphsubw %xmm1, %xmm0, %xmm0
893 ; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
894 ; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
895 ; AVX1_SLOW-NEXT: retq
897 ; AVX1_FAST-LABEL: hsub_v16i16a:
898 ; AVX1_FAST: # %bb.0:
899 ; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
900 ; AVX1_FAST-NEXT: vphsubw %xmm1, %xmm0, %xmm1
901 ; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
902 ; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
903 ; AVX1_FAST-NEXT: retq
905 ; AVX2-LABEL: hsub_v16i16a:
907 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
908 ; AVX2-NEXT: vphsubw %xmm1, %xmm0, %xmm0
909 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
911 %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
912 %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
913 %hop = sub <8 x i16> %a0, %a1
914 %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
; Full v16i16 hsub expressed as even/odd shuffles + sub, with a 128-bit-lane
; duplicating result shuffle.  Should match a per-lane phsubw on SSSE3, two
; vphsubw + vinsertf128 on AVX1, and a single 256-bit vphsubw on AVX2.
918 define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
919 ; SSE3-LABEL: hsub_v16i16b:
921 ; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,1,3,4,5,6,7]
922 ; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
923 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
924 ; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
925 ; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
926 ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
927 ; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
928 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
929 ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
930 ; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
931 ; SSE3-NEXT: psubw %xmm2, %xmm0
932 ; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[3,1,1,3,4,5,6,7]
933 ; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
934 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
935 ; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
936 ; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
937 ; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
938 ; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,4]
939 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
940 ; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
941 ; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,4,5]
942 ; SSE3-NEXT: psubw %xmm2, %xmm1
945 ; SSSE3-LABEL: hsub_v16i16b:
947 ; SSSE3-NEXT: phsubw %xmm0, %xmm0
948 ; SSSE3-NEXT: phsubw %xmm1, %xmm1
951 ; AVX1-LABEL: hsub_v16i16b:
953 ; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm1
954 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
955 ; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0
956 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
959 ; AVX2-LABEL: hsub_v16i16b:
961 ; AVX2-NEXT: vphsubw %ymm0, %ymm0, %ymm0
963 %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
964 %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
965 %hop = sub <16 x i16> %a0, %a1
966 %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
; Splat of lane 0 of a haddps intrinsic result: the zeroinitializer shuffle
; should become movsldup/vmovsldup (or vbroadcastss on AVX2), not a generic
; shuffle, with the haddps kept as-is.
970 define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
971 ; SSE-LABEL: broadcast_haddps_v4f32:
973 ; SSE-NEXT: haddps %xmm0, %xmm0
974 ; SSE-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
977 ; AVX1-LABEL: broadcast_haddps_v4f32:
979 ; AVX1-NEXT: vhaddps %xmm0, %xmm0, %xmm0
980 ; AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
983 ; AVX2-LABEL: broadcast_haddps_v4f32:
985 ; AVX2-NEXT: vhaddps %xmm0, %xmm0, %xmm0
986 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
988 %1 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a0)
989 %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
993 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
; PR34724: partial horizontal add built from v2f32 sub-adds plus a separate
; fadd of %b's own adjacent lanes; the whole pattern should collapse to one
; haddps %xmm1, %xmm0.
995 define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
996 ; SSE-LABEL: PR34724_1:
998 ; SSE-NEXT: haddps %xmm1, %xmm0
1001 ; AVX-LABEL: PR34724_1:
1003 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1005 %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
1006 %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5>
1007 %t2 = fadd <2 x float> %t0, %t1
1008 %vecinit9 = shufflevector <2 x float> %t2, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
1009 %t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
1010 %t4 = fadd <4 x float> %t3, %b
1011 %vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
1012 ret <4 x float> %vecinit13
; Same as PR34724_1 but the first sub-add is expressed with v4f32 shuffles
; (undef upper lanes) instead of v2f32; still expects a single haddps.
1015 define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
1016 ; SSE-LABEL: PR34724_2:
1018 ; SSE-NEXT: haddps %xmm1, %xmm0
1021 ; AVX-LABEL: PR34724_2:
1023 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1025 %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
1026 %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 5, i32 undef, i32 undef>
1027 %t2 = fadd <4 x float> %t0, %t1
1028 %vecinit9 = shufflevector <4 x float> %t2, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
1029 %t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
1030 %t4 = fadd <4 x float> %t3, %b
1031 %vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
1032 ret <4 x float> %vecinit13
1036 ; fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
1037 ; --> SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))).
; HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X))) fold: the pre-shuffle
; of %a0 should move after the hadd, giving haddps + movshdup.
1040 define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) {
1041 ; SSE-LABEL: hadd_4f32_v8f32_shuffle:
1043 ; SSE-NEXT: haddps %xmm1, %xmm0
1044 ; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1047 ; AVX-LABEL: hadd_4f32_v8f32_shuffle:
1049 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
1050 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1051 ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1052 ; AVX-NEXT: vzeroupper
1054 %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
1055 %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1056 %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1057 %hadd0 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1058 %hadd1 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1059 %hadd = fadd <4 x float> %hadd0, %hadd1
1060 ret <4 x float> %hadd
; Same subvector-shuffle fold as hadd_4f32_v8f32_shuffle.
; NOTE(review): despite the "hsub" name, the op below is fadd and the expected
; output is haddps (not hsubps) — looks like a copy of the hadd test; confirm
; whether an fsub variant was intended.
1063 define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) {
1064 ; SSE-LABEL: hsub_4f32_v8f32_shuffle:
1066 ; SSE-NEXT: haddps %xmm1, %xmm0
1067 ; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1070 ; AVX-LABEL: hsub_4f32_v8f32_shuffle:
1072 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
1073 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1074 ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1075 ; AVX-NEXT: vzeroupper
1077 %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
1078 %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1079 %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1080 %hsub0 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1081 %hsub1 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1082 %hsub = fadd <4 x float> %hsub0, %hsub1
1083 ret <4 x float> %hsub
; Integer version of the subvector-shuffle fold: expects phaddd + pshufd on
; SSSE3/AVX; SSE3 (no phaddd) falls back to shufps + paddd.
1086 define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) {
1087 ; SSE3-LABEL: hadd_4i32_v8i32_shuffle:
1089 ; SSE3-NEXT: movaps %xmm0, %xmm2
1090 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2]
1091 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
1092 ; SSE3-NEXT: paddd %xmm2, %xmm0
1095 ; SSSE3-LABEL: hadd_4i32_v8i32_shuffle:
1097 ; SSSE3-NEXT: phaddd %xmm1, %xmm0
1098 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1101 ; AVX1-LABEL: hadd_4i32_v8i32_shuffle:
1103 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1104 ; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
1105 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1106 ; AVX1-NEXT: vzeroupper
1109 ; AVX2-LABEL: hadd_4i32_v8i32_shuffle:
1111 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1112 ; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
1113 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1114 ; AVX2-NEXT: vzeroupper
1116 %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
1117 %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1118 %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1119 %hadd0 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1120 %hadd1 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1121 %hadd = add <4 x i32> %hadd0, %hadd1
; Same subvector-shuffle fold as hadd_4i32_v8i32_shuffle.
; NOTE(review): despite the "hsub" name, the op below is add and the expected
; output is phaddd/paddd — confirm whether a sub variant was intended.
1125 define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) {
1126 ; SSE3-LABEL: hsub_4i32_v8i32_shuffle:
1128 ; SSE3-NEXT: movaps %xmm0, %xmm2
1129 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2]
1130 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
1131 ; SSE3-NEXT: paddd %xmm2, %xmm0
1134 ; SSSE3-LABEL: hsub_4i32_v8i32_shuffle:
1136 ; SSSE3-NEXT: phaddd %xmm1, %xmm0
1137 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1140 ; AVX1-LABEL: hsub_4i32_v8i32_shuffle:
1142 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1143 ; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
1144 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1145 ; AVX1-NEXT: vzeroupper
1148 ; AVX2-LABEL: hsub_4i32_v8i32_shuffle:
1150 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1151 ; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
1152 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1153 ; AVX2-NEXT: vzeroupper
1155 %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
1156 %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1157 %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1158 %hsub0 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1159 %hsub1 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1160 %hsub = add <4 x i32> %hsub0, %hsub1
1165 ; fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) --> SHUFFLE(HOP(X,Y)).
; HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)) fold: the lane-interleave
; shuffles should move after the haddpd (vpermpd on AVX2).
1168 define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
1169 ; SSE-LABEL: hadd_4f64_v4f64_shuffle:
1171 ; SSE-NEXT: haddpd %xmm1, %xmm0
1172 ; SSE-NEXT: haddpd %xmm3, %xmm2
1173 ; SSE-NEXT: movapd %xmm2, %xmm1
1176 ; AVX1-LABEL: hadd_4f64_v4f64_shuffle:
1178 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
1179 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1180 ; AVX1-NEXT: vhaddpd %ymm0, %ymm2, %ymm0
1183 ; AVX2-LABEL: hadd_4f64_v4f64_shuffle:
1185 ; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
1186 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1188 %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1189 %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1190 %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
1191 %hadd1 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
1192 %hadd = fadd <4 x double> %hadd0, %hadd1
1193 ret <4 x double> %hadd
; fsub version of the HOP(SHUFFLE,SHUFFLE) fold: expects hsubpd/vhsubpd.
; (Locals are still named %hadd* — copied from the hadd test; op is fsub.)
1196 define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
1197 ; SSE-LABEL: hsub_4f64_v4f64_shuffle:
1199 ; SSE-NEXT: hsubpd %xmm1, %xmm0
1200 ; SSE-NEXT: hsubpd %xmm3, %xmm2
1201 ; SSE-NEXT: movapd %xmm2, %xmm1
1204 ; AVX1-LABEL: hsub_4f64_v4f64_shuffle:
1206 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
1207 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1208 ; AVX1-NEXT: vhsubpd %ymm0, %ymm2, %ymm0
1211 ; AVX2-LABEL: hsub_4f64_v4f64_shuffle:
1213 ; AVX2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
1214 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1216 %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1217 %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1218 %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
1219 %hadd1 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
1220 %hadd = fsub <4 x double> %hadd0, %hadd1
1221 ret <4 x double> %hadd
; v8f32 HOP(SHUFFLE,SHUFFLE) fold: expects one 256-bit vhaddps (AVX2) with the
; lane shuffle moved after the op.
1224 define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
1225 ; SSE-LABEL: hadd_8f32_v8f32_shuffle:
1227 ; SSE-NEXT: haddps %xmm1, %xmm0
1228 ; SSE-NEXT: haddps %xmm3, %xmm2
1229 ; SSE-NEXT: movaps %xmm2, %xmm1
1232 ; AVX1-LABEL: hadd_8f32_v8f32_shuffle:
1234 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
1235 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1236 ; AVX1-NEXT: vhaddps %ymm0, %ymm2, %ymm0
1239 ; AVX2-LABEL: hadd_8f32_v8f32_shuffle:
1241 ; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
1242 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1244 %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1245 %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1246 %hadd0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
1247 %hadd1 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
1248 %hadd = fadd <8 x float> %hadd0, %hadd1
1249 ret <8 x float> %hadd
; Same fold as hadd_8f32_v8f32_shuffle.
; NOTE(review): despite the "hsub" name, the op below is fadd and the expected
; output is haddps/vhaddps — confirm whether an fsub variant was intended.
1252 define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
1253 ; SSE-LABEL: hsub_8f32_v8f32_shuffle:
1255 ; SSE-NEXT: haddps %xmm1, %xmm0
1256 ; SSE-NEXT: haddps %xmm3, %xmm2
1257 ; SSE-NEXT: movaps %xmm2, %xmm1
1260 ; AVX1-LABEL: hsub_8f32_v8f32_shuffle:
1262 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
1263 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1264 ; AVX1-NEXT: vhaddps %ymm0, %ymm2, %ymm0
1267 ; AVX2-LABEL: hsub_8f32_v8f32_shuffle:
1269 ; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
1270 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1272 %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1273 %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1274 %hsub0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
1275 %hsub1 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
1276 %hsub = fadd <8 x float> %hsub0, %hsub1
1277 ret <8 x float> %hsub
; v8i32 HOP(SHUFFLE,SHUFFLE) fold: expects phaddd pairs on SSSE3/AVX1 and one
; 256-bit vphaddd + vpermq on AVX2; SSE3 (no phaddd) uses shufps + paddd.
1280 define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
1281 ; SSE3-LABEL: hadd_8i32_v8i32_shuffle:
1283 ; SSE3-NEXT: movaps %xmm2, %xmm4
1284 ; SSE3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
1285 ; SSE3-NEXT: movaps %xmm0, %xmm5
1286 ; SSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
1287 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
1288 ; SSE3-NEXT: paddd %xmm2, %xmm4
1289 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
1290 ; SSE3-NEXT: paddd %xmm5, %xmm0
1291 ; SSE3-NEXT: movdqa %xmm4, %xmm1
1294 ; SSSE3-LABEL: hadd_8i32_v8i32_shuffle:
1296 ; SSSE3-NEXT: phaddd %xmm1, %xmm0
1297 ; SSSE3-NEXT: phaddd %xmm3, %xmm2
1298 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1301 ; AVX1-LABEL: hadd_8i32_v8i32_shuffle:
1303 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1304 ; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1
1305 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1306 ; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0
1307 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1310 ; AVX2-LABEL: hadd_8i32_v8i32_shuffle:
1312 ; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
1313 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1315 %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1316 %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1317 %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
1318 %hadd1 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
1319 %hadd = add <8 x i32> %hadd0, %hadd1
; sub version of the v8i32 HOP(SHUFFLE,SHUFFLE) fold: expects phsubd/vphsubd.
; (Locals are still named %hadd* — copied from the hadd test; op is sub.)
1323 define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
1324 ; SSE3-LABEL: hsub_8i32_v8i32_shuffle:
1326 ; SSE3-NEXT: movaps %xmm2, %xmm4
1327 ; SSE3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
1328 ; SSE3-NEXT: movaps %xmm0, %xmm5
1329 ; SSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
1330 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
1331 ; SSE3-NEXT: psubd %xmm2, %xmm4
1332 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
1333 ; SSE3-NEXT: psubd %xmm0, %xmm5
1334 ; SSE3-NEXT: movdqa %xmm5, %xmm0
1335 ; SSE3-NEXT: movdqa %xmm4, %xmm1
1338 ; SSSE3-LABEL: hsub_8i32_v8i32_shuffle:
1340 ; SSSE3-NEXT: phsubd %xmm1, %xmm0
1341 ; SSSE3-NEXT: phsubd %xmm3, %xmm2
1342 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1345 ; AVX1-LABEL: hsub_8i32_v8i32_shuffle:
1347 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1348 ; AVX1-NEXT: vphsubd %xmm2, %xmm1, %xmm1
1349 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1350 ; AVX1-NEXT: vphsubd %xmm2, %xmm0, %xmm0
1351 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1354 ; AVX2-LABEL: hsub_8i32_v8i32_shuffle:
1356 ; AVX2-NEXT: vphsubd %ymm1, %ymm0, %ymm0
1357 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1359 %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1360 %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1361 %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
1362 %hadd1 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
1363 %hadd = sub <8 x i32> %hadd0, %hadd1
; v16i16 HOP(SHUFFLE,SHUFFLE) fold: expects phaddw pairs on SSSE3/AVX1 and one
; 256-bit vphaddw + vpermq on AVX2; SSE3 (no phaddw) needs a long
; sign-extend/pack sequence to de-interleave the lanes before paddw.
1367 define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) {
1368 ; SSE3-LABEL: hadd_16i16_16i16_shuffle:
1370 ; SSE3-NEXT: movdqa %xmm3, %xmm5
1371 ; SSE3-NEXT: pslld $16, %xmm5
1372 ; SSE3-NEXT: psrad $16, %xmm5
1373 ; SSE3-NEXT: movdqa %xmm2, %xmm4
1374 ; SSE3-NEXT: pslld $16, %xmm4
1375 ; SSE3-NEXT: psrad $16, %xmm4
1376 ; SSE3-NEXT: packssdw %xmm5, %xmm4
1377 ; SSE3-NEXT: movdqa %xmm1, %xmm5
1378 ; SSE3-NEXT: pslld $16, %xmm5
1379 ; SSE3-NEXT: psrad $16, %xmm5
1380 ; SSE3-NEXT: movdqa %xmm0, %xmm6
1381 ; SSE3-NEXT: pslld $16, %xmm6
1382 ; SSE3-NEXT: psrad $16, %xmm6
1383 ; SSE3-NEXT: packssdw %xmm5, %xmm6
1384 ; SSE3-NEXT: psrad $16, %xmm3
1385 ; SSE3-NEXT: psrad $16, %xmm2
1386 ; SSE3-NEXT: packssdw %xmm3, %xmm2
1387 ; SSE3-NEXT: paddw %xmm2, %xmm4
1388 ; SSE3-NEXT: psrad $16, %xmm1
1389 ; SSE3-NEXT: psrad $16, %xmm0
1390 ; SSE3-NEXT: packssdw %xmm1, %xmm0
1391 ; SSE3-NEXT: paddw %xmm6, %xmm0
1392 ; SSE3-NEXT: movdqa %xmm4, %xmm1
1395 ; SSSE3-LABEL: hadd_16i16_16i16_shuffle:
1397 ; SSSE3-NEXT: phaddw %xmm1, %xmm0
1398 ; SSSE3-NEXT: phaddw %xmm3, %xmm2
1399 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1402 ; AVX1-LABEL: hadd_16i16_16i16_shuffle:
1404 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1405 ; AVX1-NEXT: vphaddw %xmm2, %xmm1, %xmm1
1406 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1407 ; AVX1-NEXT: vphaddw %xmm2, %xmm0, %xmm0
1408 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1411 ; AVX2-LABEL: hadd_16i16_16i16_shuffle:
1413 ; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
1414 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1416 %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
1417 %shuf1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1418 %hadd0 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
1419 %hadd1 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
1420 %hadd = add <16 x i16> %hadd0, %hadd1
1421 ret <16 x i16> %hadd