; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST

; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111

define <4 x float> @hadd_v4f32(<4 x float> %a) {
; SSSE3_SLOW-LABEL: hadd_v4f32:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm1
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSSE3_SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v4f32:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v4f32:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hadd_v4f32:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hadd_v4f32:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hadd_v4f32:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT: retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fadd <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  ret <4 x float> %shuf
}

define <8 x float> @hadd_v8f32a(<8 x float> %a) {
; SSSE3-LABEL: hadd_v8f32a:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movaps %xmm0, %xmm2
; SSSE3-NEXT: haddps %xmm1, %xmm2
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3-NEXT: movaps %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hadd_v8f32a:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_v8f32a:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT: retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fadd <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}

define <8 x float> @hadd_v8f32b(<8 x float> %a) {
; SSSE3_SLOW-LABEL: hadd_v8f32b:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3]
; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm3
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSSE3_SLOW-NEXT: addps %xmm2, %xmm0
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3_SLOW-NEXT: addps %xmm3, %xmm1
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v8f32b:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0
; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v8f32b:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX1_SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hadd_v8f32b:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hadd_v8f32b:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX2_SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hadd_v8f32b:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT: retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fadd <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}

define <4 x float> @hsub_v4f32(<4 x float> %a) {
; SSSE3_SLOW-LABEL: hsub_v4f32:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm1
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSSE3_SLOW-NEXT: subps %xmm0, %xmm1
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hsub_v4f32:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hsub_v4f32:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1_SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hsub_v4f32:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hsub_v4f32:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2_SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hsub_v4f32:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT: retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fsub <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %shuf
}

define <8 x float> @hsub_v8f32a(<8 x float> %a) {
; SSSE3-LABEL: hsub_v8f32a:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movaps %xmm0, %xmm2
; SSSE3-NEXT: hsubps %xmm1, %xmm2
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3-NEXT: movaps %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hsub_v8f32a:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hsub_v8f32a:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT: retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fsub <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}

define <8 x float> @hsub_v8f32b(<8 x float> %a) {
; SSSE3_SLOW-LABEL: hsub_v8f32b:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3]
; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm3
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSSE3_SLOW-NEXT: subps %xmm0, %xmm2
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3_SLOW-NEXT: subps %xmm1, %xmm3
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hsub_v8f32b:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0
; SSSE3_FAST-NEXT: hsubps %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hsub_v8f32b:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX1_SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hsub_v8f32b:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hsub_v8f32b:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX2_SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hsub_v8f32b:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT: retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fsub <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}

define <2 x double> @hadd_v2f64(<2 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v2f64:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v2f64:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v2f64:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hadd_v2f64:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hadd_v2f64:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hadd_v2f64:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT: retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fadd <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuf
}

define <4 x double> @hadd_v4f64(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v4f64:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSSE3_SLOW-NEXT: addpd %xmm1, %xmm3
; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v4f64:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v4f64:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX1_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hadd_v4f64:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hadd_v4f64:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX2_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hadd_v4f64:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT: retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fadd <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

define <2 x double> @hsub_v2f64(<2 x double> %a) {
; SSSE3_SLOW-LABEL: hsub_v2f64:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT: subpd %xmm1, %xmm0
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hsub_v2f64:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hsub_v2f64:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hsub_v2f64:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hsub_v2f64:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hsub_v2f64:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT: retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fsub <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  ret <2 x double> %shuf
}

define <4 x double> @hsub_v4f64(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hsub_v4f64:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSSE3_SLOW-NEXT: subpd %xmm3, %xmm1
; SSSE3_SLOW-NEXT: subpd %xmm2, %xmm0
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hsub_v4f64:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0
; SSSE3_FAST-NEXT: hsubpd %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hsub_v4f64:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX1_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hsub_v4f64:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hsub_v4f64:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX2_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hsub_v4f64:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT: retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fsub <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
; SSSE3_SLOW-LABEL: hadd_v4i32:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSSE3_SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v4i32:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v4i32:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hadd_v4i32:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hadd_v4i32:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hadd_v4i32:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT: retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = add <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 1>
  ret <4 x i32> %shuf
}

define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
; SSSE3-LABEL: hadd_v8i32a:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: phaddd %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hadd_v8i32a:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_v8i32a:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT: retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = add <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}

define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
; SSSE3_SLOW-LABEL: hadd_v8i32b:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSSE3_SLOW-NEXT: paddd %xmm2, %xmm0
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3_SLOW-NEXT: paddd %xmm3, %xmm1
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v8i32b:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3_FAST-NEXT: phaddd %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v8i32b:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1_SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1_SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hadd_v8i32b:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm1
; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hadd_v8i32b:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX2_SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hadd_v8i32b:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT: retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = add <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}

define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
; SSSE3_SLOW-LABEL: hsub_v4i32:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hsub_v4i32:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hsub_v4i32:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hsub_v4i32:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hsub_v4i32:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hsub_v4i32:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT: retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = sub <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 undef>
  ret <4 x i32> %shuf
}

define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
; SSSE3-LABEL: hsub_v8i32a:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: phsubd %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hsub_v8i32a:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hsub_v8i32a:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT: retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = sub <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}

define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
; SSSE3_SLOW-LABEL: hsub_v8i32b:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm3
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hsub_v8i32b:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0
; SSSE3_FAST-NEXT: phsubd %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hsub_v8i32b:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1_SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1_SLOW-NEXT: vpsubd %xmm2, %xmm3, %xmm2
; AVX1_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hsub_v8i32b:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm1
; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hsub_v8i32b:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX2_SLOW-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hsub_v8i32b:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vphsubd %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT: retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = sub <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}

define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
; SSSE3_SLOW-LABEL: hadd_v8i16:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm1
; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSSE3_SLOW-NEXT: paddw %xmm1, %xmm0
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v8i16:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v8i16:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX1_SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hadd_v8i16:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hadd_v8i16:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX2_SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hadd_v8i16:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT: retq
  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = add <8 x i16> %a0246, %a1357
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %shuf
}

define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
; SSSE3-LABEL: hadd_v16i16a:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: phaddw %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hadd_v16i16a:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_v16i16a:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT: retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = add <8 x i16> %a0, %a1
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}

define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
; SSSE3_SLOW-LABEL: hadd_v16i16b:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm3
; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm3
; SSSE3_SLOW-NEXT: movdqa %xmm1, %xmm4
; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm4
; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm0
; SSSE3_SLOW-NEXT: paddw %xmm3, %xmm0
; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm1
; SSSE3_SLOW-NEXT: paddw %xmm4, %xmm1
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v16i16b:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3_FAST-NEXT: phaddw %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v16i16b:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2
; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2
; AVX1_SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hadd_v16i16b:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm1
; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hadd_v16i16b:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX2_SLOW-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hadd_v16i16b:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vphaddw %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT: retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = add <16 x i16> %a0, %a1
  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i16> %shuf
}

define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
; SSSE3_SLOW-LABEL: hsub_v8i16:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm1
; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSSE3_SLOW-NEXT: psubw %xmm0, %xmm1
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hsub_v8i16:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hsub_v8i16:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX1_SLOW-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hsub_v8i16:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hsub_v8i16:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX2_SLOW-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hsub_v8i16:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT: retq
  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = sub <8 x i16> %a0246, %a1357
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 1, i32 undef, i32 3>
  ret <8 x i16> %shuf
}

define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
; SSSE3-LABEL: hsub_v16i16a:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: phsubw %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hsub_v16i16a:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hsub_v16i16a:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphsubw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT: retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = sub <8 x i16> %a0, %a1
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}

define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
; SSSE3_SLOW-LABEL: hsub_v16i16b:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm3
; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm3
; SSSE3_SLOW-NEXT: movdqa %xmm1, %xmm4
; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm4
; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm0
; SSSE3_SLOW-NEXT: psubw %xmm0, %xmm3
; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm1
; SSSE3_SLOW-NEXT: psubw %xmm1, %xmm4
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1]
; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hsub_v16i16b:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0
; SSSE3_FAST-NEXT: phsubw %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hsub_v16i16b:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2
; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vpsubw %xmm0, %xmm2, %xmm0
; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2
; AVX1_SLOW-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hsub_v16i16b:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm1
; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: hsub_v16i16b:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX2_SLOW-NEXT: vpsubw %ymm0, %ymm1, %ymm0
; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: hsub_v16i16b:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vphsubw %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT: retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = sub <16 x i16> %a0, %a1
  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i16> %shuf
}