; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST

; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
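;
; For reference, the shape of the pattern under test, sketched for v4f32 (an
; illustrative fragment only; the value names are not taken from any test
; below): the even and odd lanes are split by two shuffles, combined with the
; binop, and the trailing shuffle of that result is what should fold into the
; horizontal op:
;   %even = shufflevector <4 x float> %x, <4 x float> undef, <2 x i32> <i32 0, i32 2>
;   %odd  = shufflevector <4 x float> %x, <4 x float> undef, <2 x i32> <i32 1, i32 3>
;   %sum  = fadd <2 x float> %even, %odd   ; expected to lower to haddps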
define <4 x float> @hadd_v4f32(<4 x float> %a) {
; SSSE3-LABEL: hadd_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fadd <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  ret <4 x float> %shuf
}

define <8 x float> @hadd_v8f32a(<8 x float> %a) {
; SSSE3-LABEL: hadd_v8f32a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    haddps %xmm1, %xmm2
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3-NEXT:    movaps %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v8f32a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v8f32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fadd <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}

define <8 x float> @hadd_v8f32b(<8 x float> %a) {
; SSSE3-LABEL: hadd_v8f32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm0, %xmm0
; SSSE3-NEXT:    haddps %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v8f32b:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fadd <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}

define <4 x float> @hsub_v4f32(<4 x float> %a) {
; SSSE3-LABEL: hsub_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    hsubps %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fsub <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %shuf
}

define <8 x float> @hsub_v8f32a(<8 x float> %a) {
; SSSE3-LABEL: hsub_v8f32a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    hsubps %xmm1, %xmm2
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3-NEXT:    movaps %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v8f32a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v8f32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fsub <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}

define <8 x float> @hsub_v8f32b(<8 x float> %a) {
; SSSE3-LABEL: hsub_v8f32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    hsubps %xmm0, %xmm0
; SSSE3-NEXT:    hsubps %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v8f32b:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fsub <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}

define <2 x double> @hadd_v2f64(<2 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v2f64:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v2f64:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v2f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v2f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v2f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v2f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fadd <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuf
}

define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v2f64_scalar_splat:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v2f64_scalar_splat:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v2f64_scalar_splat:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v2f64_scalar_splat:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v2f64_scalar_splat:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v2f64_scalar_splat:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = extractelement <2 x double> %a, i32 0
  %a1 = extractelement <2 x double> %a, i32 1
  %hop = fadd double %a0, %a1
  %ins = insertelement <2 x double> undef, double %hop, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuf
}

define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_splat:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm3
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSSE3_SLOW-NEXT:    addsd %xmm1, %xmm3
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm3[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v4f64_scalar_splat:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    haddpd %xmm1, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX-LABEL: hadd_v4f64_scalar_splat:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %hop0 = fadd double %a0, %a1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %hop1 = fadd double %a2, %a3
  %ins = insertelement <4 x double> undef, double %hop0, i32 0
  %ins2 = insertelement <4 x double> %ins, double %hop1, i32 2
  %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movapd %xmm0, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2_FAST-NEXT:    retq
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %hop0 = fadd double %a0, %a1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %hop1 = fadd double %a2, %a3
  %ins = insertelement <4 x double> undef, double %hop0, i32 0
  %ins2 = insertelement <4 x double> %ins, double %hop1, i32 2
  %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x double> %shuf
}

define <4 x double> @hadd_v4f64(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v4f64:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSSE3_SLOW-NEXT:    addsd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v4f64:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    haddpd %xmm1, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v4f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX1_SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v4f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v4f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX2_SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v4f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fadd <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

define <2 x double> @hsub_v2f64(<2 x double> %a) {
; SSSE3_SLOW-LABEL: hsub_v2f64:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT:    subsd %xmm1, %xmm0
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hsub_v2f64:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v2f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v2f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hsub_v2f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hsub_v2f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fsub <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  ret <2 x double> %shuf
}

define <4 x double> @hsub_v4f64(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hsub_v4f64:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSSE3_SLOW-NEXT:    subsd %xmm2, %xmm0
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSSE3_SLOW-NEXT:    subsd %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hsub_v4f64:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    hsubpd %xmm1, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v4f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX1_SLOW-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v4f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hsub_v4f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX2_SLOW-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hsub_v4f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fsub <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
; SSSE3-LABEL: hadd_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = add <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 1>
  ret <4 x i32> %shuf
}

define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
; SSSE3-LABEL: hadd_v8i32a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    phaddd %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v8i32a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v8i32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = add <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}

define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
; SSSE3-LABEL: hadd_v8i32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    phaddd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v8i32b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v8i32b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = add <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}

define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
; SSSE3-LABEL: hsub_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = sub <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 undef>
  ret <4 x i32> %shuf
}

define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
; SSSE3-LABEL: hsub_v8i32a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    phsubd %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v8i32a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v8i32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = sub <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}

define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
; SSSE3-LABEL: hsub_v8i32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    phsubd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v8i32b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v8i32b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = sub <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}

define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
; SSSE3-LABEL: hadd_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = add <8 x i16> %a0246, %a1357
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %shuf
}

define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
; SSSE3-LABEL: hadd_v16i16a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    phaddw %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v16i16a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v16i16a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = add <8 x i16> %a0, %a1
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}

define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
; SSSE3-LABEL: hadd_v16i16b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    phaddw %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v16i16b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v16i16b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = add <16 x i16> %a0, %a1
  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i16> %shuf
}

define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
; SSSE3-LABEL: hsub_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = sub <8 x i16> %a0246, %a1357
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 1, i32 undef, i32 3>
  ret <8 x i16> %shuf
}

define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
; SSSE3-LABEL: hsub_v16i16a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    phsubw %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v16i16a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v16i16a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = sub <8 x i16> %a0, %a1
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}

define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
; SSSE3-LABEL: hsub_v16i16b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm0, %xmm0
; SSSE3-NEXT:    phsubw %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v16i16b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v16i16b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = sub <16 x i16> %a0, %a1
  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i16> %shuf
}

define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
; SSSE3-LABEL: broadcast_haddps_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm0, %xmm0
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: broadcast_haddps_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: broadcast_haddps_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a0)
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %2
}

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)