; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2
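
; Tests that x86 horizontal add instructions (phaddw / haddps / haddpd) are
; still matched when the shuffles feeding the add, and/or the final result,
; are in reversed lane order.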
define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddw %xmm1, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_reverse_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 7, i32 5, i32 3, i32 1, i32 15, i32 13, i32 11, i32 9>
  %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 6, i32 4, i32 2, i32 0, i32 14, i32 12, i32 10, i32 8>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT:    pshufb %xmm2, %xmm0
; SSE-NEXT:    pshufb %xmm2, %xmm1
; SSE-NEXT:    phaddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_reverse2_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %rhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

define <8 x float> @hadd_reverse_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0,3,2]
; SSE-NEXT:    haddps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %lhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 7, i32 5, i32 15, i32 13, i32 3, i32 1, i32 11, i32 9>
  %rhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 6, i32 4, i32 14, i32 12, i32 2, i32 0, i32 10, i32 8>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

define <8 x float> @hadd_reverse2_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse2_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,2],xmm0[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT:    haddps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %rhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse3_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT:    haddps %xmm0, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse3_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse3_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %shuf0, %shuf1
  %shuf2 = shufflevector <8 x float> %add, <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x float> %shuf2
}

define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddw %xmm3, %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4]
; SSE-NEXT:    phaddw %xmm2, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %lhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

define <16 x i16> @hadd_reverse2_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT:    pshufb %xmm0, %xmm4
; SSE-NEXT:    pshufb %xmm0, %xmm1
; SSE-NEXT:    pshufb %xmm0, %xmm2
; SSE-NEXT:    phaddw %xmm2, %xmm4
; SSE-NEXT:    pshufb %xmm0, %xmm3
; SSE-NEXT:    phaddw %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vphaddw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x i16> %a1, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
  %rhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm8
; SSE-NEXT:    movapd %xmm0, %xmm9
; SSE-NEXT:    haddpd %xmm7, %xmm3
; SSE-NEXT:    haddpd %xmm6, %xmm2
; SSE-NEXT:    haddpd %xmm5, %xmm8
; SSE-NEXT:    haddpd %xmm4, %xmm9
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vmovapd %ymm3, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT:    vmovapd %ymm3, %ymm0
; AVX2-NEXT:    retq
  %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 7, i32 15, i32 5, i32 13, i32 3, i32 11, i32 1, i32 9>
  %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 6, i32 14, i32 4, i32 12, i32 2, i32 10, i32 0, i32 8>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

define <8 x double> @hadd_reverse2_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm8
; SSE-NEXT:    movapd %xmm0, %xmm9
; SSE-NEXT:    shufpd {{.*#+}} xmm9 = xmm9[1],xmm0[0]
; SSE-NEXT:    shufpd {{.*#+}} xmm8 = xmm8[1],xmm1[0]
; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm3 = xmm3[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[1,0]
; SSE-NEXT:    haddpd %xmm4, %xmm9
; SSE-NEXT:    shufpd {{.*#+}} xmm5 = xmm5[1,0]
; SSE-NEXT:    haddpd %xmm5, %xmm8
; SSE-NEXT:    shufpd {{.*#+}} xmm6 = xmm6[1,0]
; SSE-NEXT:    haddpd %xmm6, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm7 = xmm7[1,0]
; SSE-NEXT:    haddpd %xmm7, %xmm3
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm1[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,3,2]
; AVX1-NEXT:    vhaddpd %ymm1, %ymm0, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    vhaddpd %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm1[3,2,1,0]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm2[3,2,1,0]
; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm3[3,2,1,0]
; AVX2-NEXT:    vhaddpd %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %rhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm5, %xmm8
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    haddps %xmm2, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT:    haddps %xmm6, %xmm7
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0,3,2]
; SSE-NEXT:    haddps %xmm0, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0,3,2]
; SSE-NEXT:    haddps %xmm4, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,0,3,2]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm7, %xmm1
; SSE-NEXT:    movaps %xmm5, %xmm2
; SSE-NEXT:    movaps %xmm8, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vhaddps %ymm0, %ymm4, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1]
; AVX2-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1]
; AVX2-NEXT:    vmovaps %ymm3, %ymm0
; AVX2-NEXT:    retq
  %lhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}

define <16 x float> @hadd_reverse2_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm8
; SSE-NEXT:    movaps %xmm0, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[3,2],xmm0[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,2],xmm1[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,2,1,0]
; SSE-NEXT:    haddps %xmm4, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,2,1,0]
; SSE-NEXT:    haddps %xmm5, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
; SSE-NEXT:    haddps %xmm6, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[3,2,1,0]
; SSE-NEXT:    haddps %xmm7, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm8, %xmm2
; SSE-NEXT:    movaps %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vhaddps %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm2[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm3[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
  %rhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 1, i32 3, i32 17, i32 19, i32 5, i32 7, i32 21, i32 23, i32 9, i32 11, i32 25, i32 27, i32 13, i32 15, i32 29, i32 31>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}