; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2
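
; Tests for horizontal add (integer phadd and FP hadd/haddpd) patterns whose
; operands and/or result lanes are reversed via shufflevector.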

define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddw %xmm1, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_reverse_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 7, i32 5, i32 3, i32 1, i32 15, i32 13, i32 11, i32 9>
  %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 6, i32 4, i32 2, i32 0, i32 14, i32 12, i32 10, i32 8>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT:    pshufb %xmm2, %xmm0
; SSE-NEXT:    pshufb %xmm2, %xmm1
; SSE-NEXT:    phaddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_reverse2_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %rhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

define <8 x float> @hadd_reverse_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0,3,2]
; SSE-NEXT:    haddps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %lhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 7, i32 5, i32 15, i32 13, i32 3, i32 1, i32 11, i32 9>
  %rhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 6, i32 4, i32 14, i32 12, i32 2, i32 0, i32 10, i32 8>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

define <8 x float> @hadd_reverse2_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse2_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0,3,2]
; SSE-NEXT:    haddps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %rhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse3_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT:    haddps %xmm0, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse3_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse3_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %shuf0, %shuf1
  %shuf2 = shufflevector <8 x float> %add, <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x float> %shuf2
}

define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddw %xmm3, %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4]
; SSE-NEXT:    phaddw %xmm2, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %lhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

define <16 x i16> @hadd_reverse2_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT:    pshufb %xmm0, %xmm4
; SSE-NEXT:    pshufb %xmm0, %xmm1
; SSE-NEXT:    pshufb %xmm0, %xmm2
; SSE-NEXT:    phaddw %xmm2, %xmm4
; SSE-NEXT:    pshufb %xmm0, %xmm3
; SSE-NEXT:    phaddw %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vphaddw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x i16> %a1, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
  %rhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm8
; SSE-NEXT:    movapd %xmm0, %xmm9
; SSE-NEXT:    haddpd %xmm7, %xmm3
; SSE-NEXT:    haddpd %xmm6, %xmm2
; SSE-NEXT:    haddpd %xmm5, %xmm8
; SSE-NEXT:    haddpd %xmm4, %xmm9
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vmovapd %ymm3, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT:    vmovapd %ymm3, %ymm0
; AVX2-NEXT:    retq
  %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 7, i32 15, i32 5, i32 13, i32 3, i32 11, i32 1, i32 9>
  %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 6, i32 14, i32 4, i32 12, i32 2, i32 10, i32 0, i32 8>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

define <8 x double> @hadd_reverse2_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm8
; SSE-NEXT:    movapd %xmm0, %xmm9
; SSE-NEXT:    haddpd %xmm7, %xmm3
; SSE-NEXT:    haddpd %xmm6, %xmm2
; SSE-NEXT:    haddpd %xmm5, %xmm8
; SSE-NEXT:    haddpd %xmm4, %xmm9
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vmovapd %ymm3, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT:    vmovapd %ymm3, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %rhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm8
; SSE-NEXT:    haddps %xmm3, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT:    haddps %xmm7, %xmm6
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
; SSE-NEXT:    haddps %xmm1, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,2,1,0]
; SSE-NEXT:    haddps %xmm5, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,2,1,0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm1
; SSE-NEXT:    movaps %xmm8, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vhaddps %ymm0, %ymm4, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1]
; AVX2-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1]
; AVX2-NEXT:    vmovaps %ymm3, %ymm0
; AVX2-NEXT:    retq
  %lhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}

define <16 x float> @hadd_reverse2_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm8
; SSE-NEXT:    movaps %xmm0, %xmm9
; SSE-NEXT:    haddps %xmm7, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT:    haddps %xmm6, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
; SSE-NEXT:    haddps %xmm5, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,0,3,2]
; SSE-NEXT:    haddps %xmm4, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,0,3,2]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm8, %xmm2
; SSE-NEXT:    movaps %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm3 = ymm1[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    vmovaps %ymm3, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT:    vmovaps %ymm3, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
  %rhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 1, i32 3, i32 17, i32 19, i32 5, i32 7, i32 21, i32 23, i32 9, i32 11, i32 25, i32 27, i32 13, i32 15, i32 29, i32 31>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}