1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
3 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
5 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
6 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
7 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
; Horizontal add of two <2 x double> inputs written as scalar code:
;   result = [ a[0]+a[1], b[0]+b[1] ]
; The expectations below use the common prefix shared by every run
; configuration: two shuffles of a/b (masks 0,2 and 1,3) feed one vector fadd.
13 define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
14 ; CHECK-LABEL: @test_v2f64(
15 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
16 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
17 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
18 ; CHECK-NEXT: ret <2 x double> [[TMP3]]
; Scalar input: extract every lane, add adjacent pairs, then rebuild the
; result vector lane by lane starting from poison.
20 %a0 = extractelement <2 x double> %a, i32 0
21 %a1 = extractelement <2 x double> %a, i32 1
22 %b0 = extractelement <2 x double> %b, i32 0
23 %b1 = extractelement <2 x double> %b, i32 1
24 %r0 = fadd double %a0, %a1
25 %r1 = fadd double %b0, %b1
26 %r00 = insertelement <2 x double> poison, double %r0, i32 0
27 %r01 = insertelement <2 x double> %r00, double %r1, i32 1
; Horizontal add of two <4 x float> inputs written as scalar code:
;   result = [ a[0]+a[1], a[2]+a[3], b[0]+b[1], b[2]+b[3] ]
; The expectations below use the common prefix shared by every run
; configuration: one shuffle picks the even lanes of a/b (mask 0,2,4,6), one
; picks the odd lanes (mask 1,3,5,7), and a single <4 x float> fadd combines them.
31 define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
32 ; CHECK-LABEL: @test_v4f32(
33 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
34 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
35 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
36 ; CHECK-NEXT: ret <4 x float> [[TMP3]]
; Scalar input: extract every lane, add adjacent pairs (all of a first, then
; all of b), then rebuild the result vector starting from poison.
38 %a0 = extractelement <4 x float> %a, i32 0
39 %a1 = extractelement <4 x float> %a, i32 1
40 %a2 = extractelement <4 x float> %a, i32 2
41 %a3 = extractelement <4 x float> %a, i32 3
42 %b0 = extractelement <4 x float> %b, i32 0
43 %b1 = extractelement <4 x float> %b, i32 1
44 %b2 = extractelement <4 x float> %b, i32 2
45 %b3 = extractelement <4 x float> %b, i32 3
46 %r0 = fadd float %a0, %a1
47 %r1 = fadd float %a2, %a3
48 %r2 = fadd float %b0, %b1
49 %r3 = fadd float %b2, %b3
50 %r00 = insertelement <4 x float> poison, float %r0, i32 0
51 %r01 = insertelement <4 x float> %r00, float %r1, i32 1
52 %r02 = insertelement <4 x float> %r01, float %r2, i32 2
53 %r03 = insertelement <4 x float> %r02, float %r3, i32 3
; Integer counterpart of the <2 x double> case, using add on <2 x i64>:
;   result = [ a[0]+a[1], b[0]+b[1] ]
; The expectations below use the common prefix shared by every run
; configuration: two shuffles of a/b (masks 0,2 and 1,3) feed one vector add.
57 define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
58 ; CHECK-LABEL: @test_v2i64(
59 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
60 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
61 ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
62 ; CHECK-NEXT: ret <2 x i64> [[TMP3]]
; Scalar input: extract every lane, add adjacent pairs, then rebuild the
; result vector starting from poison.
64 %a0 = extractelement <2 x i64> %a, i32 0
65 %a1 = extractelement <2 x i64> %a, i32 1
66 %b0 = extractelement <2 x i64> %b, i32 0
67 %b1 = extractelement <2 x i64> %b, i32 1
68 %r0 = add i64 %a0, %a1
69 %r1 = add i64 %b0, %b1
70 %r00 = insertelement <2 x i64> poison, i64 %r0, i32 0
71 %r01 = insertelement <2 x i64> %r00, i64 %r1, i32 1
; Integer horizontal add of two <4 x i32> inputs written as scalar code:
;   result = [ a[0]+a[1], a[2]+a[3], b[0]+b[1], b[2]+b[3] ]
; The expectations below use the common prefix shared by every run
; configuration: even-lane shuffle (mask 0,2,4,6), odd-lane shuffle
; (mask 1,3,5,7), and a single <4 x i32> add.
75 define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) {
76 ; CHECK-LABEL: @test_v4i32(
77 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
78 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
79 ; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
80 ; CHECK-NEXT: ret <4 x i32> [[TMP3]]
; Scalar input: extract every lane, add adjacent pairs (all of a first, then
; all of b), then rebuild the result vector starting from poison.
82 %a0 = extractelement <4 x i32> %a, i32 0
83 %a1 = extractelement <4 x i32> %a, i32 1
84 %a2 = extractelement <4 x i32> %a, i32 2
85 %a3 = extractelement <4 x i32> %a, i32 3
86 %b0 = extractelement <4 x i32> %b, i32 0
87 %b1 = extractelement <4 x i32> %b, i32 1
88 %b2 = extractelement <4 x i32> %b, i32 2
89 %b3 = extractelement <4 x i32> %b, i32 3
90 %r0 = add i32 %a0, %a1
91 %r1 = add i32 %a2, %a3
92 %r2 = add i32 %b0, %b1
93 %r3 = add i32 %b2, %b3
94 %r00 = insertelement <4 x i32> poison, i32 %r0, i32 0
95 %r01 = insertelement <4 x i32> %r00, i32 %r1, i32 1
96 %r02 = insertelement <4 x i32> %r01, i32 %r2, i32 2
97 %r03 = insertelement <4 x i32> %r02, i32 %r3, i32 3
; Integer horizontal add of two <8 x i16> inputs written as scalar code:
;   result lanes 0-3 = adjacent-pair sums of a, lanes 4-7 = adjacent-pair sums of b.
; The expectations below use the common prefix shared by every run
; configuration: even-lane shuffle (mask 0,2,...,14), odd-lane shuffle
; (mask 1,3,...,15), and a single <8 x i16> add.
101 define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
102 ; CHECK-LABEL: @test_v8i16(
103 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
104 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
105 ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
106 ; CHECK-NEXT: ret <8 x i16> [[TMP3]]
; Scalar input: extract every lane, add adjacent pairs (all of a first, then
; all of b), then rebuild the result vector starting from poison.
108 %a0 = extractelement <8 x i16> %a, i32 0
109 %a1 = extractelement <8 x i16> %a, i32 1
110 %a2 = extractelement <8 x i16> %a, i32 2
111 %a3 = extractelement <8 x i16> %a, i32 3
112 %a4 = extractelement <8 x i16> %a, i32 4
113 %a5 = extractelement <8 x i16> %a, i32 5
114 %a6 = extractelement <8 x i16> %a, i32 6
115 %a7 = extractelement <8 x i16> %a, i32 7
116 %b0 = extractelement <8 x i16> %b, i32 0
117 %b1 = extractelement <8 x i16> %b, i32 1
118 %b2 = extractelement <8 x i16> %b, i32 2
119 %b3 = extractelement <8 x i16> %b, i32 3
120 %b4 = extractelement <8 x i16> %b, i32 4
121 %b5 = extractelement <8 x i16> %b, i32 5
122 %b6 = extractelement <8 x i16> %b, i32 6
123 %b7 = extractelement <8 x i16> %b, i32 7
124 %r0 = add i16 %a0, %a1
125 %r1 = add i16 %a2, %a3
126 %r2 = add i16 %a4, %a5
127 %r3 = add i16 %a6, %a7
128 %r4 = add i16 %b0, %b1
129 %r5 = add i16 %b2, %b3
130 %r6 = add i16 %b4, %b5
131 %r7 = add i16 %b6, %b7
132 %r00 = insertelement <8 x i16> poison, i16 %r0, i32 0
133 %r01 = insertelement <8 x i16> %r00, i16 %r1, i32 1
134 %r02 = insertelement <8 x i16> %r01, i16 %r2, i32 2
135 %r03 = insertelement <8 x i16> %r02, i16 %r3, i32 3
136 %r04 = insertelement <8 x i16> %r03, i16 %r4, i32 4
137 %r05 = insertelement <8 x i16> %r04, i16 %r5, i32 5
138 %r06 = insertelement <8 x i16> %r05, i16 %r6, i32 6
139 %r07 = insertelement <8 x i16> %r06, i16 %r7, i32 7
; Pairwise sums of one <4 x float> input, written to memory instead of being
; returned: f[0]+f[1] is stored at %p and f[2]+f[3] at the next float slot.
; The expectations below use the common prefix shared by every run
; configuration: two 2-lane shuffles of f (masks 1,2 and 0,3), one
; <2 x float> fadd, and a single <2 x float> store with align 4.
144 define void @test_v4f32_v2f32_store(<4 x float> %f, ptr %p){
145 ; CHECK-LABEL: @test_v4f32_v2f32_store(
146 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> poison, <2 x i32> <i32 1, i32 2>
147 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <2 x i32> <i32 0, i32 3>
148 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
149 ; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[P:%.*]], align 4
150 ; CHECK-NEXT: ret void
; Scalar input: first pair summed and stored at %p ...
152 %x0 = extractelement <4 x float> %f, i64 0
153 %x1 = extractelement <4 x float> %f, i64 1
154 %add01 = fadd float %x0, %x1
155 store float %add01, ptr %p, align 4
; ... second pair summed and stored one float past %p.
156 %x2 = extractelement <4 x float> %f, i64 2
157 %x3 = extractelement <4 x float> %f, i64 3
158 %add23 = fadd float %x2, %x3
159 %p23 = getelementptr inbounds float, ptr %p, i64 1
160 store float %add23, ptr %p23, align 4
; Horizontal add of two <4 x double> inputs with a/b results interleaved:
;   result = [ a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3] ]
; Expectations differ per prefix:
;   - SSE and SLM prefixes: four <2 x double> shuffles, two <2 x double> fadds,
;     and a final shuffle that concatenates the halves into <4 x double>.
;   - AVX prefix: two <4 x double> shuffles (masks 0,4,2,6 and 1,5,3,7) and a
;     single <4 x double> fadd.
168 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
169 ; SSE-LABEL: @test_v4f64(
170 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
171 ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
172 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
173 ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
174 ; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
175 ; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
176 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
177 ; SSE-NEXT: ret <4 x double> [[TMP7]]
179 ; SLM-LABEL: @test_v4f64(
180 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
181 ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
182 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
183 ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
184 ; SLM-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
185 ; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
186 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
187 ; SLM-NEXT: ret <4 x double> [[TMP7]]
189 ; AVX-LABEL: @test_v4f64(
190 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
191 ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
192 ; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
193 ; AVX-NEXT: ret <4 x double> [[TMP3]]
; Scalar input: extract every lane, add adjacent pairs alternating between a
; and b, then rebuild the result vector starting from poison.
195 %a0 = extractelement <4 x double> %a, i32 0
196 %a1 = extractelement <4 x double> %a, i32 1
197 %a2 = extractelement <4 x double> %a, i32 2
198 %a3 = extractelement <4 x double> %a, i32 3
199 %b0 = extractelement <4 x double> %b, i32 0
200 %b1 = extractelement <4 x double> %b, i32 1
201 %b2 = extractelement <4 x double> %b, i32 2
202 %b3 = extractelement <4 x double> %b, i32 3
203 %r0 = fadd double %a0, %a1
204 %r1 = fadd double %b0, %b1
205 %r2 = fadd double %a2, %a3
206 %r3 = fadd double %b2, %b3
207 %r00 = insertelement <4 x double> poison, double %r0, i32 0
208 %r01 = insertelement <4 x double> %r00, double %r1, i32 1
209 %r02 = insertelement <4 x double> %r01, double %r2, i32 2
210 %r03 = insertelement <4 x double> %r02, double %r3, i32 3
211 ret <4 x double> %r03
; Partially populated <4 x double> result: lane 0 = a[0]+a[1],
; lane 2 = b[0]+b[1], lane 3 = b[2]+b[3]; lane 1 is never inserted, so it
; keeps the initial poison value.
; The expectations below use the common prefix shared by every run
; configuration: the a[0]+a[1] and b[0]+b[1] sums become one <2 x double>
; fadd whose lanes are spread to positions 0 and 2, while b[2]+b[3] stays a
; scalar fadd that is inserted into lane 3.
215 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
216 ; CHECK-LABEL: @test_v4f64_partial_swizzle(
217 ; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
218 ; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
219 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
220 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
221 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
222 ; CHECK-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
223 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
224 ; CHECK-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
225 ; CHECK-NEXT: ret <4 x double> [[R03]]
; Scalar input: only a[0], a[1] and all four lanes of b except none of
; a[2]/a[3] are read; three sums are inserted at lanes 0, 2 and 3.
227 %a0 = extractelement <4 x double> %a, i64 0
228 %a1 = extractelement <4 x double> %a, i64 1
229 %b0 = extractelement <4 x double> %b, i64 0
230 %b1 = extractelement <4 x double> %b, i64 1
231 %b2 = extractelement <4 x double> %b, i32 2
232 %b3 = extractelement <4 x double> %b, i32 3
233 %r0 = fadd double %a0, %a1
234 %r2 = fadd double %b0, %b1
235 %r3 = fadd double %b2, %b3
236 %r00 = insertelement <4 x double> poison, double %r0, i32 0
237 %r02 = insertelement <4 x double> %r00, double %r2, i32 2
238 %r03 = insertelement <4 x double> %r02, double %r3, i32 3
239 ret <4 x double> %r03
; Horizontal add of two <8 x float> inputs with a/b results interleaved in
; groups of two:
;   result = [ a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 ]
; Expectations differ per prefix:
;   - SSE and SLM prefixes: four <4 x float> shuffles, two <4 x float> fadds,
;     and a final shuffle that concatenates the halves into <8 x float>.
;   - AVX prefix: two <8 x float> shuffles and a single <8 x float> fadd.
242 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
243 ; SSE-LABEL: @test_v8f32(
244 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
245 ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
246 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
247 ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
248 ; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
249 ; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
250 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
251 ; SSE-NEXT: ret <8 x float> [[TMP7]]
253 ; SLM-LABEL: @test_v8f32(
254 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
255 ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
256 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
257 ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
258 ; SLM-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
259 ; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
260 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
261 ; SLM-NEXT: ret <8 x float> [[TMP7]]
263 ; AVX-LABEL: @test_v8f32(
264 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
265 ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
266 ; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
267 ; AVX-NEXT: ret <8 x float> [[TMP3]]
; Scalar input: extract every lane, add adjacent pairs in the interleaved
; a/b order described above, then rebuild the result vector from poison.
269 %a0 = extractelement <8 x float> %a, i32 0
270 %a1 = extractelement <8 x float> %a, i32 1
271 %a2 = extractelement <8 x float> %a, i32 2
272 %a3 = extractelement <8 x float> %a, i32 3
273 %a4 = extractelement <8 x float> %a, i32 4
274 %a5 = extractelement <8 x float> %a, i32 5
275 %a6 = extractelement <8 x float> %a, i32 6
276 %a7 = extractelement <8 x float> %a, i32 7
277 %b0 = extractelement <8 x float> %b, i32 0
278 %b1 = extractelement <8 x float> %b, i32 1
279 %b2 = extractelement <8 x float> %b, i32 2
280 %b3 = extractelement <8 x float> %b, i32 3
281 %b4 = extractelement <8 x float> %b, i32 4
282 %b5 = extractelement <8 x float> %b, i32 5
283 %b6 = extractelement <8 x float> %b, i32 6
284 %b7 = extractelement <8 x float> %b, i32 7
285 %r0 = fadd float %a0, %a1
286 %r1 = fadd float %a2, %a3
287 %r2 = fadd float %b0, %b1
288 %r3 = fadd float %b2, %b3
289 %r4 = fadd float %a4, %a5
290 %r5 = fadd float %a6, %a7
291 %r6 = fadd float %b4, %b5
292 %r7 = fadd float %b6, %b7
293 %r00 = insertelement <8 x float> poison, float %r0, i32 0
294 %r01 = insertelement <8 x float> %r00, float %r1, i32 1
295 %r02 = insertelement <8 x float> %r01, float %r2, i32 2
296 %r03 = insertelement <8 x float> %r02, float %r3, i32 3
297 %r04 = insertelement <8 x float> %r03, float %r4, i32 4
298 %r05 = insertelement <8 x float> %r04, float %r5, i32 5
299 %r06 = insertelement <8 x float> %r05, float %r6, i32 6
300 %r07 = insertelement <8 x float> %r06, float %r7, i32 7
; Integer horizontal add of two <4 x i64> inputs with a/b results interleaved:
;   result = [ a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3] ]
; Expectations differ per prefix:
;   - SSE and SLM prefixes: four <2 x i64> shuffles, two <2 x i64> adds, and a
;     final shuffle that concatenates the halves into <4 x i64>.
;   - AVX prefix: two <4 x i64> shuffles (masks 0,4,2,6 and 1,5,3,7) and a
;     single <4 x i64> add.
304 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
305 ; SSE-LABEL: @test_v4i64(
306 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
307 ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
308 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
309 ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
310 ; SSE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
311 ; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
312 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
313 ; SSE-NEXT: ret <4 x i64> [[TMP7]]
315 ; SLM-LABEL: @test_v4i64(
316 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
317 ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
318 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
319 ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
320 ; SLM-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
321 ; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
322 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
323 ; SLM-NEXT: ret <4 x i64> [[TMP7]]
325 ; AVX-LABEL: @test_v4i64(
326 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
327 ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
328 ; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
329 ; AVX-NEXT: ret <4 x i64> [[TMP3]]
; Scalar input: extract every lane, add adjacent pairs alternating between a
; and b, then rebuild the result vector starting from poison.
331 %a0 = extractelement <4 x i64> %a, i32 0
332 %a1 = extractelement <4 x i64> %a, i32 1
333 %a2 = extractelement <4 x i64> %a, i32 2
334 %a3 = extractelement <4 x i64> %a, i32 3
335 %b0 = extractelement <4 x i64> %b, i32 0
336 %b1 = extractelement <4 x i64> %b, i32 1
337 %b2 = extractelement <4 x i64> %b, i32 2
338 %b3 = extractelement <4 x i64> %b, i32 3
339 %r0 = add i64 %a0, %a1
340 %r1 = add i64 %b0, %b1
341 %r2 = add i64 %a2, %a3
342 %r3 = add i64 %b2, %b3
343 %r00 = insertelement <4 x i64> poison, i64 %r0, i32 0
344 %r01 = insertelement <4 x i64> %r00, i64 %r1, i32 1
345 %r02 = insertelement <4 x i64> %r01, i64 %r2, i32 2
346 %r03 = insertelement <4 x i64> %r02, i64 %r3, i32 3
; Integer horizontal add of two <8 x i32> inputs with a/b results interleaved
; in groups of two:
;   result = [ a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 ]
; Expectations differ per prefix:
;   - SSE and SLM prefixes: four <4 x i32> shuffles, two <4 x i32> adds, and a
;     final shuffle that concatenates the halves into <8 x i32>.
;   - AVX prefix: two <8 x i32> shuffles and a single <8 x i32> add.
350 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
351 ; SSE-LABEL: @test_v8i32(
352 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
353 ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
354 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
355 ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
356 ; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
357 ; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
358 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
359 ; SSE-NEXT: ret <8 x i32> [[TMP7]]
361 ; SLM-LABEL: @test_v8i32(
362 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
363 ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
364 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
365 ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
366 ; SLM-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
367 ; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
368 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
369 ; SLM-NEXT: ret <8 x i32> [[TMP7]]
371 ; AVX-LABEL: @test_v8i32(
372 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
373 ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
374 ; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
375 ; AVX-NEXT: ret <8 x i32> [[TMP3]]
; Scalar input: extract every lane, add adjacent pairs in the interleaved
; a/b order described above, then rebuild the result vector from poison.
377 %a0 = extractelement <8 x i32> %a, i32 0
378 %a1 = extractelement <8 x i32> %a, i32 1
379 %a2 = extractelement <8 x i32> %a, i32 2
380 %a3 = extractelement <8 x i32> %a, i32 3
381 %a4 = extractelement <8 x i32> %a, i32 4
382 %a5 = extractelement <8 x i32> %a, i32 5
383 %a6 = extractelement <8 x i32> %a, i32 6
384 %a7 = extractelement <8 x i32> %a, i32 7
385 %b0 = extractelement <8 x i32> %b, i32 0
386 %b1 = extractelement <8 x i32> %b, i32 1
387 %b2 = extractelement <8 x i32> %b, i32 2
388 %b3 = extractelement <8 x i32> %b, i32 3
389 %b4 = extractelement <8 x i32> %b, i32 4
390 %b5 = extractelement <8 x i32> %b, i32 5
391 %b6 = extractelement <8 x i32> %b, i32 6
392 %b7 = extractelement <8 x i32> %b, i32 7
393 %r0 = add i32 %a0, %a1
394 %r1 = add i32 %a2, %a3
395 %r2 = add i32 %b0, %b1
396 %r3 = add i32 %b2, %b3
397 %r4 = add i32 %a4, %a5
398 %r5 = add i32 %a6, %a7
399 %r6 = add i32 %b4, %b5
400 %r7 = add i32 %b6, %b7
401 %r00 = insertelement <8 x i32> poison, i32 %r0, i32 0
402 %r01 = insertelement <8 x i32> %r00, i32 %r1, i32 1
403 %r02 = insertelement <8 x i32> %r01, i32 %r2, i32 2
404 %r03 = insertelement <8 x i32> %r02, i32 %r3, i32 3
405 %r04 = insertelement <8 x i32> %r03, i32 %r4, i32 4
406 %r05 = insertelement <8 x i32> %r04, i32 %r5, i32 5
407 %r06 = insertelement <8 x i32> %r05, i32 %r6, i32 6
408 %r07 = insertelement <8 x i32> %r06, i32 %r7, i32 7
; Integer horizontal add of two <16 x i16> inputs with a/b results interleaved
; in groups of four:
;   lanes 0-3   = adjacent-pair sums of a[0..7]
;   lanes 4-7   = adjacent-pair sums of b[0..7]
;   lanes 8-11  = adjacent-pair sums of a[8..15]
;   lanes 12-15 = adjacent-pair sums of b[8..15]
; Expectations differ per prefix:
;   - SSE and SLM prefixes: four <8 x i16> shuffles, two <8 x i16> adds, and a
;     final shuffle that concatenates the halves into <16 x i16>.
;   - AVX prefix: two <16 x i16> shuffles and a single <16 x i16> add.
412 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
413 ; SSE-LABEL: @test_v16i16(
414 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
415 ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
416 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
417 ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
418 ; SSE-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
419 ; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
420 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
421 ; SSE-NEXT: ret <16 x i16> [[TMP7]]
423 ; SLM-LABEL: @test_v16i16(
424 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
425 ; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
426 ; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
427 ; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
428 ; SLM-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
429 ; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
430 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
431 ; SLM-NEXT: ret <16 x i16> [[TMP7]]
433 ; AVX-LABEL: @test_v16i16(
434 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
435 ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
436 ; AVX-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
437 ; AVX-NEXT: ret <16 x i16> [[TMP3]]
; Scalar input: extract all 16 lanes of a and of b ...
439 %a0 = extractelement <16 x i16> %a, i32 0
440 %a1 = extractelement <16 x i16> %a, i32 1
441 %a2 = extractelement <16 x i16> %a, i32 2
442 %a3 = extractelement <16 x i16> %a, i32 3
443 %a4 = extractelement <16 x i16> %a, i32 4
444 %a5 = extractelement <16 x i16> %a, i32 5
445 %a6 = extractelement <16 x i16> %a, i32 6
446 %a7 = extractelement <16 x i16> %a, i32 7
447 %a8 = extractelement <16 x i16> %a, i32 8
448 %a9 = extractelement <16 x i16> %a, i32 9
449 %a10 = extractelement <16 x i16> %a, i32 10
450 %a11 = extractelement <16 x i16> %a, i32 11
451 %a12 = extractelement <16 x i16> %a, i32 12
452 %a13 = extractelement <16 x i16> %a, i32 13
453 %a14 = extractelement <16 x i16> %a, i32 14
454 %a15 = extractelement <16 x i16> %a, i32 15
455 %b0 = extractelement <16 x i16> %b, i32 0
456 %b1 = extractelement <16 x i16> %b, i32 1
457 %b2 = extractelement <16 x i16> %b, i32 2
458 %b3 = extractelement <16 x i16> %b, i32 3
459 %b4 = extractelement <16 x i16> %b, i32 4
460 %b5 = extractelement <16 x i16> %b, i32 5
461 %b6 = extractelement <16 x i16> %b, i32 6
462 %b7 = extractelement <16 x i16> %b, i32 7
463 %b8 = extractelement <16 x i16> %b, i32 8
464 %b9 = extractelement <16 x i16> %b, i32 9
465 %b10 = extractelement <16 x i16> %b, i32 10
466 %b11 = extractelement <16 x i16> %b, i32 11
467 %b12 = extractelement <16 x i16> %b, i32 12
468 %b13 = extractelement <16 x i16> %b, i32 13
469 %b14 = extractelement <16 x i16> %b, i32 14
470 %b15 = extractelement <16 x i16> %b, i32 15
; ... add adjacent pairs in the interleaved a/b order described above ...
471 %r0 = add i16 %a0 , %a1
472 %r1 = add i16 %a2 , %a3
473 %r2 = add i16 %a4 , %a5
474 %r3 = add i16 %a6 , %a7
475 %r4 = add i16 %b0 , %b1
476 %r5 = add i16 %b2 , %b3
477 %r6 = add i16 %b4 , %b5
478 %r7 = add i16 %b6 , %b7
479 %r8 = add i16 %a8 , %a9
480 %r9 = add i16 %a10, %a11
481 %r10 = add i16 %a12, %a13
482 %r11 = add i16 %a14, %a15
483 %r12 = add i16 %b8 , %b9
484 %r13 = add i16 %b10, %b11
485 %r14 = add i16 %b12, %b13
486 %r15 = add i16 %b14, %b15
; ... and rebuild the 16-lane result one insert at a time, starting from poison.
487 %rv0 = insertelement <16 x i16> poison, i16 %r0 , i32 0
488 %rv1 = insertelement <16 x i16> %rv0 , i16 %r1 , i32 1
489 %rv2 = insertelement <16 x i16> %rv1 , i16 %r2 , i32 2
490 %rv3 = insertelement <16 x i16> %rv2 , i16 %r3 , i32 3
491 %rv4 = insertelement <16 x i16> %rv3 , i16 %r4 , i32 4
492 %rv5 = insertelement <16 x i16> %rv4 , i16 %r5 , i32 5
493 %rv6 = insertelement <16 x i16> %rv5 , i16 %r6 , i32 6
494 %rv7 = insertelement <16 x i16> %rv6 , i16 %r7 , i32 7
495 %rv8 = insertelement <16 x i16> %rv7 , i16 %r8 , i32 8
496 %rv9 = insertelement <16 x i16> %rv8 , i16 %r9 , i32 9
497 %rv10 = insertelement <16 x i16> %rv9 , i16 %r10, i32 10
498 %rv11 = insertelement <16 x i16> %rv10, i16 %r11, i32 11
499 %rv12 = insertelement <16 x i16> %rv11, i16 %r12, i32 12
500 %rv13 = insertelement <16 x i16> %rv12, i16 %r13, i32 13
501 %rv14 = insertelement <16 x i16> %rv13, i16 %r14, i32 14
502 %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15