1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
; Swaps the two 128-bit halves of %a: the mask <4,5,6,7,0,1,2,3> only picks
; elements of the first operand, so %b is never selected.
; Per the checks below, the expected shuffle is vperm2f128 with AVX1 and
; vpermpd with AVX2.
5 define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
6 ; AVX1-LABEL: shuffle_v8f32_45670123:
7 ; AVX1: # %bb.0: # %entry
8 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
11 ; AVX2-LABEL: shuffle_v8f32_45670123:
12 ; AVX2: # %bb.0: # %entry
13 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
16 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
17 ret <8 x float> %shuffle
; Memory variant of the half swap: %a is loaded from %pa and shuffled with the
; mask <4,5,6,7,0,1,2,3>; %b is loaded from %pb but never selected by the mask.
; The checks print the shuffle source as "mem", i.e. the load is expected to be
; folded into vperm2f128 (AVX1 run) or vpermpd (AVX2 run).
; NOTE(review): the function is marked readnone yet contains loads - confirm
; the attribute is intended.
20 define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
21 ; AVX1-LABEL: shuffle_v8f32_45670123_mem:
22 ; AVX1: # %bb.0: # %entry
23 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
26 ; AVX2-LABEL: shuffle_v8f32_45670123_mem:
27 ; AVX2: # %bb.0: # %entry
28 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,3,0,1]
31 %a = load <8 x float>, <8 x float>* %pa
32 %b = load <8 x float>, <8 x float>* %pb
33 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
34 ret <8 x float> %shuffle
; Keeps the low 128-bit half of %a (mask indices 0..3) and takes the high half
; of %b (mask indices 12..15 are elements 4..7 of the second operand).
; Both run configurations share one check prefix here and expect a vblendps.
37 define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
38 ; ALL-LABEL: shuffle_v8f32_0123cdef:
39 ; ALL: # %bb.0: # %entry
40 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
43 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
44 ret <8 x float> %shuffle
; Duplicates the low 128-bit half of %a into both halves of the result
; (mask <0,1,2,3,0,1,2,3>); %b is never selected.
; Per the checks below, the AVX1 run expects vinsertf128 of xmm0 into the upper
; half, and the AVX2 run expects vpermpd.
47 define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
48 ; AVX1-LABEL: shuffle_v8f32_01230123:
49 ; AVX1: # %bb.0: # %entry
50 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
53 ; AVX2-LABEL: shuffle_v8f32_01230123:
54 ; AVX2: # %bb.0: # %entry
55 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
58 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
59 ret <8 x float> %shuffle
; Memory variant of the low-half duplication: %a is loaded from %pa and
; shuffled with the mask <0,1,2,3,0,1,2,3>; %b is loaded from %pb but never
; selected. The checks print the shuffle source as "mem", i.e. the load is
; expected to be folded into vperm2f128 (AVX1 run) or vpermpd (AVX2 run).
; NOTE(review): the function is marked readnone yet contains loads - confirm
; the attribute is intended.
62 define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
63 ; AVX1-LABEL: shuffle_v8f32_01230123_mem:
64 ; AVX1: # %bb.0: # %entry
65 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
68 ; AVX2-LABEL: shuffle_v8f32_01230123_mem:
69 ; AVX2: # %bb.0: # %entry
70 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,1]
73 %a = load <8 x float>, <8 x float>* %pa
74 %b = load <8 x float>, <8 x float>* %pb
75 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
76 ret <8 x float> %shuffle
; Duplicates the high 128-bit half of %a into both halves of the result
; (mask <4,5,6,7,4,5,6,7>); %b is never selected.
; Per the checks below, the AVX1 run expects vperm2f128 and the AVX2 run
; expects vpermpd.
79 define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
80 ; AVX1-LABEL: shuffle_v8f32_45674567:
81 ; AVX1: # %bb.0: # %entry
82 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
85 ; AVX2-LABEL: shuffle_v8f32_45674567:
86 ; AVX2: # %bb.0: # %entry
87 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
90 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
91 ret <8 x float> %shuffle
; Memory variant of the high-half duplication: %a is loaded from %pa and
; shuffled with the mask <4,5,6,7,4,5,6,7>; %b is loaded from %pb but never
; selected. The checks print the shuffle source as "mem", i.e. the load is
; expected to be folded into vperm2f128 (AVX1 run) or vpermpd (AVX2 run).
; NOTE(review): the function is marked readnone yet contains loads - confirm
; the attribute is intended.
94 define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
95 ; AVX1-LABEL: shuffle_v8f32_45674567_mem:
96 ; AVX1: # %bb.0: # %entry
97 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
100 ; AVX2-LABEL: shuffle_v8f32_45674567_mem:
101 ; AVX2: # %bb.0: # %entry
102 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,3,2,3]
105 %a = load <8 x float>, <8 x float>* %pa
106 %b = load <8 x float>, <8 x float>* %pb
107 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
108 ret <8 x float> %shuffle
; Byte-vector form of the high-half duplication: the mask lists indices 16..31
; twice, so the upper 16 bytes of %a fill both halves; %b is never selected.
; No integer arithmetic is involved; per the checks below, the AVX1 run expects
; vperm2f128 and the AVX2 run expects vpermpd.
111 define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
112 ; AVX1-LABEL: shuffle_v32i8_2323:
113 ; AVX1: # %bb.0: # %entry
114 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
117 ; AVX2-LABEL: shuffle_v32i8_2323:
118 ; AVX2: # %bb.0: # %entry
119 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
122 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
123 ret <32 x i8> %shuffle
; Same high-half duplication on bytes, but %a is first incremented by 1 in
; every lane so the shuffle input comes from an integer operation.
; Per the checks below: the AVX1 run performs the add on the extracted upper
; 128 bits (vpcmpeqd + vpsubb), re-inserts it and still shuffles with
; vperm2f128; the AVX2 run adds on the full ymm register and shuffles with the
; integer instruction vpermq.
126 define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
127 ; AVX1-LABEL: shuffle_v32i8_2323_domain:
128 ; AVX1: # %bb.0: # %entry
129 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
130 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
131 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
132 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
133 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
136 ; AVX2-LABEL: shuffle_v32i8_2323_domain:
137 ; AVX2: # %bb.0: # %entry
138 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
139 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
140 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
143 ; add forces execution domain
144 %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
145 %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
146 ret <32 x i8> %shuffle
; Two-input lane shuffle on i64 elements: the mask <6,7,0,1> puts the high
; half of %b (second-operand elements 2,3) in the low half of the result and
; the low half of %a in the high half.
; There is no integer arithmetic, and both run configurations expect vperm2f128.
149 define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
150 ; ALL-LABEL: shuffle_v4i64_6701:
151 ; ALL: # %bb.0: # %entry
152 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
155 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
156 ret <4 x i64> %shuffle
; Same <6,7,0,1> lane shuffle, but %a is incremented by 1 per element first so
; the shuffle consumes the result of an integer operation.
; Per the checks below: the AVX1 run adds only on xmm0 (only the low half of
; the incremented value is selected by the mask) and shuffles with vperm2f128;
; the AVX2 run adds on the full ymm register and shuffles with vperm2i128.
159 define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
160 ; AVX1-LABEL: shuffle_v4i64_6701_domain:
161 ; AVX1: # %bb.0: # %entry
162 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
163 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
164 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
167 ; AVX2-LABEL: shuffle_v4i64_6701_domain:
168 ; AVX2: # %bb.0: # %entry
169 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
170 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
171 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
174 ; add forces execution domain
175 %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
176 %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
177 ret <4 x i64> %shuffle
; Lane shuffle with undef mask entries on i32 elements. %a is incremented by 1
; first; the mask <u,5,u,7,12,13,14,15> then takes elements 5 and 7 of the
; incremented value (its high half, in place) for the low half of the result
; and the high half of %b for the high half.
; Per the checks below: the AVX1 run adds only on the extracted upper 128 bits
; and shuffles with vperm2f128; the AVX2 run adds on the full ymm register and
; shuffles with vperm2i128.
180 define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
181 ; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
182 ; AVX1: # %bb.0: # %entry
183 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
184 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
185 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
186 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
187 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
190 ; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
191 ; AVX2: # %bb.0: # %entry
192 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
193 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
194 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
197 ; add forces execution domain
198 %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
199 %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
200 ret <8 x i32> %shuffle
; Lane shuffle on i16 elements. %a is incremented by 1 first; the mask
; <16..23, 0..7> then places the low half of %b in the low half of the result
; and the low half of the incremented value in the high half.
; Per the checks below, both runs add only on xmm0 and then insert it into the
; upper half of ymm1: vinsertf128 with AVX1, vinserti128 with AVX2.
203 define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
204 ; AVX1-LABEL: shuffle_v16i16_4501:
205 ; AVX1: # %bb.0: # %entry
206 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
207 ; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
208 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
211 ; AVX2-LABEL: shuffle_v16i16_4501:
212 ; AVX2: # %bb.0: # %entry
213 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
214 ; AVX2-NEXT: vpsubw %xmm2, %xmm0, %xmm0
215 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
218 ; add forces execution domain
219 %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
220 %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
221 ret <16 x i16> %shuffle
; Memory variant of the i16 lane shuffle: %c is loaded from %a and incremented
; by 1, %d is loaded from %b, and the mask <16..23, 0..7> combines the low half
; of %d (result low half) with the low half of the incremented %c (result high
; half).
; Per the checks below: the AVX1 run loads and adds only 128 bits, then uses
; vperm2f128 with a "mem" source; the AVX2 run loads and adds the full ymm
; register, then uses vperm2i128 with a "mem" source.
; NOTE(review): the function is marked readnone yet contains loads - confirm
; the attribute is intended.
224 define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
225 ; AVX1-LABEL: shuffle_v16i16_4501_mem:
226 ; AVX1: # %bb.0: # %entry
227 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
228 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
229 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
230 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
233 ; AVX2-LABEL: shuffle_v16i16_4501_mem:
234 ; AVX2: # %bb.0: # %entry
235 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
236 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
237 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
238 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
241 %c = load <16 x i16>, <16 x i16>* %a
242 %d = load <16 x i16>, <16 x i16>* %b
243 %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
244 %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
245 ret <16 x i16> %shuffle
248 ;;;; Cases with undef indices mixed in the mask
; Mask <u,u,6,7,u,9,u,11>: the defined entries of the low half come from the
; high half of %a (elements 6,7, in place) and those of the high half come
; from the low half of %b (indices 9 and 11 are elements 1 and 3 of %b, in
; place). Both run configurations expect vperm2f128.
250 define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
251 ; ALL-LABEL: shuffle_v8f32_uu67u9ub:
252 ; ALL: # %bb.0: # %entry
253 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
256 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
257 ret <8 x float> %shuffle
; Mask <u,u,6,7,u,u,6,7>: every defined entry is element 6 or 7 of %a, so the
; high half of %a can serve both result halves; %b is never selected.
; Per the checks below, the AVX1 run expects vperm2f128 with [2,3,2,3] and the
; AVX2 run expects vpermpd with [0,3,2,3].
260 define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
261 ; AVX1-LABEL: shuffle_v8f32_uu67uu67:
262 ; AVX1: # %bb.0: # %entry
263 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
266 ; AVX2-LABEL: shuffle_v8f32_uu67uu67:
267 ; AVX2: # %bb.0: # %entry
268 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
271 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
272 ret <8 x float> %shuffle
; Mask <u,u,6,7,u,u,10,11>: the low half takes elements 6,7 of %a (its high
; half, in place) and the high half takes indices 10,11, i.e. elements 2,3 of
; %b (its low half, in place). Both run configurations expect vperm2f128.
275 define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
276 ; ALL-LABEL: shuffle_v8f32_uu67uuab:
277 ; ALL: # %bb.0: # %entry
278 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
281 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
282 ret <8 x float> %shuffle
; Mask <u,u,6,7,u,u,14,15>: the low half takes elements 6,7 of %a and the high
; half takes indices 14,15, i.e. elements 6,7 of %b, so both result halves
; come from the high halves of their inputs.
; Both run configurations expect vperm2f128.
285 define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
286 ; ALL-LABEL: shuffle_v8f32_uu67uuef:
287 ; ALL: # %bb.0: # %entry
288 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
291 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
292 ret <8 x float> %shuffle
; Mask <u,u,6,7,4,5,6,7>: the high half of the result is the full high half of
; %a, and the defined entries of the low half (6,7) also come from it; %b is
; never selected.
; Per the checks below, the AVX1 run expects vperm2f128 with [2,3,2,3] and the
; AVX2 run expects vpermpd with [0,3,2,3].
295 define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
296 ; AVX1-LABEL: shuffle_v8f32_uu674567:
297 ; AVX1: # %bb.0: # %entry
298 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
301 ; AVX2-LABEL: shuffle_v8f32_uu674567:
302 ; AVX2: # %bb.0: # %entry
303 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
306 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
307 ret <8 x float> %shuffle
; Mask <u,u,6,7,8,9,10,11>: the low half takes elements 6,7 of %a (its high
; half, in place) and the high half is the full low half of %b (indices 8..11
; are elements 0..3 of the second operand).
; Both run configurations expect vperm2f128.
310 define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
311 ; ALL-LABEL: shuffle_v8f32_uu6789ab:
312 ; ALL: # %bb.0: # %entry
313 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
316 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
317 ret <8 x float> %shuffle
; Mask <4,5,6,7,u,u,6,7>: the low half of the result is the full high half of
; %a, and the defined entries of the high half (6,7) also come from it; %b is
; never selected.
; Per the checks below, the AVX1 run expects vperm2f128 and the AVX2 run
; expects vpermpd, both with [2,3,2,3].
320 define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
321 ; AVX1-LABEL: shuffle_v8f32_4567uu67:
322 ; AVX1: # %bb.0: # %entry
323 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
326 ; AVX2-LABEL: shuffle_v8f32_4567uu67:
327 ; AVX2: # %bb.0: # %entry
328 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
331 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
332 ret <8 x float> %shuffle
; Mask <4,5,6,7,u,u,14,15>: the low half of the result is the full high half
; of %a, and the defined entries of the high half are indices 14,15, i.e.
; elements 6,7 of %b (its high half, in place).
; Both run configurations expect vperm2f128.
335 define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
336 ; ALL-LABEL: shuffle_v8f32_4567uuef:
337 ; ALL: # %bb.0: # %entry
338 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
341 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
342 ret <8 x float> %shuffle
345 ;;;; Cases we must not select vperm2f128 alone (the mask also needs an in-lane permute)
; Mask <u,u,6,7,u,12,u,15>: the low half takes elements 6,7 of %a in place,
; but the high half puts index 12 (element 4 of %b, the first slot of its high
; half) into the second slot of the result's high half, so selecting whole
; 128-bit halves is not sufficient by itself.
; Per the checks below, both run configurations expect vperm2f128 followed by
; an in-lane vpermilps.
347 define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
348 ; ALL-LABEL: shuffle_v8f32_uu67ucuf:
349 ; ALL: # %bb.0: # %entry
350 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
351 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
354 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
355 ret <8 x float> %shuffle
358 ;; Test zero mask generation.
359 ;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
360 ;; Prefer xor+vblendpd over vperm2f128 because that has better performance.
361 ;; TODO: When building for optsize we should use vperm2f128.
; Zero low half, low half of %a moved to the high half: the second operand is
; the constant <0.0, 0.0, undef, undef> and the mask <4,5,0,1> picks its two
; zero elements followed by elements 0,1 of %a.
; Both run configurations expect a zeroing vperm2f128.
363 define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
364 ; ALL-LABEL: shuffle_v4f64_zz01:
366 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
368 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
; optsize variant: the mask <4,5,0,1> picks the two zero elements of the
; constant second operand followed by elements 0,1 of %a (zero low half, low
; half of %a in the high half).
; Both run configurations expect a zeroing vperm2f128.
371 define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
372 ; ALL-LABEL: shuffle_v4f64_zz01_optsize:
374 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
376 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
; Zero low half, high half of %a kept in place: the mask <4,5,2,3> picks the
; two zero elements of the constant second operand followed by elements 2,3 of
; %a.
; Both run configurations expect vxorps (zero register) followed by vblendps.
380 define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
381 ; ALL-LABEL: shuffle_v4f64_zz23:
383 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
384 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
386 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; optsize variant: the mask <4,5,2,3> picks the two zero elements of the
; constant second operand followed by elements 2,3 of %a (zero low half, high
; half of %a kept in place).
; Both run configurations expect vxorps (zero register) followed by vblendps.
389 define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
390 ; ALL-LABEL: shuffle_v4f64_zz23_optsize:
392 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
393 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
395 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; Operand-swapped form: the constant <0.0, 0.0, undef, undef> is the first
; operand and %a the second. The mask <0,1,4,5> picks the two zero elements
; followed by elements 0,1 of %a (zero low half, low half of %a in the high
; half).
; Both run configurations expect a zeroing vperm2f128.
399 define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
400 ; ALL-LABEL: shuffle_v4f64_zz45:
402 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
404 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; optsize variant of the operand-swapped form: the constant is the first
; operand, and the mask <0,1,4,5> picks its two zero elements followed by
; elements 0,1 of %a (zero low half, low half of %a in the high half).
; Both run configurations expect a zeroing vperm2f128.
407 define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
408 ; ALL-LABEL: shuffle_v4f64_zz45_optsize:
410 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
412 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; Operand-swapped form: the constant <0.0, 0.0, undef, undef> is the first
; operand and %a the second. The mask <0,1,6,7> picks the two zero elements
; followed by elements 2,3 of %a (zero low half, high half of %a kept in
; place).
; Both run configurations expect vxorps (zero register) followed by vblendps.
416 define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
417 ; ALL-LABEL: shuffle_v4f64_zz67:
419 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
420 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
422 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
; optsize variant of the operand-swapped form: the constant is the first
; operand, and the mask <0,1,6,7> picks its two zero elements followed by
; elements 2,3 of %a (zero low half, high half of %a kept in place).
; Both run configurations expect vxorps (zero register) followed by vblendps.
425 define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
426 ; ALL-LABEL: shuffle_v4f64_zz67_optsize:
428 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
429 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
431 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
; Low half of %a kept in place, zero high half: the mask <0,1,4,5> picks
; elements 0,1 of %a followed by the two zero elements of the constant second
; operand.
; Both run configurations expect only a 128-bit vmovaps of xmm0 onto itself.
435 define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
436 ; ALL-LABEL: shuffle_v4f64_01zz:
438 ; ALL-NEXT: vmovaps %xmm0, %xmm0
440 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; optsize variant: the mask <0,1,4,5> picks elements 0,1 of %a followed by the
; two zero elements of the constant second operand (low half of %a kept in
; place, zero high half).
; Both run configurations expect only a 128-bit vmovaps of xmm0 onto itself.
443 define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
444 ; ALL-LABEL: shuffle_v4f64_01zz_optsize:
446 ; ALL-NEXT: vmovaps %xmm0, %xmm0
448 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; High half of %a moved to the low half, zero high half: the mask <2,3,4,5>
; picks elements 2,3 of %a followed by the two zero elements of the constant
; second operand.
; Both run configurations expect a zeroing vperm2f128.
452 define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
453 ; ALL-LABEL: shuffle_v4f64_23zz:
455 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
457 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; optsize variant: the mask <2,3,4,5> picks elements 2,3 of %a followed by the
; two zero elements of the constant second operand (high half of %a moved to
; the low half, zero high half).
; Both run configurations expect a zeroing vperm2f128.
460 define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
461 ; ALL-LABEL: shuffle_v4f64_23zz_optsize:
463 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
465 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; Operand-swapped form: the constant <0.0, 0.0, undef, undef> is the first
; operand and %a the second. The mask <4,5,0,1> picks elements 0,1 of %a
; followed by the two zero elements (low half of %a kept in place, zero high
; half).
; Both run configurations expect only a 128-bit vmovaps of xmm0 onto itself.
469 define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
470 ; ALL-LABEL: shuffle_v4f64_45zz:
472 ; ALL-NEXT: vmovaps %xmm0, %xmm0
474 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
; optsize variant of the operand-swapped form: the constant is the first
; operand, and the mask <4,5,0,1> picks elements 0,1 of %a followed by the two
; zero elements (low half of %a kept in place, zero high half).
; Both run configurations expect only a 128-bit vmovaps of xmm0 onto itself.
477 define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
478 ; ALL-LABEL: shuffle_v4f64_45zz_optsize:
480 ; ALL-NEXT: vmovaps %xmm0, %xmm0
482 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
; Operand-swapped form: the constant <0.0, 0.0, undef, undef> is the first
; operand and %a the second. The mask <6,7,0,1> picks elements 2,3 of %a
; followed by the two zero elements (high half of %a moved to the low half,
; zero high half).
; Both run configurations expect a zeroing vperm2f128.
486 define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
487 ; ALL-LABEL: shuffle_v4f64_67zz:
489 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
491 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
; optsize variant of the operand-swapped form: the constant is the first
; operand, and the mask <6,7,0,1> picks elements 2,3 of %a followed by the two
; zero elements (high half of %a moved to the low half, zero high half).
; Both run configurations expect a zeroing vperm2f128.
494 define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
495 ; ALL-LABEL: shuffle_v4f64_67zz_optsize:
497 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
499 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
503 ;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
; Integer form of the zeroing shuffle: the mask <6,7,0,1> moves the high half
; of %a into the low half and fills the high half with the zero elements of
; the constant first operand; the result is then added to %b.
; Per the checks below: the AVX1 run extracts the upper 128 bits of %a, adds
; them to the low half of %b with a 128-bit vpaddq and blends in the upper
; half of %b; the AVX2 run uses the integer shuffle vperm2i128 followed by a
; 256-bit vpaddq.
505 define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
506 ; AVX1-LABEL: shuffle_v4i64_67zz:
508 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
509 ; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
510 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
513 ; AVX2-LABEL: shuffle_v4i64_67zz:
515 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
516 ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
518 %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
519 %c = add <4 x i64> %b, %s
523 ;;; Memory folding cases
; First operand from memory: %a is loaded from %pa, and the mask <2,3,4,5>
; combines the high half of %a (result low half) with the low half of %b
; (result high half); 1.0 is then added to every element.
; Per the checks below, both runs expect the load folded into vperm2f128
; (source printed as "mem"). The AVX1 run adds a RIP-relative constant
; directly; the AVX2 run first materialises it with vbroadcastsd.
; NOTE(review): the function is marked readnone yet contains a load - confirm
; the attribute is intended.
525 define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readnone ssp {
526 ; AVX1-LABEL: ld0_hi0_lo1_4f64:
527 ; AVX1: # %bb.0: # %entry
528 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
529 ; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
532 ; AVX2-LABEL: ld0_hi0_lo1_4f64:
533 ; AVX2: # %bb.0: # %entry
534 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
535 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
536 ; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
539 %a = load <4 x double>, <4 x double> * %pa
540 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
541 %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
542 ret <4 x double> %res
; Second operand from memory: %b is loaded from %pb, and the mask <2,3,6,7>
; combines the high half of %a (result low half) with the high half of %b
; (result high half); 1.0 is then added to every element.
; Per the checks below, both runs expect the load folded into vperm2f128
; (source printed as "mem"). The AVX1 run adds a RIP-relative constant
; directly; the AVX2 run first materialises it with vbroadcastsd.
; NOTE(review): the function is marked readnone yet contains a load - confirm
; the attribute is intended.
545 define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readnone ssp {
546 ; AVX1-LABEL: ld1_hi0_hi1_4f64:
547 ; AVX1: # %bb.0: # %entry
548 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
549 ; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
552 ; AVX2-LABEL: ld1_hi0_hi1_4f64:
553 ; AVX2: # %bb.0: # %entry
554 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
555 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
556 ; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
559 %b = load <4 x double>, <4 x double> * %pb
560 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
561 %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
562 ret <4 x double> %res
; Float version with the first operand from memory: %a is loaded from %pa, and
; the mask <4,5,6,7,8,9,10,11> combines the high half of %a (result low half)
; with the low half of %b (result high half); 1.0 is then added to every
; element.
; Per the checks below, both runs expect the load folded into vperm2f128
; (source printed as "mem"). The AVX1 run adds a RIP-relative constant
; directly; the AVX2 run first materialises it with vbroadcastss.
; NOTE(review): the function is marked readnone yet contains a load - confirm
; the attribute is intended.
565 define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readnone ssp {
566 ; AVX1-LABEL: ld0_hi0_lo1_8f32:
567 ; AVX1: # %bb.0: # %entry
568 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
569 ; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
572 ; AVX2-LABEL: ld0_hi0_lo1_8f32:
573 ; AVX2: # %bb.0: # %entry
574 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
575 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
576 ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
579 %a = load <8 x float>, <8 x float> * %pa
580 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
581 %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
; Float version with the second operand from memory: %b is loaded from %pb,
; and the mask <4,5,6,7,12,13,14,15> combines the high half of %a (result low
; half) with the high half of %b (result high half); 1.0 is then added to
; every element.
; Per the checks below, both runs expect the load folded into vperm2f128
; (source printed as "mem"). The AVX1 run adds a RIP-relative constant
; directly; the AVX2 run first materialises it with vbroadcastss.
; NOTE(review): the function is marked readnone yet contains a load - confirm
; the attribute is intended.
585 define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readnone ssp {
586 ; AVX1-LABEL: ld1_hi0_hi1_8f32:
587 ; AVX1: # %bb.0: # %entry
588 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
589 ; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
592 ; AVX2-LABEL: ld1_hi0_hi1_8f32:
593 ; AVX2: # %bb.0: # %entry
594 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
595 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
596 ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
599 %b = load <8 x float>, <8 x float> * %pb
600 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
601 %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
; i64 version with the first operand from memory: %a is loaded from %pa, and
; the mask <2,3,4,5> combines the high half of %a (result low half) with the
; low half of %b (result high half); the constant <1,2,3,4> is then added.
; Per the checks below: the AVX1 run folds the load into vperm2f128 and splits
; the add into two 128-bit vpaddq joined by vinsertf128; the AVX2 run folds
; the load into vperm2i128 and uses one 256-bit vpaddq.
; NOTE(review): the function is marked readnone yet contains a load - confirm
; the attribute is intended.
605 define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
606 ; AVX1-LABEL: ld0_hi0_lo1_4i64:
607 ; AVX1: # %bb.0: # %entry
608 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
609 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
610 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
611 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
612 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
615 ; AVX2-LABEL: ld0_hi0_lo1_4i64:
616 ; AVX2: # %bb.0: # %entry
617 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
618 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
621 %a = load <4 x i64>, <4 x i64> * %pa
622 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
623 %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
; i64 version with the second operand from memory: %b is loaded from %pb, and
; the mask <2,3,6,7> combines the high half of %a (result low half) with the
; high half of %b (result high half); the constant <1,2,3,4> is then added.
; Per the checks below: the AVX1 run folds the load into vperm2f128 and splits
; the add into two 128-bit vpaddq joined by vinsertf128; the AVX2 run folds
; the load into vperm2i128 and uses one 256-bit vpaddq.
; NOTE(review): the function is marked readnone yet contains a load - confirm
; the attribute is intended.
627 define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readnone ssp {
628 ; AVX1-LABEL: ld1_hi0_hi1_4i64:
629 ; AVX1: # %bb.0: # %entry
630 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
631 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
632 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
633 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
634 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
637 ; AVX2-LABEL: ld1_hi0_hi1_4i64:
638 ; AVX2: # %bb.0: # %entry
639 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
640 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
643 %b = load <4 x i64>, <4 x i64> * %pb
644 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
645 %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
; i32 version with the first operand from memory: %a is loaded from %pa, and
; the mask <4,5,6,7,8,9,10,11> combines the high half of %a (result low half)
; with the low half of %b (result high half); the constant
; <1,2,3,4,1,2,3,4> is then added.
; Per the checks below: the AVX1 run folds the load into vperm2f128, loads
; [1,2,3,4] once into xmm2 and adds it to each 128-bit half separately; the
; AVX2 run folds the load into vperm2i128 and uses one 256-bit vpaddd.
; NOTE(review): the function is marked readnone yet contains a load - confirm
; the attribute is intended.
649 define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
650 ; AVX1-LABEL: ld0_hi0_lo1_8i32:
651 ; AVX1: # %bb.0: # %entry
652 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
653 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
654 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
655 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
656 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
657 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
660 ; AVX2-LABEL: ld0_hi0_lo1_8i32:
661 ; AVX2: # %bb.0: # %entry
662 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
663 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
666 %a = load <8 x i32>, <8 x i32> * %pa
667 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
668 %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
; i32 version with the second operand from memory: %b is loaded from %pb, and
; the mask <4,5,6,7,12,13,14,15> combines the high half of %a (result low
; half) with the high half of %b (result high half); the constant
; <1,2,3,4,1,2,3,4> is then added.
; Per the checks below: the AVX1 run folds the load into vperm2f128, loads
; [1,2,3,4] once into xmm2 and adds it to each 128-bit half separately; the
; AVX2 run folds the load into vperm2i128 and uses one 256-bit vpaddd.
; NOTE(review): the function is marked readnone yet contains a load - confirm
; the attribute is intended.
672 define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readnone ssp {
673 ; AVX1-LABEL: ld1_hi0_hi1_8i32:
674 ; AVX1: # %bb.0: # %entry
675 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
676 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
677 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
678 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
679 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
680 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
683 ; AVX2-LABEL: ld1_hi0_hi1_8i32:
684 ; AVX2: # %bb.0: # %entry
685 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
686 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
689 %b = load <8 x i32>, <8 x i32> * %pb
690 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
691 %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>