1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX2
5 define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
6 ; AVX1-LABEL: shuffle_v8f32_45670123:
7 ; AVX1: # %bb.0: # %entry
8 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
11 ; AVX2-LABEL: shuffle_v8f32_45670123:
12 ; AVX2: # %bb.0: # %entry
13 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
16 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
17 ret <8 x float> %shuffle
20 define <8 x float> @shuffle_v8f32_45670123_mem(ptr %pa, ptr %pb) nounwind uwtable readnone ssp {
21 ; AVX1-LABEL: shuffle_v8f32_45670123_mem:
22 ; AVX1: # %bb.0: # %entry
23 ; AVX1-NEXT: vperm2f128 $35, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,0,1]
26 ; AVX2-LABEL: shuffle_v8f32_45670123_mem:
27 ; AVX2: # %bb.0: # %entry
28 ; AVX2-NEXT: vpermpd $78, (%rdi), %ymm0 # ymm0 = mem[2,3,0,1]
31 %a = load <8 x float>, ptr %pa
32 %b = load <8 x float>, ptr %pb
33 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
34 ret <8 x float> %shuffle
37 define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
38 ; ALL-LABEL: shuffle_v8f32_0123cdef:
39 ; ALL: # %bb.0: # %entry
40 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
43 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
44 ret <8 x float> %shuffle
47 define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
48 ; AVX1-LABEL: shuffle_v8f32_01230123:
49 ; AVX1: # %bb.0: # %entry
50 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
53 ; AVX2-LABEL: shuffle_v8f32_01230123:
54 ; AVX2: # %bb.0: # %entry
55 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
58 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
59 ret <8 x float> %shuffle
62 define <8 x float> @shuffle_v8f32_01230123_mem(ptr %pa, ptr %pb) nounwind uwtable readnone ssp {
63 ; ALL-LABEL: shuffle_v8f32_01230123_mem:
64 ; ALL: # %bb.0: # %entry
65 ; ALL-NEXT: vbroadcastf128 (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
68 %a = load <8 x float>, ptr %pa
69 %b = load <8 x float>, ptr %pb
70 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
71 ret <8 x float> %shuffle
74 define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
75 ; AVX1-LABEL: shuffle_v8f32_45674567:
76 ; AVX1: # %bb.0: # %entry
77 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
80 ; AVX2-LABEL: shuffle_v8f32_45674567:
81 ; AVX2: # %bb.0: # %entry
82 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
85 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
86 ret <8 x float> %shuffle
89 define <8 x float> @shuffle_v8f32_45674567_mem(ptr %pa, ptr %pb) nounwind uwtable readnone ssp {
90 ; ALL-LABEL: shuffle_v8f32_45674567_mem:
91 ; ALL: # %bb.0: # %entry
92 ; ALL-NEXT: vbroadcastf128 16(%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
95 %a = load <8 x float>, ptr %pa
96 %b = load <8 x float>, ptr %pb
97 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
98 ret <8 x float> %shuffle
101 define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
102 ; AVX1-LABEL: shuffle_v32i8_2323:
103 ; AVX1: # %bb.0: # %entry
104 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
107 ; AVX2-LABEL: shuffle_v32i8_2323:
108 ; AVX2: # %bb.0: # %entry
109 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
112 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
113 ret <32 x i8> %shuffle
116 define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
117 ; AVX1-LABEL: shuffle_v32i8_2323_domain:
118 ; AVX1: # %bb.0: # %entry
119 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
120 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
121 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
122 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
125 ; AVX2-LABEL: shuffle_v32i8_2323_domain:
126 ; AVX2: # %bb.0: # %entry
127 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
128 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
129 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
132 ; add forces execution domain
133 %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
134 %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
135 ret <32 x i8> %shuffle
138 define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
139 ; ALL-LABEL: shuffle_v4i64_6701:
140 ; ALL: # %bb.0: # %entry
141 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
144 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
145 ret <4 x i64> %shuffle
148 define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
149 ; AVX1-LABEL: shuffle_v4i64_6701_domain:
150 ; AVX1: # %bb.0: # %entry
151 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
152 ; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
153 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
156 ; AVX2-LABEL: shuffle_v4i64_6701_domain:
157 ; AVX2: # %bb.0: # %entry
158 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
159 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
160 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
163 ; add forces execution domain
164 %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
165 %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
166 ret <4 x i64> %shuffle
169 define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
170 ; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
171 ; AVX1: # %bb.0: # %entry
172 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
173 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
174 ; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
175 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
176 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
179 ; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
180 ; AVX2: # %bb.0: # %entry
181 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
182 ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
183 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
186 ; add forces execution domain
187 %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
188 %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
189 ret <8 x i32> %shuffle
192 define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
193 ; AVX1-LABEL: shuffle_v16i16_4501:
194 ; AVX1: # %bb.0: # %entry
195 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
196 ; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
197 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
200 ; AVX2-LABEL: shuffle_v16i16_4501:
201 ; AVX2: # %bb.0: # %entry
202 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
203 ; AVX2-NEXT: vpsubw %xmm2, %xmm0, %xmm0
204 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
207 ; add forces execution domain
208 %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
209 %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
210 ret <16 x i16> %shuffle
213 define <16 x i16> @shuffle_v16i16_4501_mem(ptr %a, ptr %b) nounwind uwtable readnone ssp {
214 ; AVX1-LABEL: shuffle_v16i16_4501_mem:
215 ; AVX1: # %bb.0: # %entry
216 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
217 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
218 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
219 ; AVX1-NEXT: vperm2f128 $2, (%rsi), %ymm0, %ymm0 # ymm0 = mem[0,1],ymm0[0,1]
222 ; AVX2-LABEL: shuffle_v16i16_4501_mem:
223 ; AVX2: # %bb.0: # %entry
224 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
225 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
226 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
227 ; AVX2-NEXT: vperm2i128 $2, (%rsi), %ymm0, %ymm0 # ymm0 = mem[0,1],ymm0[0,1]
230 %c = load <16 x i16>, ptr %a
231 %d = load <16 x i16>, ptr %b
232 %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
233 %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
234 ret <16 x i16> %shuffle
237 ;;;; Cases with undef indices mixed in the mask
239 define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
240 ; ALL-LABEL: shuffle_v8f32_uu67u9ub:
241 ; ALL: # %bb.0: # %entry
242 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
245 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
246 ret <8 x float> %shuffle
249 define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
250 ; AVX1-LABEL: shuffle_v8f32_uu67uu67:
251 ; AVX1: # %bb.0: # %entry
252 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
255 ; AVX2-LABEL: shuffle_v8f32_uu67uu67:
256 ; AVX2: # %bb.0: # %entry
257 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
260 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
261 ret <8 x float> %shuffle
264 define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
265 ; ALL-LABEL: shuffle_v8f32_uu67uuab:
266 ; ALL: # %bb.0: # %entry
267 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
270 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
271 ret <8 x float> %shuffle
274 define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
275 ; ALL-LABEL: shuffle_v8f32_uu67uuef:
276 ; ALL: # %bb.0: # %entry
277 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
280 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
281 ret <8 x float> %shuffle
284 define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
285 ; AVX1-LABEL: shuffle_v8f32_uu674567:
286 ; AVX1: # %bb.0: # %entry
287 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
290 ; AVX2-LABEL: shuffle_v8f32_uu674567:
291 ; AVX2: # %bb.0: # %entry
292 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
295 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
296 ret <8 x float> %shuffle
299 define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
300 ; ALL-LABEL: shuffle_v8f32_uu6789ab:
301 ; ALL: # %bb.0: # %entry
302 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
305 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
306 ret <8 x float> %shuffle
309 define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
310 ; AVX1-LABEL: shuffle_v8f32_4567uu67:
311 ; AVX1: # %bb.0: # %entry
312 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
315 ; AVX2-LABEL: shuffle_v8f32_4567uu67:
316 ; AVX2: # %bb.0: # %entry
317 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
320 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
321 ret <8 x float> %shuffle
324 define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
325 ; ALL-LABEL: shuffle_v8f32_4567uuef:
326 ; ALL: # %bb.0: # %entry
327 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
330 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
331 ret <8 x float> %shuffle
334 ;;;; Cases where vperm2f128 alone must not be selected (an extra in-lane shuffle is needed)
336 define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
337 ; ALL-LABEL: shuffle_v8f32_uu67ucuf:
338 ; ALL: # %bb.0: # %entry
339 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
340 ; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
343 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
344 ret <8 x float> %shuffle
347 ;; Test zero mask generation.
348 ;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
349 ;; Prefer xor+blend over vperm2f128 when the kept half stays in its own lane, because
350 ;; that has better performance, unless building for optsize/PGSO where we should still use vperm2f128.
352 define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
353 ; ALL-LABEL: shuffle_v4f64_zz01:
355 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
357 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
360 define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
361 ; ALL-LABEL: shuffle_v4f64_zz01_optsize:
363 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
365 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
369 define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
370 ; ALL-LABEL: shuffle_v4f64_zz23:
372 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
373 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
375 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
378 define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
379 ; ALL-LABEL: shuffle_v4f64_zz23_optsize:
381 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
383 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
386 define <4 x double> @shuffle_v4f64_zz23_pgso(<4 x double> %a) !prof !14 {
387 ; ALL-LABEL: shuffle_v4f64_zz23_pgso:
389 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
391 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
395 define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
396 ; ALL-LABEL: shuffle_v4f64_zz45:
398 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
400 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
403 define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
404 ; ALL-LABEL: shuffle_v4f64_zz45_optsize:
406 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
408 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
412 define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
413 ; ALL-LABEL: shuffle_v4f64_zz67:
415 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
416 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
418 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
421 define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
422 ; ALL-LABEL: shuffle_v4f64_zz67_optsize:
424 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
426 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
429 define <4 x double> @shuffle_v4f64_zz67_pgso(<4 x double> %a) !prof !14 {
430 ; ALL-LABEL: shuffle_v4f64_zz67_pgso:
432 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
434 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
438 define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
439 ; ALL-LABEL: shuffle_v4f64_01zz:
441 ; ALL-NEXT: vmovaps %xmm0, %xmm0
443 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
446 define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
447 ; ALL-LABEL: shuffle_v4f64_01zz_optsize:
449 ; ALL-NEXT: vmovaps %xmm0, %xmm0
451 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
455 define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
456 ; ALL-LABEL: shuffle_v4f64_23zz:
458 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
460 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
463 define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
464 ; ALL-LABEL: shuffle_v4f64_23zz_optsize:
466 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
468 %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
472 define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
473 ; ALL-LABEL: shuffle_v4f64_45zz:
475 ; ALL-NEXT: vmovaps %xmm0, %xmm0
477 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
480 define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
481 ; ALL-LABEL: shuffle_v4f64_45zz_optsize:
483 ; ALL-NEXT: vmovaps %xmm0, %xmm0
485 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
489 define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
490 ; ALL-LABEL: shuffle_v4f64_67zz:
492 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
494 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
497 define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
498 ; ALL-LABEL: shuffle_v4f64_67zz_optsize:
500 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
502 %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
506 ;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
508 define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
509 ; AVX1-LABEL: shuffle_v4i64_67zz:
511 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
512 ; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
513 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
516 ; AVX2-LABEL: shuffle_v4i64_67zz:
518 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
519 ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
521 %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
522 %c = add <4 x i64> %b, %s
526 ;;; Memory folding cases
528 define <4 x double> @ld0_hi0_lo1_4f64(ptr %pa, <4 x double> %b) nounwind uwtable readnone ssp {
529 ; AVX1-LABEL: ld0_hi0_lo1_4f64:
530 ; AVX1: # %bb.0: # %entry
531 ; AVX1-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
532 ; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
535 ; AVX2-LABEL: ld0_hi0_lo1_4f64:
536 ; AVX2: # %bb.0: # %entry
537 ; AVX2-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
538 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
539 ; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
542 %a = load <4 x double>, ptr %pa
543 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
544 %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
545 ret <4 x double> %res
548 define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, ptr %pb) nounwind uwtable readnone ssp {
549 ; AVX1-LABEL: ld1_hi0_hi1_4f64:
550 ; AVX1: # %bb.0: # %entry
551 ; AVX1-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
552 ; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
555 ; AVX2-LABEL: ld1_hi0_hi1_4f64:
556 ; AVX2: # %bb.0: # %entry
557 ; AVX2-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
558 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
559 ; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
562 %b = load <4 x double>, ptr %pb
563 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
564 %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
565 ret <4 x double> %res
568 define <8 x float> @ld0_hi0_lo1_8f32(ptr %pa, <8 x float> %b) nounwind uwtable readnone ssp {
569 ; AVX1-LABEL: ld0_hi0_lo1_8f32:
570 ; AVX1: # %bb.0: # %entry
571 ; AVX1-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
572 ; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
575 ; AVX2-LABEL: ld0_hi0_lo1_8f32:
576 ; AVX2: # %bb.0: # %entry
577 ; AVX2-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
578 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
579 ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
582 %a = load <8 x float>, ptr %pa
583 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
584 %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
588 define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, ptr %pb) nounwind uwtable readnone ssp {
589 ; AVX1-LABEL: ld1_hi0_hi1_8f32:
590 ; AVX1: # %bb.0: # %entry
591 ; AVX1-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
592 ; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
595 ; AVX2-LABEL: ld1_hi0_hi1_8f32:
596 ; AVX2: # %bb.0: # %entry
597 ; AVX2-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
598 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
599 ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
602 %b = load <8 x float>, ptr %pb
603 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
604 %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
608 define <4 x i64> @ld0_hi0_lo1_4i64(ptr %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
609 ; AVX1-LABEL: ld0_hi0_lo1_4i64:
610 ; AVX1: # %bb.0: # %entry
611 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
612 ; AVX1-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
613 ; AVX1-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
614 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
617 ; AVX2-LABEL: ld0_hi0_lo1_4i64:
618 ; AVX2: # %bb.0: # %entry
619 ; AVX2-NEXT: vperm2i128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
620 ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
623 %a = load <4 x i64>, ptr %pa
624 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
625 %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
629 define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, ptr %pb) nounwind uwtable readnone ssp {
630 ; AVX1-LABEL: ld1_hi0_hi1_4i64:
631 ; AVX1: # %bb.0: # %entry
632 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
633 ; AVX1-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
634 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
635 ; AVX1-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
636 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
639 ; AVX2-LABEL: ld1_hi0_hi1_4i64:
640 ; AVX2: # %bb.0: # %entry
641 ; AVX2-NEXT: vperm2i128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
642 ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
645 %b = load <4 x i64>, ptr %pb
646 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
647 %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
651 define <8 x i32> @ld0_hi0_lo1_8i32(ptr %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
652 ; AVX1-LABEL: ld0_hi0_lo1_8i32:
653 ; AVX1: # %bb.0: # %entry
654 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,3,4]
655 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
656 ; AVX1-NEXT: vpaddd 16(%rdi), %xmm1, %xmm1
657 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
660 ; AVX2-LABEL: ld0_hi0_lo1_8i32:
661 ; AVX2: # %bb.0: # %entry
662 ; AVX2-NEXT: vperm2i128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
663 ; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
666 %a = load <8 x i32>, ptr %pa
667 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
668 %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
672 define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, ptr %pb) nounwind uwtable readnone ssp {
673 ; AVX1-LABEL: ld1_hi0_hi1_8i32:
674 ; AVX1: # %bb.0: # %entry
675 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,3,4]
676 ; AVX1-NEXT: vpaddd 16(%rdi), %xmm1, %xmm2
677 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
678 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
679 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
682 ; AVX2-LABEL: ld1_hi0_hi1_8i32:
683 ; AVX2: # %bb.0: # %entry
684 ; AVX2-NEXT: vperm2i128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
685 ; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
688 %b = load <8 x i32>, ptr %pb
689 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
690 %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
694 define void @PR50053(ptr nocapture %0, ptr nocapture readonly %1) {
695 ; ALL-LABEL: PR50053:
697 ; ALL-NEXT: vmovaps (%rsi), %ymm0
698 ; ALL-NEXT: vinsertf128 $1, 32(%rsi), %ymm0, %ymm1
699 ; ALL-NEXT: vinsertf128 $0, 48(%rsi), %ymm0, %ymm0
700 ; ALL-NEXT: vmovaps %ymm1, (%rdi)
701 ; ALL-NEXT: vmovaps %ymm0, 32(%rdi)
702 ; ALL-NEXT: vzeroupper
704 %3 = load <4 x i64>, ptr %1, align 32
705 %4 = getelementptr inbounds <4 x i64>, ptr %1, i64 1
706 %5 = load <2 x i64>, ptr %4, align 16
707 %6 = getelementptr inbounds <2 x i64>, ptr %4, i64 1
708 %7 = load <2 x i64>, ptr %6, align 16
709 %8 = shufflevector <2 x i64> %5, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
710 %9 = shufflevector <4 x i64> %3, <4 x i64> %8, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
711 store <4 x i64> %9, ptr %0, align 32
712 %10 = shufflevector <2 x i64> %7, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
713 %11 = shufflevector <4 x i64> %10, <4 x i64> %3, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
714 %12 = getelementptr inbounds <4 x i64>, ptr %0, i64 1
715 store <4 x i64> %11, ptr %12, align 32
719 !llvm.module.flags = !{!0}
720 !0 = !{i32 1, !"ProfileSummary", !1}
721 !1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
722 !2 = !{!"ProfileFormat", !"InstrProf"}
723 !3 = !{!"TotalCount", i64 10000}
724 !4 = !{!"MaxCount", i64 10}
725 !5 = !{!"MaxInternalCount", i64 1}
726 !6 = !{!"MaxFunctionCount", i64 1000}
727 !7 = !{!"NumCounts", i64 3}
728 !8 = !{!"NumFunctions", i64 3}
729 !9 = !{!"DetailedSummary", !10}
730 !10 = !{!11, !12, !13}
731 !11 = !{i32 10000, i64 100, i32 1}
732 !12 = !{i32 999000, i64 100, i32 1}
733 !13 = !{i32 999999, i64 1, i32 2}
734 !14 = !{!"function_entry_count", i64 0}