1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
; Unpack-high of the two 128-bit halves of a single 256-bit source: %extrl and
; %extrh select the low and high <2 x i64> halves of %x, and %r takes the upper
; element of each half (mask <1, 3>). For the +avx run the checks expect
; vextractf128 followed by vunpckhpd; for the AVX2 and AVX-512VL runs they
; expect a single vpermpd on the ymm register instead.
10 define <2 x i64> @unpckh_unary_extracted_v4i64(<4 x i64> %x) {
11 ; AVX1-LABEL: unpckh_unary_extracted_v4i64:
13 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
14 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
15 ; AVX1-NEXT: vzeroupper
18 ; AVX2OR512VL-LABEL: unpckh_unary_extracted_v4i64:
19 ; AVX2OR512VL: # %bb.0:
20 ; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
21 ; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
22 ; AVX2OR512VL-NEXT: vzeroupper
23 ; AVX2OR512VL-NEXT: retq
24 %extrl = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
25 %extrh = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
26 %r = shufflevector <2 x i64> %extrl, <2 x i64> %extrh, <2 x i32> <i32 1, i32 3>
; Floating-point counterpart of the <4 x i64> unpack-high test: the low and
; high <2 x double> halves of %x are extracted and their upper elements are
; combined (mask <1, 3>). For the +avx run the checks expect vextractf128
; followed by vunpckhpd; for the AVX2 and AVX-512VL runs a single vpermpd.
; NOTE(review): the function name says v8f64 but the operand type is
; <4 x double>; presumably v4f64 was intended - confirm before renaming, since
; the LABEL checks depend on the name.
30 define <2 x double> @unpckh_unary_extracted_v8f64(<4 x double> %x) {
31 ; AVX1-LABEL: unpckh_unary_extracted_v8f64:
33 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
34 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
35 ; AVX1-NEXT: vzeroupper
38 ; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8f64:
39 ; AVX2OR512VL: # %bb.0:
40 ; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
41 ; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
42 ; AVX2OR512VL-NEXT: vzeroupper
43 ; AVX2OR512VL-NEXT: retq
44 %extrl = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1>
45 %extrh = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3>
46 %r = shufflevector <2 x double> %extrl, <2 x double> %extrh, <2 x i32> <i32 1, i32 3>
50 ; vpermps requires a constant load for its index operand, so it is unlikely to be profitable.
; Unpack-high of the two <4 x i32> halves of %x: %r interleaves elements 2 and
; 3 of the low half with elements 2 and 3 of the high half (mask <2, 6, 3, 7>).
; The checks use the prefix shared by every run configuration and expect
; vextractf128 followed by vunpckhps in all of them.
52 define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
53 ; ALL-LABEL: unpckh_unary_extracted_v8i32:
55 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
56 ; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
57 ; ALL-NEXT: vzeroupper
59 %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
60 %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
61 %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
; Unpack-high of the two <4 x float> halves of %x: %r interleaves elements 2
; and 3 of the low half with elements 2 and 3 of the high half
; (mask <2, 6, 3, 7>). The checks use the prefix shared by every run
; configuration and expect vextractf128 followed by vunpckhps in all of them.
65 define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) {
66 ; ALL-LABEL: unpckh_unary_extracted_v8f32:
68 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
69 ; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
70 ; ALL-NEXT: vzeroupper
72 %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
73 %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
74 %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
; Unpack-high of the two <8 x i16> halves of %x: %r interleaves elements 4-7 of
; the low half with elements 4-7 of the high half. Both check prefixes expect
; an extract of the upper 128 bits followed by vpunpckhwd; they differ only in
; the extract instruction (vextractf128 for the +avx run, vextracti128 for the
; AVX2 and AVX-512VL runs).
78 define <8 x i16> @unpckh_unary_extracted_v16i16(<16 x i16> %x) {
79 ; AVX1-LABEL: unpckh_unary_extracted_v16i16:
81 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
82 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
83 ; AVX1-NEXT: vzeroupper
86 ; AVX2OR512VL-LABEL: unpckh_unary_extracted_v16i16:
87 ; AVX2OR512VL: # %bb.0:
88 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
89 ; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
90 ; AVX2OR512VL-NEXT: vzeroupper
91 ; AVX2OR512VL-NEXT: retq
92 %extrl = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
93 %extrh = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
94 %r = shufflevector <8 x i16> %extrl, <8 x i16> %extrh, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
; Unpack-high of the two <16 x i8> halves of %x: %r interleaves elements 8-15
; of the low half with elements 8-15 of the high half. Both check prefixes
; expect an extract of the upper 128 bits followed by vpunpckhbw; they differ
; only in the extract instruction (vextractf128 for the +avx run, vextracti128
; for the AVX2 and AVX-512VL runs).
98 define <16 x i8> @unpckh_unary_extracted_v32i8(<32 x i8> %x) {
99 ; AVX1-LABEL: unpckh_unary_extracted_v32i8:
101 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
102 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
103 ; AVX1-NEXT: vzeroupper
106 ; AVX2OR512VL-LABEL: unpckh_unary_extracted_v32i8:
107 ; AVX2OR512VL: # %bb.0:
108 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
109 ; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
110 ; AVX2OR512VL-NEXT: vzeroupper
111 ; AVX2OR512VL-NEXT: retq
112 %extrl = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
113 %extrh = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
114 %r = shufflevector <16 x i8> %extrl, <16 x i8> %extrh, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
; Unpack-low of the two 128-bit halves of a single 256-bit source: %extrl and
; %extrh select the low and high <2 x i64> halves of %x, and %r takes the lower
; element of each half (mask <0, 2>). For the +avx run the checks expect
; vextractf128 followed by vmovlhps; for the AVX2 and AVX-512VL runs they
; expect a single vpermpd on the ymm register instead.
118 define <2 x i64> @unpckl_unary_extracted_v4i64(<4 x i64> %x) {
119 ; AVX1-LABEL: unpckl_unary_extracted_v4i64:
121 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
122 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
123 ; AVX1-NEXT: vzeroupper
126 ; AVX2OR512VL-LABEL: unpckl_unary_extracted_v4i64:
127 ; AVX2OR512VL: # %bb.0:
128 ; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
129 ; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
130 ; AVX2OR512VL-NEXT: vzeroupper
131 ; AVX2OR512VL-NEXT: retq
132 %extrl = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
133 %extrh = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
134 %r = shufflevector <2 x i64> %extrl, <2 x i64> %extrh, <2 x i32> <i32 0, i32 2>
; Floating-point counterpart of the <4 x i64> unpack-low test: the low and high
; <2 x double> halves of %x are extracted and their lower elements are combined
; (mask <0, 2>). For the +avx run the checks expect vextractf128 followed by
; vmovlhps; for the AVX2 and AVX-512VL runs a single vpermpd.
; NOTE(review): the function name says v8f64 but the operand type is
; <4 x double>; presumably v4f64 was intended - confirm before renaming, since
; the LABEL checks depend on the name.
138 define <2 x double> @unpckl_unary_extracted_v8f64(<4 x double> %x) {
139 ; AVX1-LABEL: unpckl_unary_extracted_v8f64:
141 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
142 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
143 ; AVX1-NEXT: vzeroupper
146 ; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8f64:
147 ; AVX2OR512VL: # %bb.0:
148 ; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
149 ; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
150 ; AVX2OR512VL-NEXT: vzeroupper
151 ; AVX2OR512VL-NEXT: retq
152 %extrl = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1>
153 %extrh = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3>
154 %r = shufflevector <2 x double> %extrl, <2 x double> %extrh, <2 x i32> <i32 0, i32 2>
158 ; vpermps requires a constant load for its index operand, so it is unlikely to be profitable.
; Unpack-low of the two <4 x i32> halves of %x: %r interleaves elements 0 and 1
; of the low half with elements 0 and 1 of the high half (mask <0, 4, 1, 5>).
; The checks use the prefix shared by every run configuration and expect
; vextractf128 followed by vunpcklps in all of them.
160 define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
161 ; ALL-LABEL: unpckl_unary_extracted_v8i32:
163 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
164 ; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
165 ; ALL-NEXT: vzeroupper
167 %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
168 %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
169 %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
; Unpack-low of the two <4 x float> halves of %x: %r interleaves elements 0 and
; 1 of the low half with elements 0 and 1 of the high half (mask <0, 4, 1, 5>).
; The checks use the prefix shared by every run configuration and expect
; vextractf128 followed by vunpcklps in all of them.
173 define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) {
174 ; ALL-LABEL: unpckl_unary_extracted_v8f32:
176 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
177 ; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
178 ; ALL-NEXT: vzeroupper
180 %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
181 %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
182 %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
; Unpack-low of the two <8 x i16> halves of %x: %r interleaves elements 0-3 of
; the low half with elements 0-3 of the high half. Both check prefixes expect
; an extract of the upper 128 bits followed by vpunpcklwd; they differ only in
; the extract instruction (vextractf128 for the +avx run, vextracti128 for the
; AVX2 and AVX-512VL runs).
186 define <8 x i16> @unpckl_unary_extracted_v16i16(<16 x i16> %x) {
187 ; AVX1-LABEL: unpckl_unary_extracted_v16i16:
189 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
190 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
191 ; AVX1-NEXT: vzeroupper
194 ; AVX2OR512VL-LABEL: unpckl_unary_extracted_v16i16:
195 ; AVX2OR512VL: # %bb.0:
196 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
197 ; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
198 ; AVX2OR512VL-NEXT: vzeroupper
199 ; AVX2OR512VL-NEXT: retq
200 %extrl = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
201 %extrh = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
202 %r = shufflevector <8 x i16> %extrl, <8 x i16> %extrh, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
; Unpack-low of the two <16 x i8> halves of %x: %r interleaves elements 0-7 of
; the low half with elements 0-7 of the high half. Both check prefixes expect
; an extract of the upper 128 bits followed by vpunpcklbw; they differ only in
; the extract instruction (vextractf128 for the +avx run, vextracti128 for the
; AVX2 and AVX-512VL runs).
206 define <16 x i8> @unpckl_unary_extracted_v32i8(<32 x i8> %x) {
207 ; AVX1-LABEL: unpckl_unary_extracted_v32i8:
209 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
210 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
211 ; AVX1-NEXT: vzeroupper
214 ; AVX2OR512VL-LABEL: unpckl_unary_extracted_v32i8:
215 ; AVX2OR512VL: # %bb.0:
216 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
217 ; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
218 ; AVX2OR512VL-NEXT: vzeroupper
219 ; AVX2OR512VL-NEXT: retq
220 %extrl = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
221 %extrh = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
222 %r = shufflevector <16 x i8> %extrl, <16 x i8> %extrh, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
226 ; This used to loop infinitely because the unpack shuffle mask was not recognized in commuted form.
; A single 256-bit shuffle of %a whose first four result elements are
; <4, undef, 5, 1> and whose remaining four are undef. The checks use the
; prefix shared by every run configuration and expect vextractf128 of the upper
; half followed by vunpcklps with that upper half as the first operand
; (xmm1[0],xmm0[0],xmm1[1],xmm0[1]), i.e. the unpack-low pattern with its
; inputs swapped.
228 define <8 x i32> @extract_unpckl_v8i32(<8 x i32> %a) {
229 ; ALL-LABEL: extract_unpckl_v8i32:
231 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
232 ; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
234 %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
235 ret <8 x i32> %shuffle