; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
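
; These tests check that a shuffle which unpacks the low and high halves
; extracted from a single wide vector is lowered to one 128-bit unpck
; (or a single cross-lane vpermpd for 64-bit elements on AVX2 and newer),
; rather than separately materializing both extracted halves.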
define <2 x i64> @unpckh_unary_extracted_v4i64(<4 x i64> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v4i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %extrl = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extrh = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %r = shufflevector <2 x i64> %extrl, <2 x i64> %extrh, <2 x i32> <i32 1, i32 3>
  ret <2 x i64> %r
}

define <2 x double> @unpckh_unary_extracted_v8f64(<4 x double> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8f64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %extrl = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extrh = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %r = shufflevector <2 x double> %extrl, <2 x double> %extrh, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %r
}

; vpermps requires a constant load for the index operand, so it is unlikely to be profitable here.
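; For reference, a hypothetical vpermps lowering (assembly sketch, not
; generated code) would look roughly like:
;   vmovaps  index_constant(%rip), %ymm1   # e.g. <2,6,3,7,u,u,u,u>
;   vpermps  %ymm0, %ymm1, %ymm0
; i.e. it trades the extract+unpck pair for one shuffle plus a constant-pool load.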
define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
; ALL-LABEL: unpckh_unary_extracted_v8i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x i32> %r
}

define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) {
; ALL-LABEL: unpckh_unary_extracted_v8f32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %r
}

define <8 x i16> @unpckh_unary_extracted_v16i16(<16 x i16> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v16i16:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %extrl = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %extrh = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %r = shufflevector <8 x i16> %extrl, <8 x i16> %extrh, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <8 x i16> %r
}

define <16 x i8> @unpckh_unary_extracted_v32i8(<32 x i8> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v32i8:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %extrl = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %extrh = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %r = shufflevector <16 x i8> %extrl, <16 x i8> %extrh, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i8> %r
}

define <2 x i64> @unpckl_unary_extracted_v4i64(<4 x i64> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v4i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %extrl = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extrh = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %r = shufflevector <2 x i64> %extrl, <2 x i64> %extrh, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %r
}

define <2 x double> @unpckl_unary_extracted_v8f64(<4 x double> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8f64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %extrl = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extrh = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %r = shufflevector <2 x double> %extrl, <2 x double> %extrh, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %r
}

; vpermps requires a constant load for the index operand, so it is unlikely to be profitable here.
define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
; ALL-LABEL: unpckl_unary_extracted_v8i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x i32> %r
}

define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) {
; ALL-LABEL: unpckl_unary_extracted_v8f32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %r
}

define <8 x i16> @unpckl_unary_extracted_v16i16(<16 x i16> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v16i16:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %extrl = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %extrh = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %r = shufflevector <8 x i16> %extrl, <8 x i16> %extrh, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i16> %r
}

define <16 x i8> @unpckl_unary_extracted_v32i8(<32 x i8> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v32i8:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %extrl = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %extrh = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %r = shufflevector <16 x i8> %extrl, <16 x i8> %extrh, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  ret <16 x i8> %r
}

; This would loop infinitely because we did not recognize the unpack shuffle mask in its commuted form.
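; (The mask below selects elements <4, undef, 5, 1>, i.e. an unpcklps of the
; two 128-bit halves with the operands swapped; the expected output keeps that
; commuted xmm1[0],xmm0[0] ordering.)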
define <8 x i32> @extract_unpckl_v8i32(<8 x i32> %a) {
; ALL-LABEL: extract_unpckl_v8i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; ALL-NEXT:    retq
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i32> %shuffle
}