; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI

define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2OR512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2OR512VL-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX1-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX2-SLOW-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX2-SLOW-NEXT: retq
; AVX2-FAST-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX2-FAST-NEXT: retq
; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512VL-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX2-SLOW-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX2-SLOW-NEXT: retq
; AVX2-FAST-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX2-FAST-NEXT: retq
; AVX512VL-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
; AVX512VL-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_0101010101010101:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-LABEL: shuffle_v16i8_0101010101010101:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2OR512VL-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSSE3-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE41-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512VLBW-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512VLBW-NEXT: retq
; AVX512VLVBMI-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,0,17,0,18,0,19,0,20,0,21,0,22,0,23]
; AVX512VLVBMI-NEXT: vpermi2b %xmm0, %xmm1, %xmm2
; AVX512VLVBMI-NEXT: vmovdqa %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; AVX1OR2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; AVX512VLBW-NEXT: retq
; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,19,18,17,16,23,22,21,20]
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1OR2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; AVX1OR2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VLBW-NEXT: retq
; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,31,30,29,28,11,10,9,8,23,22,21,20]
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: orps %xmm2, %xmm0
; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; AVX1OR2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; AVX512VL-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: orps %xmm2, %xmm0
; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; AVX512VL-NEXT: movw $-30584, %ax # imm = 0x8888
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: orps %xmm2, %xmm0
; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; AVX512VL-NEXT: movw $-28528, %ax # imm = 0x9090
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; SSE2-NEXT: andps %xmm2, %xmm1
; SSE2-NEXT: andnps %xmm0, %xmm2
; SSE2-NEXT: orps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; AVX1OR2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; AVX512VL-NEXT: movw $-21264, %ax # imm = 0xACF0
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 13, i32 30, i32 15>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,255]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,65535]
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,5,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
; SSE2-NEXT: packuswb %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSSE3-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
; SSE41-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
; AVX1OR2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
; AVX512VLBW-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
; AVX512VLBW-NEXT: retq
; AVX512VLVBMI-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [5,6,7,8,9,10,27,28,29,30,30,1,1,2,3,4]
; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
; AVX512VLVBMI-NEXT: retq
  %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 30, i32 1, i32 1, i32 2, i32 3, i32 4>
  ret <16 x i8> %1
}

; PR27780 - https://bugs.llvm.org/show_bug.cgi?id=27780

define <16 x i8> @load_fold_pblendvb(<16 x i8>* %px, <16 x i8> %y) {
; SSE2-LABEL: load_fold_pblendvb:
; SSE2-NEXT: movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: andnps (%rdi), %xmm1
; SSE2-NEXT: orps %xmm1, %xmm0
; SSSE3-LABEL: load_fold_pblendvb:
; SSSE3-NEXT: movdqa (%rdi), %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3],zero,zero,zero,xmm0[7,8,9],zero,xmm0[11],zero,zero,zero,xmm0[15]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[2],zero,xmm1[4,5,6],zero,zero,zero,xmm1[10],zero,xmm1[12,13,14],zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSE41-LABEL: load_fold_pblendvb:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
; SSE41-NEXT: pblendvb %xmm0, (%rdi), %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; AVX1OR2-LABEL: load_fold_pblendvb:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
; AVX512VL-LABEL: load_fold_pblendvb:
; AVX512VL-NEXT: movw $29812, %ax # imm = 0x7474
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1}
; AVX512VL-NEXT: retq
  %x = load <16 x i8>, <16 x i8>* %px, align 16
  %select = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
  ret <16 x i8> %select
}

define <16 x i8> @load_fold_pblendvb_commute(<16 x i8>* %px, <16 x i8> %y) {
; SSE2-LABEL: load_fold_pblendvb_commute:
; SSE2-NEXT: movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: andnps %xmm0, %xmm2
; SSE2-NEXT: andps (%rdi), %xmm1
; SSE2-NEXT: orps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSSE3-LABEL: load_fold_pblendvb_commute:
; SSSE3-NEXT: movdqa (%rdi), %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[2],zero,xmm0[4,5,6],zero,zero,zero,xmm0[10],zero,xmm0[12,13,14],zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3],zero,zero,zero,xmm1[7,8,9],zero,xmm1[11],zero,zero,zero,xmm1[15]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSE41-LABEL: load_fold_pblendvb_commute:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; SSE41-NEXT: pblendvb %xmm0, (%rdi), %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; AVX1OR2-LABEL: load_fold_pblendvb_commute:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; AVX1OR2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
; AVX512VL-LABEL: load_fold_pblendvb_commute:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT: movw $29812, %ax # imm = 0x7474
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VL-NEXT: retq
  %x = load <16 x i8>, <16 x i8>* %px, align 16
  %select = shufflevector <16 x i8> %y, <16 x i8> %x, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
  ret <16 x i8> %select
}

define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
; SSE2-LABEL: trunc_v4i32_shuffle:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSSE3-LABEL: trunc_v4i32_shuffle:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-LABEL: trunc_v4i32_shuffle:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-LABEL: trunc_v4i32_shuffle:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %shuffle
}

define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) {
; We don't have anything useful to check here. This generates 100s of
; instructions. Instead, just make sure we survived codegen.
; ALL-LABEL: stress_test0:
  %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6>
  %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28>
  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8>
  %s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29>
  %s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29>
  %s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17>
  %s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23>
  %s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17>
  %s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
  %s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10>
  ret <16 x i8> %s.16.0
}

define <16 x i8> @undef_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
; There is nothing interesting to check about these instructions other than
; that they survive codegen. However, we actually do better and delete all of
; them because the result is 'undef'.
; ALL-LABEL: undef_test1:
; ALL: # %bb.0: # %entry
entry:
  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0>
  %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22>
  %s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9>
  %s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11>
  %s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29>
  %s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef>
  %s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 9, i32 17, i32 10>
  %s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef>
  %s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5>
  %s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %s.12.4
}

define <16 x i8> @PR20540(<8 x i8> %a) {
; SSE-LABEL: PR20540:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-LABEL: PR20540:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
  %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE-NEXT: movzbl %dil, %eax
; SSE-NEXT: movd %eax, %xmm0
; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX-NEXT: movzbl %dil, %eax
; AVX-NEXT: vmovd %eax, %xmm0
  %a = insertelement <16 x i8> undef, i8 %i, i32 0
  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE2-NEXT: shll $8, %edi
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $2, %edi, %xmm0
; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSSE3-NEXT: shll $8, %edi
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $2, %edi, %xmm0
; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $5, %edi, %xmm0
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
  %a = insertelement <16 x i8> undef, i8 %i, i32 0
  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE2-NEXT: shll $8, %edi
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $7, %edi, %xmm0
; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSSE3-NEXT: shll $8, %edi
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $7, %edi, %xmm0
; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $15, %edi, %xmm0
; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
  %a = insertelement <16 x i8> undef, i8 %i, i32 0
  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE2-NEXT: movzbl %dil, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $1, %eax, %xmm0
; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSSE3-NEXT: movzbl %dil, %eax
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0
  %a = insertelement <16 x i8> undef, i8 %i, i32 3
  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE2-NEXT: por %xmm1, %xmm0
; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE2-NEXT: por %xmm1, %xmm0
; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT: por %xmm1, %xmm0
; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
; SSE2-NEXT: por %xmm1, %xmm0
; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT: por %xmm1, %xmm0
; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
  ret <16 x i8> %shuffle
}

define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE2-NEXT: por %xmm1, %xmm0
; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
  ret <16 x i8> %shuffle
}

1158 define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(<16 x i8> %val1, <16 x i8> %val2) {
1159 ; SSE-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1161 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1162 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1165 ; AVX1OR2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1167 ; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1168 ; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1169 ; AVX1OR2-NEXT: retq
1171 ; AVX512VLBW-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1172 ; AVX512VLBW: # %bb.0:
1173 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1174 ; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1175 ; AVX512VLBW-NEXT: retq
1177 ; AVX512VLVBMI-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1178 ; AVX512VLVBMI: # %bb.0:
1179 ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,1,17,4,20,5,21,2,18,3,19,6,22,7,23]
1180 ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
1181 ; AVX512VLVBMI-NEXT: retq
1182 %shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23>
1183 ret <16 x i8> %shuffle
1186 define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) {
1187 ; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1189 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1190 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1191 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1194 ; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1196 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,u,u,u,u,u,u,1,u,u,u,u,u,u,u]
1199 ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1201 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1204 ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1206 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1208 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1209 ret <16 x i8> %shuffle
1212 define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
1213 ; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1215 ; SSE2-NEXT: pxor %xmm1, %xmm1
1216 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1217 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1218 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1221 ; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1223 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1226 ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1228 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1231 ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1233 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1235 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1236 ret <16 x i8> %shuffle
1239 define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) {
1240 ; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1242 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1243 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1246 ; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1248 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1249 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1252 ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1254 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1257 ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1259 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1261 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
1262 ret <16 x i8> %shuffle
1265 define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
1266 ; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1268 ; SSE2-NEXT: pxor %xmm1, %xmm1
1269 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1270 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1273 ; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1275 ; SSSE3-NEXT: pxor %xmm1, %xmm1
1276 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1277 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1280 ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1282 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1285 ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1287 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1289 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
1290 ret <16 x i8> %shuffle
1293 define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
1294 ; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1296 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1299 ; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1301 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1304 ; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1306 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1309 ; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1311 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1313 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
1314 ret <16 x i8> %shuffle
1317 define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
1318 ; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1320 ; SSE2-NEXT: pxor %xmm1, %xmm1
1321 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1324 ; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1326 ; SSSE3-NEXT: pxor %xmm1, %xmm1
1327 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1330 ; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1332 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1335 ; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1337 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1339 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
1340 ret <16 x i8> %shuffle
1343 define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) {
1344 ; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1345 ; SSE2: # %bb.0: # %entry
1346 ; SSE2-NEXT: pxor %xmm2, %xmm2
1347 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1348 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1349 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[1,3,2,0,4,5,6,7]
1350 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1]
1351 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
1352 ; SSE2-NEXT: pand %xmm5, %xmm4
1353 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1354 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,0,1]
1355 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
1356 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
1357 ; SSE2-NEXT: pandn %xmm2, %xmm5
1358 ; SSE2-NEXT: por %xmm4, %xmm5
1359 ; SSE2-NEXT: psrlq $16, %xmm0
1360 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
1361 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,1,3]
1362 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
1363 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4]
1364 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
1365 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1366 ; SSE2-NEXT: packuswb %xmm5, %xmm2
1367 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1368 ; SSE2-NEXT: pand %xmm0, %xmm2
1369 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
1370 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1371 ; SSE2-NEXT: pandn %xmm1, %xmm0
1372 ; SSE2-NEXT: por %xmm2, %xmm0
1375 ; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1376 ; SSSE3: # %bb.0: # %entry
1377 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1378 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1379 ; SSSE3-NEXT: por %xmm1, %xmm0
1382 ; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1383 ; SSE41: # %bb.0: # %entry
1384 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1385 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1386 ; SSE41-NEXT: por %xmm1, %xmm0
1389 ; AVX1OR2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1390 ; AVX1OR2: # %bb.0: # %entry
1391 ; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1392 ; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1393 ; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
1394 ; AVX1OR2-NEXT: retq
1396 ; AVX512VLBW-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1397 ; AVX512VLBW: # %bb.0: # %entry
1398 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1399 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1400 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1401 ; AVX512VLBW-NEXT: retq
1403 ; AVX512VLVBMI-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1404 ; AVX512VLVBMI: # %bb.0: # %entry
1405 ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = <u,10,2,7,22,14,7,2,18,3,1,14,18,9,11,0>
1406 ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
1407 ; AVX512VLVBMI-NEXT: retq
1409 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
1411 ret <16 x i8> %shuffle
1414 define <16 x i8> @shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30(<8 x i16> %a0, <8 x i16> %a1) {
1415 ; SSE-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1417 ; SSE-NEXT: psrlw $8, %xmm0
1418 ; SSE-NEXT: psrlw $8, %xmm1
1419 ; SSE-NEXT: packuswb %xmm1, %xmm0
1422 ; AVX1OR2-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1424 ; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0
1425 ; AVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1
1426 ; AVX1OR2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1427 ; AVX1OR2-NEXT: retq
1429 ; AVX512VLBW-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1430 ; AVX512VLBW: # %bb.0:
1431 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
1432 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
1433 ; AVX512VLBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1434 ; AVX512VLBW-NEXT: retq
1436 ; AVX512VLVBMI-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1437 ; AVX512VLVBMI: # %bb.0:
1438 ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1439 ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
1440 ; AVX512VLVBMI-NEXT: retq
1441 %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1442 %2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1443 %3 = bitcast <8 x i16> %1 to <16 x i8>
1444 %4 = bitcast <8 x i16> %2 to <16 x i8>
1445 %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
1446 ret <16 x i8> %5
1449 define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) {
1450 ; Nothing interesting to test here. Just make sure we didn't crash.
1451 ; ALL-LABEL: stress_test2:
1454 %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5>
1455 %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22>
1456 %s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19>
1458 ret <16 x i8> %s.2.0
1461 define void @constant_gets_selected(<4 x i32>* %ptr1, <4 x i32>* %ptr2) {
1462 ; SSE-LABEL: constant_gets_selected:
1463 ; SSE: # %bb.0: # %entry
1464 ; SSE-NEXT: xorps %xmm0, %xmm0
1465 ; SSE-NEXT: movaps %xmm0, (%rdi)
1466 ; SSE-NEXT: movaps %xmm0, (%rsi)
1469 ; AVX-LABEL: constant_gets_selected:
1470 ; AVX: # %bb.0: # %entry
1471 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1472 ; AVX-NEXT: vmovaps %xmm0, (%rdi)
1473 ; AVX-NEXT: vmovaps %xmm0, (%rsi)
1476 %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8>
1477 %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
1478 %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32>
1479 store <4 x i32> %weirder_zero, <4 x i32>* %ptr1, align 16
1480 store <4 x i32> zeroinitializer, <4 x i32>* %ptr2, align 16
1481 ret void
1485 ; Shuffle to logical bit shifts
1488 define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) {
1489 ; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1491 ; SSE-NEXT: psllw $8, %xmm0
1494 ; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1496 ; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
1498 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
1499 ret <16 x i8> %shuffle
1502 define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) {
1503 ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1505 ; SSE-NEXT: pslld $24, %xmm0
1508 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1510 ; AVX-NEXT: vpslld $24, %xmm0, %xmm0
1512 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
1513 ret <16 x i8> %shuffle
1516 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) {
1517 ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
1519 ; SSE-NEXT: psllq $56, %xmm0
1522 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
1524 ; AVX-NEXT: vpsllq $56, %xmm0, %xmm0
1526 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8>
1527 ret <16 x i8> %shuffle
1530 define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
1531 ; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
1533 ; SSE-NEXT: psllq $8, %xmm0
1536 ; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
1538 ; AVX-NEXT: vpsllq $8, %xmm0, %xmm0
1540 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 undef, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 16, i32 8, i32 9, i32 undef, i32 11, i32 12, i32 13, i32 14>
1541 ret <16 x i8> %shuffle
1544 define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) {
1545 ; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
1547 ; SSE-NEXT: psrlw $8, %xmm0
1550 ; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
1552 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
1554 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 undef, i32 16, i32 undef, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
1555 ret <16 x i8> %shuffle
1558 define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) {
1559 ; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
1561 ; SSE-NEXT: psrld $16, %xmm0
1564 ; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
1566 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
1568 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 15, i32 16, i32 16>
1569 ret <16 x i8> %shuffle
1572 define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) {
1573 ; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
1575 ; SSE-NEXT: psrlq $56, %xmm0
1578 ; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
1580 ; AVX-NEXT: vpsrlq $56, %xmm0, %xmm0
1582 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16>
1583 ret <16 x i8> %shuffle
1586 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
1587 ; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
1589 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
1590 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1591 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1594 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
1596 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero
1599 ; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
1601 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero
1604 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
1606 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero
1608 %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1609 ret <16 x i8> %shuffle
1612 define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
1613 ; SSE-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1615 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
1616 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1619 ; AVX1-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1621 ; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
1622 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1625 ; AVX2-SLOW-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1626 ; AVX2-SLOW: # %bb.0:
1627 ; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
1628 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1629 ; AVX2-SLOW-NEXT: retq
1631 ; AVX2-FAST-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1632 ; AVX2-FAST: # %bb.0:
1633 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1634 ; AVX2-FAST-NEXT: retq
1636 ; AVX512VL-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1637 ; AVX512VL: # %bb.0:
1638 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1639 ; AVX512VL-NEXT: retq
1640 %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1641 ret <16 x i8> %shuffle
1644 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06(<16 x i8> %a) {
1645 ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
1647 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
1648 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
1651 ; AVX1-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
1653 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
1654 ; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
1657 ; AVX2-SLOW-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
1658 ; AVX2-SLOW: # %bb.0:
1659 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
1660 ; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
1661 ; AVX2-SLOW-NEXT: retq
1663 ; AVX2-FAST-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
1664 ; AVX2-FAST: # %bb.0:
1665 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6]
1666 ; AVX2-FAST-NEXT: retq
1668 ; AVX512VL-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
1669 ; AVX512VL: # %bb.0:
1670 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6]
1671 ; AVX512VL-NEXT: retq
1672 %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
1673 ret <16 x i8> %shuffle
1676 define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
1677 ; SSE2-LABEL: PR12412:
1678 ; SSE2: # %bb.0: # %entry
1679 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1680 ; SSE2-NEXT: pand %xmm2, %xmm1
1681 ; SSE2-NEXT: pand %xmm2, %xmm0
1682 ; SSE2-NEXT: packuswb %xmm1, %xmm0
1685 ; SSSE3-LABEL: PR12412:
1686 ; SSSE3: # %bb.0: # %entry
1687 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1688 ; SSSE3-NEXT: pshufb %xmm2, %xmm1
1689 ; SSSE3-NEXT: pshufb %xmm2, %xmm0
1690 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1693 ; SSE41-LABEL: PR12412:
1694 ; SSE41: # %bb.0: # %entry
1695 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1696 ; SSE41-NEXT: pshufb %xmm2, %xmm1
1697 ; SSE41-NEXT: pshufb %xmm2, %xmm0
1698 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1701 ; AVX1OR2-LABEL: PR12412:
1702 ; AVX1OR2: # %bb.0: # %entry
1703 ; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1704 ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1705 ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1706 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1707 ; AVX1OR2-NEXT: retq
1709 ; AVX512VLBW-LABEL: PR12412:
1710 ; AVX512VLBW: # %bb.0: # %entry
1711 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1712 ; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1713 ; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1714 ; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1715 ; AVX512VLBW-NEXT: retq
1717 ; AVX512VLVBMI-LABEL: PR12412:
1718 ; AVX512VLVBMI: # %bb.0: # %entry
1719 ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
1720 ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
1721 ; AVX512VLVBMI-NEXT: retq
1723 %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
1724 ret <16 x i8> %0
1727 define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(<16 x i8> %a) {
1728 ; SSE-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
1730 ; SSE-NEXT: psrld $8, %xmm0
1733 ; AVX-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
1735 ; AVX-NEXT: vpsrld $8, %xmm0, %xmm0
1737 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16>
1738 ret <16 x i8> %shuffle
1741 define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
1742 ; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
1744 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1747 ; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
1749 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1751 %shuffle8 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
1752 %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float>
1753 %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1754 %bitcast16 = bitcast <4 x float> %shuffle32 to <8 x i16>
1755 %shuffle16 = shufflevector <8 x i16> %bitcast16, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
1756 %bitcast8 = bitcast <8 x i16> %shuffle16 to <16 x i8>
1757 ret <16 x i8> %bitcast8
1760 define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
1761 ; SSE2-LABEL: insert_dup_mem_v16i8_i32:
1763 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1764 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1765 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
1766 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1769 ; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
1771 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1772 ; SSSE3-NEXT: pxor %xmm1, %xmm1
1773 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
1776 ; SSE41-LABEL: insert_dup_mem_v16i8_i32:
1778 ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1779 ; SSE41-NEXT: pxor %xmm1, %xmm1
1780 ; SSE41-NEXT: pshufb %xmm1, %xmm0
1783 ; AVX1-LABEL: insert_dup_mem_v16i8_i32:
1785 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1786 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1787 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1790 ; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_i32:
1791 ; AVX2OR512VL: # %bb.0:
1792 ; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
1793 ; AVX2OR512VL-NEXT: retq
1794 %tmp = load i32, i32* %ptr, align 4
1795 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
1796 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
1797 %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
1798 ret <16 x i8> %tmp3
1801 define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
1802 ; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8:
1804 ; SSE2-NEXT: movzbl (%rdi), %eax
1805 ; SSE2-NEXT: movd %eax, %xmm0
1806 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1807 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
1808 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1811 ; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
1813 ; SSSE3-NEXT: movzbl (%rdi), %eax
1814 ; SSSE3-NEXT: movd %eax, %xmm0
1815 ; SSSE3-NEXT: pxor %xmm1, %xmm1
1816 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
1819 ; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8:
1821 ; SSE41-NEXT: movzbl (%rdi), %eax
1822 ; SSE41-NEXT: movd %eax, %xmm0
1823 ; SSE41-NEXT: pxor %xmm1, %xmm1
1824 ; SSE41-NEXT: pshufb %xmm1, %xmm0
1827 ; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8:
1829 ; AVX1-NEXT: movzbl (%rdi), %eax
1830 ; AVX1-NEXT: vmovd %eax, %xmm0
1831 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1832 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1835 ; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_sext_i8:
1836 ; AVX2OR512VL: # %bb.0:
1837 ; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
1838 ; AVX2OR512VL-NEXT: retq
1839 %tmp = load i8, i8* %ptr, align 1
1840 %tmp1 = sext i8 %tmp to i32
1841 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
1842 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
1843 %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer
1844 ret <16 x i8> %tmp4
1847 define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
1848 ; SSE2-LABEL: insert_dup_elt1_mem_v16i8_i32:
1850 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1851 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1852 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
1853 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1856 ; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
1858 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1859 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1862 ; SSE41-LABEL: insert_dup_elt1_mem_v16i8_i32:
1864 ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1865 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1868 ; AVX1-LABEL: insert_dup_elt1_mem_v16i8_i32:
1870 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1871 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1874 ; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i8_i32:
1875 ; AVX2OR512VL: # %bb.0:
1876 ; AVX2OR512VL-NEXT: vpbroadcastb 1(%rdi), %xmm0
1877 ; AVX2OR512VL-NEXT: retq
1878 %tmp = load i32, i32* %ptr, align 4
1879 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
1880 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
1881 %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1882 ret <16 x i8> %tmp3
1885 define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
1886 ; SSE2-LABEL: insert_dup_elt2_mem_v16i8_i32:
1888 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1889 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1890 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
1891 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1894 ; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
1896 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1897 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1900 ; SSE41-LABEL: insert_dup_elt2_mem_v16i8_i32:
1902 ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1903 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1906 ; AVX1-LABEL: insert_dup_elt2_mem_v16i8_i32:
1908 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1909 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1912 ; AVX2OR512VL-LABEL: insert_dup_elt2_mem_v16i8_i32:
1913 ; AVX2OR512VL: # %bb.0:
1914 ; AVX2OR512VL-NEXT: vpbroadcastb 2(%rdi), %xmm0
1915 ; AVX2OR512VL-NEXT: retq
1916 %tmp = load i32, i32* %ptr, align 4
1917 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
1918 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
1919 %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
1920 ret <16 x i8> %tmp3
1923 define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
1924 ; SSE2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1926 ; SSE2-NEXT: movsbl (%rdi), %eax
1927 ; SSE2-NEXT: movd %eax, %xmm0
1928 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1929 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
1930 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1933 ; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1935 ; SSSE3-NEXT: movsbl (%rdi), %eax
1936 ; SSSE3-NEXT: movd %eax, %xmm0
1937 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1940 ; SSE41-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1942 ; SSE41-NEXT: movsbl (%rdi), %eax
1943 ; SSE41-NEXT: movd %eax, %xmm0
1944 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1947 ; AVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1949 ; AVX1-NEXT: movsbl (%rdi), %eax
1950 ; AVX1-NEXT: vmovd %eax, %xmm0
1951 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1954 ; AVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1956 ; AVX2-NEXT: movsbl (%rdi), %eax
1957 ; AVX2-NEXT: shrl $8, %eax
1958 ; AVX2-NEXT: vmovd %eax, %xmm0
1959 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
1962 ; AVX512VL-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
1963 ; AVX512VL: # %bb.0:
1964 ; AVX512VL-NEXT: movsbl (%rdi), %eax
1965 ; AVX512VL-NEXT: shrl $8, %eax
1966 ; AVX512VL-NEXT: vpbroadcastb %eax, %xmm0
1967 ; AVX512VL-NEXT: retq
1968 %tmp = load i8, i8* %ptr, align 1
1969 %tmp1 = sext i8 %tmp to i32
1970 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
1971 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
1972 %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1973 ret <16 x i8> %tmp4
1976 define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
1977 ; SSE2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1979 ; SSE2-NEXT: movsbl (%rdi), %eax
1980 ; SSE2-NEXT: movd %eax, %xmm0
1981 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1982 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
1983 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1986 ; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1988 ; SSSE3-NEXT: movsbl (%rdi), %eax
1989 ; SSSE3-NEXT: movd %eax, %xmm0
1990 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
1993 ; SSE41-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
1995 ; SSE41-NEXT: movsbl (%rdi), %eax
1996 ; SSE41-NEXT: movd %eax, %xmm0
1997 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2000 ; AVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
2002 ; AVX1-NEXT: movsbl (%rdi), %eax
2003 ; AVX1-NEXT: vmovd %eax, %xmm0
2004 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2007 ; AVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
2009 ; AVX2-NEXT: movsbl (%rdi), %eax
2010 ; AVX2-NEXT: shrl $16, %eax
2011 ; AVX2-NEXT: vmovd %eax, %xmm0
2012 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
2015 ; AVX512VL-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
2016 ; AVX512VL: # %bb.0:
2017 ; AVX512VL-NEXT: movsbl (%rdi), %eax
2018 ; AVX512VL-NEXT: shrl $16, %eax
2019 ; AVX512VL-NEXT: vpbroadcastb %eax, %xmm0
2020 ; AVX512VL-NEXT: retq
2021 %tmp = load i8, i8* %ptr, align 1
2022 %tmp1 = sext i8 %tmp to i32
2023 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
2024 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
2025 %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2026 ret <16 x i8> %tmp4
2029 define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) {
2030 ; SSE2-LABEL: PR31364:
2032 ; SSE2-NEXT: movzbl (%rdi), %eax
2033 ; SSE2-NEXT: movzbl (%rsi), %ecx
2034 ; SSE2-NEXT: shll $8, %ecx
2035 ; SSE2-NEXT: orl %eax, %ecx
2036 ; SSE2-NEXT: movd %ecx, %xmm1
2037 ; SSE2-NEXT: pxor %xmm0, %xmm0
2038 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2039 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,3,4,5,6,7]
2040 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
2041 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2042 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
2043 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4]
2044 ; SSE2-NEXT: packuswb %xmm1, %xmm0
2047 ; SSSE3-LABEL: PR31364:
2049 ; SSSE3-NEXT: movzbl (%rdi), %eax
2050 ; SSSE3-NEXT: movzbl (%rsi), %ecx
2051 ; SSSE3-NEXT: shll $8, %ecx
2052 ; SSSE3-NEXT: orl %eax, %ecx
2053 ; SSSE3-NEXT: movd %ecx, %xmm0
2054 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
2057 ; SSE41-LABEL: PR31364:
2059 ; SSE41-NEXT: pxor %xmm0, %xmm0
2060 ; SSE41-NEXT: pinsrb $0, (%rdi), %xmm0
2061 ; SSE41-NEXT: pinsrb $1, (%rsi), %xmm0
2062 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
2065 ; AVX-LABEL: PR31364:
2067 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
2068 ; AVX-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0
2069 ; AVX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0
2070 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
2072 %v0 = load i8, i8* %a, align 1
2073 %vecins = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %v0, i32 0
2074 %v1 = load i8, i8* %b, align 1
2075 %vecins2 = insertelement <16 x i8> %vecins, i8 %v1, i32 1
2076 %result = shufflevector <16 x i8> %vecins2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 3, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
2077 ret <16 x i8> %result
2080 define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y) {
2081 ; SSE2-LABEL: PR31301:
2082 ; SSE2: # %bb.0: # %entry
2083 ; SSE2-NEXT: movzbl (%rdi), %eax
2084 ; SSE2-NEXT: movd %eax, %xmm0
2085 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2086 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
2087 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2088 ; SSE2-NEXT: movzbl (%rsi), %eax
2089 ; SSE2-NEXT: movd %eax, %xmm1
2090 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2091 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
2092 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
2093 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2096 ; SSSE3-LABEL: PR31301:
2097 ; SSSE3: # %bb.0: # %entry
2098 ; SSSE3-NEXT: movzbl (%rdi), %eax
2099 ; SSSE3-NEXT: movd %eax, %xmm0
2100 ; SSSE3-NEXT: pxor %xmm1, %xmm1
2101 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
2102 ; SSSE3-NEXT: movzbl (%rsi), %eax
2103 ; SSSE3-NEXT: movd %eax, %xmm2
2104 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
2105 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2108 ; SSE41-LABEL: PR31301:
2109 ; SSE41: # %bb.0: # %entry
2110 ; SSE41-NEXT: movzbl (%rdi), %eax
2111 ; SSE41-NEXT: movd %eax, %xmm0
2112 ; SSE41-NEXT: pxor %xmm1, %xmm1
2113 ; SSE41-NEXT: pshufb %xmm1, %xmm0
2114 ; SSE41-NEXT: movzbl (%rsi), %eax
2115 ; SSE41-NEXT: movd %eax, %xmm2
2116 ; SSE41-NEXT: pshufb %xmm1, %xmm2
2117 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2120 ; AVX1-LABEL: PR31301:
2121 ; AVX1: # %bb.0: # %entry
2122 ; AVX1-NEXT: movzbl (%rdi), %eax
2123 ; AVX1-NEXT: vmovd %eax, %xmm0
2124 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2125 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
2126 ; AVX1-NEXT: movzbl (%rsi), %eax
2127 ; AVX1-NEXT: vmovd %eax, %xmm2
2128 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
2129 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2132 ; AVX2OR512VL-LABEL: PR31301:
2133 ; AVX2OR512VL: # %bb.0: # %entry
2134 ; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
2135 ; AVX2OR512VL-NEXT: vpbroadcastb (%rsi), %xmm1
2136 ; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2137 ; AVX2OR512VL-NEXT: retq
2139 %0 = load i8, i8* %x, align 1
2140 %1 = insertelement <16 x i8> undef, i8 %0, i32 0
2141 %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2142 %2 = load i8, i8* %y, align 1
2143 %3 = insertelement <16 x i8> undef, i8 %2, i32 0
2144 %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2145 %vzip.i = shufflevector <16 x i8> %lane, <16 x i8> %lane3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
2146 ret <16 x i8> %vzip.i