; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=ALL,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLVBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLVBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,XOP,XOPAVX2
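
; These tests exercise lowering of <16 x i8> shufflevector patterns (splats,
; unpacks/interleaves, per-byte blends, reversals, and truncating shuffles)
; across the SSE2/SSSE3/SSE4.1, AVX/AVX2, AVX-512VL(+BW/+VBMI), and XOP
; configurations listed in the RUN lines above.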
16 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
17 ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
19 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
20 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
21 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
24 ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
26 ; SSSE3-NEXT: pxor %xmm1, %xmm1
27 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
30 ; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
32 ; SSE41-NEXT: pxor %xmm1, %xmm1
33 ; SSE41-NEXT: pshufb %xmm1, %xmm0
36 ; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
38 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
39 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
42 ; AVX2OR512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
43 ; AVX2OR512VL: # %bb.0:
44 ; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %xmm0
45 ; AVX2OR512VL-NEXT: retq
47 ; XOPAVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
49 ; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
50 ; XOPAVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
53 ; XOPAVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
55 ; XOPAVX2-NEXT: vpbroadcastb %xmm0, %xmm0
57 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <16 x i8> %shuffle
}

61 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
62 ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
64 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
65 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
66 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
69 ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
71 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
74 ; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
76 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
79 ; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
81 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
83 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i8> %shuffle
}

87 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
88 ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
90 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
91 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
92 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
93 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
94 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
97 ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
99 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
102 ; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
104 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
107 ; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
109 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
111 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x i8> %shuffle
}

115 define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
116 ; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
118 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
119 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
122 ; AVX1-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
124 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
125 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
128 ; AVX2-SLOW-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
129 ; AVX2-SLOW: # %bb.0:
130 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
131 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
132 ; AVX2-SLOW-NEXT: retq
134 ; AVX2-FAST-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
135 ; AVX2-FAST: # %bb.0:
136 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
137 ; AVX2-FAST-NEXT: retq
139 ; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
141 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
142 ; AVX512VL-NEXT: retq
144 ; XOP-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
146 ; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
147 ; XOP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
149 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
  ret <16 x i8> %shuffle
}

153 define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
154 ; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
156 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
157 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
160 ; AVX1-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
162 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
163 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
166 ; AVX2-SLOW-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
167 ; AVX2-SLOW: # %bb.0:
168 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
169 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
170 ; AVX2-SLOW-NEXT: retq
172 ; AVX2-FAST-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
173 ; AVX2-FAST: # %bb.0:
174 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
175 ; AVX2-FAST-NEXT: retq
177 ; AVX512VL-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
179 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
180 ; AVX512VL-NEXT: retq
182 ; XOP-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
184 ; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
185 ; XOP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
187 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
  ret <16 x i8> %shuffle
}

191 define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
192 ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
194 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
195 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
196 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
197 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
198 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
199 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
202 ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
204 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
207 ; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
209 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
212 ; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
214 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
216 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
  ret <16 x i8> %shuffle
}

220 define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
221 ; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
223 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
226 ; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
228 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
230 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  ret <16 x i8> %shuffle
}

234 define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
235 ; SSE-LABEL: shuffle_v16i8_0101010101010101:
237 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
238 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
241 ; AVX1-LABEL: shuffle_v16i8_0101010101010101:
243 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
244 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
247 ; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101:
248 ; AVX2OR512VL: # %bb.0:
249 ; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %xmm0
250 ; AVX2OR512VL-NEXT: retq
252 ; XOPAVX1-LABEL: shuffle_v16i8_0101010101010101:
254 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
255 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
258 ; XOPAVX2-LABEL: shuffle_v16i8_0101010101010101:
260 ; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0
262 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i8> %shuffle
}

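; Interleaving the low or high eight bytes of the two inputs lowers directly to
; punpcklbw / punpckhbw (vpunpcklbw / vpunpckhbw with AVX), as the next tests show.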
266 define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
267 ; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
269 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
272 ; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
274 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
276 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  ret <16 x i8> %shuffle
}

280 define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) {
281 ; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
283 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
286 ; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
288 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
290 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i8> %shuffle
}

294 define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
295 ; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
297 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
298 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
299 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
300 ; SSE-NEXT: movdqa %xmm1, %xmm0
303 ; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
305 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
306 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
307 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
310 ; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
311 ; AVX2OR512VL: # %bb.0:
312 ; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
313 ; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
314 ; AVX2OR512VL-NEXT: retq
316 ; XOPAVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
318 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[0],xmm0[1],xmm1[0],xmm0[2],xmm1[0],xmm0[3],xmm1[0],xmm0[4],xmm1[0],xmm0[5],xmm1[0],xmm0[6],xmm1[0],xmm0[7]
321 ; XOPAVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
323 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
324 ; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
326 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
  ret <16 x i8> %shuffle
}

330 define <16 x i8> @shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<16 x i8> %a, <16 x i8> %b) {
331 ; SSE2-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
333 ; SSE2-NEXT: pxor %xmm1, %xmm1
334 ; SSE2-NEXT: movdqa %xmm0, %xmm2
335 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
336 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
337 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
338 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
339 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
340 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
341 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
342 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
343 ; SSE2-NEXT: packuswb %xmm2, %xmm0
346 ; SSSE3-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
348 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
351 ; SSE41-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
353 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
356 ; AVX-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
358 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
360 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <16 x i8> %shuffle
}

364 define <16 x i8> @shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08(<16 x i8> %a, <16 x i8> %b) {
365 ; SSE2-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08:
367 ; SSE2-NEXT: pxor %xmm1, %xmm1
368 ; SSE2-NEXT: movdqa %xmm0, %xmm2
369 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
370 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
371 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
372 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
373 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
374 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
375 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
376 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
377 ; SSE2-NEXT: packuswb %xmm2, %xmm0
380 ; SSSE3-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08:
382 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
385 ; SSE41-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08:
387 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
390 ; AVX-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08:
392 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
394 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %shuffle
}

398 define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
399 ; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
401 ; SSE2-NEXT: pxor %xmm1, %xmm1
402 ; SSE2-NEXT: movdqa %xmm0, %xmm2
403 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
404 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
405 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
406 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
407 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
408 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
409 ; SSE2-NEXT: packuswb %xmm2, %xmm0
412 ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
414 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
417 ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
419 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
422 ; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
424 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
426 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %shuffle
}

430 define <16 x i8> @shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14(<16 x i8> %a, <16 x i8> %b) {
431 ; SSE2-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
433 ; SSE2-NEXT: movdqa %xmm0, %xmm1
434 ; SSE2-NEXT: psrlw $8, %xmm1
435 ; SSE2-NEXT: psllw $8, %xmm0
436 ; SSE2-NEXT: por %xmm1, %xmm0
439 ; SSSE3-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
441 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
444 ; SSE41-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
446 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
449 ; AVX1-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
451 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
454 ; AVX2OR512VL-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
455 ; AVX2OR512VL: # %bb.0:
456 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
457 ; AVX2OR512VL-NEXT: retq
459 ; XOP-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
461 ; XOP-NEXT: vprotw $8, %xmm0, %xmm0
463 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %shuffle
}

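; Two-input byte permutes: SSSE3 and later unpack the inputs and pshufb the result,
; SSE2 falls back to word shuffles plus packuswb, AVX-512VBMI folds the whole pattern
; into a single vpermt2b, and XOP selects vpperm.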
467 define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
468 ; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
470 ; SSE2-NEXT: pxor %xmm2, %xmm2
471 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
472 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
473 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
474 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
475 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
476 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
477 ; SSE2-NEXT: packuswb %xmm1, %xmm0
480 ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
482 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
483 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
486 ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
488 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
489 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
492 ; AVX1-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
494 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
495 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
498 ; AVX2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
500 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
501 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
504 ; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
505 ; AVX512VLBW: # %bb.0:
506 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
507 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
508 ; AVX512VLBW-NEXT: retq
510 ; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
511 ; AVX512VLVBMI: # %bb.0:
512 ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,19,18,17,16,23,22,21,20]
513 ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
514 ; AVX512VLVBMI-NEXT: retq
516 ; XOP-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
518 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],xmm1[3,2,1,0,7,6,5,4]
520 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
  ret <16 x i8> %shuffle
}

524 define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
525 ; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
527 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
528 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
529 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
530 ; SSE2-NEXT: pxor %xmm1, %xmm1
531 ; SSE2-NEXT: movdqa %xmm0, %xmm2
532 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
533 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7]
534 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
535 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
536 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
537 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,2,1,0,4,5,6,7]
538 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,6,5,4]
539 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
540 ; SSE2-NEXT: packuswb %xmm1, %xmm0
543 ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
545 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
546 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
547 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
550 ; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
552 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
553 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
554 ; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
557 ; AVX1-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
559 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
560 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
561 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
564 ; AVX2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
566 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
567 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
568 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
571 ; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
572 ; AVX512VLBW: # %bb.0:
573 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
574 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
575 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
576 ; AVX512VLBW-NEXT: retq
578 ; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
579 ; AVX512VLVBMI: # %bb.0:
580 ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,31,30,29,28,11,10,9,8,23,22,21,20]
581 ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
582 ; AVX512VLVBMI-NEXT: retq
584 ; XOP-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
586 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[3,2,1,0],xmm1[15,14,13,12],xmm0[11,10,9,8],xmm1[7,6,5,4]
588 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
  ret <16 x i8> %shuffle
}

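; Per-byte blends of the two inputs: SSE2 uses an and/andn/or mask sequence,
; SSE4.1/AVX select pblendvb with a constant mask, and AVX-512VL uses a
; predicated vmovdqu8/vpblendmb under a kmov'd bitmask.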
592 define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) {
593 ; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
595 ; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
596 ; SSE2-NEXT: andps %xmm2, %xmm0
597 ; SSE2-NEXT: andnps %xmm1, %xmm2
598 ; SSE2-NEXT: orps %xmm2, %xmm0
601 ; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
603 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
604 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
605 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
608 ; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
610 ; SSE41-NEXT: movdqa %xmm0, %xmm2
611 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
612 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
613 ; SSE41-NEXT: movdqa %xmm1, %xmm0
616 ; AVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
618 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
619 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
622 ; AVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
624 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
625 ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
628 ; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
630 ; AVX512VL-NEXT: movw $-21846, %ax # imm = 0xAAAA
631 ; AVX512VL-NEXT: kmovd %eax, %k1
632 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
633 ; AVX512VL-NEXT: retq
635 ; XOPAVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
637 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
638 ; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
641 ; XOPAVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
643 ; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
644 ; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
646 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

650 define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) {
651 ; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
653 ; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
654 ; SSE2-NEXT: andps %xmm2, %xmm0
655 ; SSE2-NEXT: andnps %xmm1, %xmm2
656 ; SSE2-NEXT: orps %xmm2, %xmm0
659 ; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
661 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15]
662 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero
663 ; SSSE3-NEXT: por %xmm1, %xmm0
666 ; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
668 ; SSE41-NEXT: movdqa %xmm0, %xmm2
669 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
670 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
671 ; SSE41-NEXT: movdqa %xmm1, %xmm0
674 ; AVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
676 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
677 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
680 ; AVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
682 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
683 ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
686 ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
688 ; AVX512VL-NEXT: movw $-30584, %ax # imm = 0x8888
689 ; AVX512VL-NEXT: kmovd %eax, %k1
690 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
691 ; AVX512VL-NEXT: retq
693 ; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
695 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
696 ; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
699 ; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
701 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
702 ; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
704 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

708 define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(<16 x i8> %a) {
709 ; SSE-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
711 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
714 ; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
716 ; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
718 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

722 define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, <16 x i8> %b) {
723 ; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
725 ; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
726 ; SSE2-NEXT: andps %xmm2, %xmm0
727 ; SSE2-NEXT: andnps %xmm1, %xmm2
728 ; SSE2-NEXT: orps %xmm2, %xmm0
731 ; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
733 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
734 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
735 ; SSSE3-NEXT: por %xmm1, %xmm0
738 ; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
740 ; SSE41-NEXT: movdqa %xmm0, %xmm2
741 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
742 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
743 ; SSE41-NEXT: movdqa %xmm1, %xmm0
746 ; AVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
748 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
749 ; AVX1-NEXT: # xmm2 = mem[0,0]
750 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
753 ; AVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
755 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
756 ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
759 ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
761 ; AVX512VL-NEXT: movw $-28528, %ax # imm = 0x9090
762 ; AVX512VL-NEXT: kmovd %eax, %k1
763 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
764 ; AVX512VL-NEXT: retq
766 ; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
768 ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
769 ; XOPAVX1-NEXT: # xmm2 = mem[0,0]
770 ; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
773 ; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
775 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
776 ; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
778 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
  ret <16 x i8> %shuffle
}

782 define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) {
783 ; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
785 ; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
786 ; SSE2-NEXT: andps %xmm2, %xmm1
787 ; SSE2-NEXT: andnps %xmm0, %xmm2
788 ; SSE2-NEXT: orps %xmm1, %xmm2
789 ; SSE2-NEXT: movaps %xmm2, %xmm0
792 ; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
794 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15]
795 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero
796 ; SSSE3-NEXT: por %xmm1, %xmm0
799 ; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
801 ; SSE41-NEXT: movdqa %xmm0, %xmm2
802 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
803 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
804 ; SSE41-NEXT: movdqa %xmm2, %xmm0
807 ; AVX1OR2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
809 ; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
810 ; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
813 ; AVX512VL-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
815 ; AVX512VL-NEXT: movw $-21264, %ax # imm = 0xACF0
816 ; AVX512VL-NEXT: kmovd %eax, %k1
817 ; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
818 ; AVX512VL-NEXT: retq
819 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 13, i32 30, i32 15>
  ret <16 x i8> %shuffle
}

823 define <16 x i8> @shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a, <16 x i8> %b) {
824 ; SSE2-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
826 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
827 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
828 ; SSE2-NEXT: psrlq $16, %xmm0
829 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
830 ; SSE2-NEXT: packuswb %xmm0, %xmm0
833 ; SSSE3-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
835 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
836 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
839 ; SSE41-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
841 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
842 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
845 ; AVX1-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
847 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
848 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
851 ; AVX2-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
853 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
854 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
857 ; AVX512VLBW-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
858 ; AVX512VLBW: # %bb.0:
859 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
860 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
861 ; AVX512VLBW-NEXT: retq
863 ; AVX512VLVBMI-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
864 ; AVX512VLVBMI: # %bb.0:
865 ; AVX512VLVBMI-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,20,2,20,2,20,2,20,2,20,2,20,2,20,2,20]
866 ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
867 ; AVX512VLVBMI-NEXT: retq
869 ; XOP-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
871 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[2],xmm1[4],xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
873 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 20, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %shuffle
}

878 define <16 x i8> @shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4(<16 x i8> %a, <16 x i8> %b) {
879 ; SSE2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
881 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,255]
882 ; SSE2-NEXT: movdqa %xmm0, %xmm3
883 ; SSE2-NEXT: pand %xmm2, %xmm3
884 ; SSE2-NEXT: pandn %xmm1, %xmm2
885 ; SSE2-NEXT: por %xmm3, %xmm2
886 ; SSE2-NEXT: pxor %xmm1, %xmm1
887 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
888 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
889 ; SSE2-NEXT: movdqa %xmm0, %xmm1
890 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,65535]
891 ; SSE2-NEXT: pand %xmm3, %xmm0
892 ; SSE2-NEXT: pandn %xmm2, %xmm3
893 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
894 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
895 ; SSE2-NEXT: por %xmm2, %xmm1
896 ; SSE2-NEXT: por %xmm0, %xmm3
897 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,0]
898 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,5,7]
899 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
900 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,3,4,5,6,7]
901 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
902 ; SSE2-NEXT: packuswb %xmm0, %xmm1
903 ; SSE2-NEXT: movdqa %xmm1, %xmm0
906 ; SSSE3-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
908 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
909 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
912 ; SSE41-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
914 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
915 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
918 ; AVX1-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
920 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
921 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
924 ; AVX2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
926 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
927 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
930 ; AVX512VLBW-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
931 ; AVX512VLBW: # %bb.0:
932 ; AVX512VLBW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
933 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
934 ; AVX512VLBW-NEXT: retq
936 ; AVX512VLVBMI-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
937 ; AVX512VLVBMI: # %bb.0:
938 ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [5,6,7,8,9,10,27,28,29,30,30,1,1,2,3,4]
939 ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
940 ; AVX512VLVBMI-NEXT: retq
942 ; XOP-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
944 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],xmm1[11,12,13,14,14],xmm0[1,1,2,3,4]
  %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 30, i32 1, i32 1, i32 2, i32 3, i32 4>
  ret <16 x i8> %1
}

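; Extracting the even or odd bytes of both inputs is a trunc-style shuffle:
; SSE masks or shifts each input and packs with packuswb, SSSE3 targets use
; pshufb + punpcklqdq, AVX-512VL can use vpmovwb for the even-byte case, and
; XOP handles it with a single vpperm.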
950 define <16 x i8> @shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30(<16 x i8> %a, <16 x i8> %b) {
951 ; SSE-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
953 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
954 ; SSE-NEXT: pand %xmm2, %xmm1
955 ; SSE-NEXT: pand %xmm2, %xmm0
956 ; SSE-NEXT: packuswb %xmm1, %xmm0
959 ; AVX1-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
961 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
962 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
963 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
964 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
967 ; AVX2-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
969 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
970 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
971 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
972 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
975 ; AVX512VL-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
977 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
978 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
979 ; AVX512VL-NEXT: vpmovwb %ymm0, %xmm0
980 ; AVX512VL-NEXT: vzeroupper
981 ; AVX512VL-NEXT: retq
983 ; XOP-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
985 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
987 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  ret <16 x i8> %shuffle
}

991 define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(<16 x i8> %a, <16 x i8> %b) {
992 ; SSE2-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
994 ; SSE2-NEXT: psrlw $8, %xmm1
995 ; SSE2-NEXT: psrlw $8, %xmm0
996 ; SSE2-NEXT: packuswb %xmm1, %xmm0
999 ; SSSE3-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
1001 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
1002 ; SSSE3-NEXT: pshufb %xmm2, %xmm1
1003 ; SSSE3-NEXT: pshufb %xmm2, %xmm0
1004 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1007 ; SSE41-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
1009 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
1010 ; SSE41-NEXT: pshufb %xmm2, %xmm1
1011 ; SSE41-NEXT: pshufb %xmm2, %xmm0
1012 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1015 ; AVX1-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
1017 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
1018 ; AVX1-NEXT: # xmm2 = mem[0,0]
1019 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1020 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1021 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1024 ; AVX2-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
1026 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
1027 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1028 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1029 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1032 ; AVX512VLBW-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
1033 ; AVX512VLBW: # %bb.0:
1034 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
1035 ; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1036 ; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1037 ; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1038 ; AVX512VLBW-NEXT: retq
1040 ; AVX512VLVBMI-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
1041 ; AVX512VLVBMI: # %bb.0:
1042 ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1043 ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
1044 ; AVX512VLVBMI-NEXT: retq
1046 ; XOP-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
1048 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
1050 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  ret <16 x i8> %shuffle
}

1054 ; PR27780 - https://bugs.llvm.org/show_bug.cgi?id=27780
1056 define <16 x i8> @load_fold_pblendvb(ptr %px, <16 x i8> %y) {
1057 ; SSE2-LABEL: load_fold_pblendvb:
1059 ; SSE2-NEXT: movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
1060 ; SSE2-NEXT: andps %xmm1, %xmm0
1061 ; SSE2-NEXT: andnps (%rdi), %xmm1
1062 ; SSE2-NEXT: orps %xmm1, %xmm0
1065 ; SSSE3-LABEL: load_fold_pblendvb:
1067 ; SSSE3-NEXT: movdqa (%rdi), %xmm1
1068 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3],zero,zero,zero,xmm0[7,8,9],zero,xmm0[11],zero,zero,zero,xmm0[15]
1069 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[2],zero,xmm1[4,5,6],zero,zero,zero,xmm1[10],zero,xmm1[12,13,14],zero
1070 ; SSSE3-NEXT: por %xmm1, %xmm0
1073 ; SSE41-LABEL: load_fold_pblendvb:
1075 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1076 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
1077 ; SSE41-NEXT: pblendvb %xmm0, (%rdi), %xmm1
1078 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1081 ; AVX1-LABEL: load_fold_pblendvb:
1083 ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
1084 ; AVX1-NEXT: # xmm1 = mem[0,0]
1085 ; AVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
1088 ; AVX2-LABEL: load_fold_pblendvb:
1090 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
1091 ; AVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
1094 ; AVX512VL-LABEL: load_fold_pblendvb:
1095 ; AVX512VL: # %bb.0:
1096 ; AVX512VL-NEXT: movw $29812, %ax # imm = 0x7474
1097 ; AVX512VL-NEXT: kmovd %eax, %k1
1098 ; AVX512VL-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1}
1099 ; AVX512VL-NEXT: retq
1101 ; XOPAVX1-LABEL: load_fold_pblendvb:
1103 ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
1104 ; XOPAVX1-NEXT: # xmm1 = mem[0,0]
1105 ; XOPAVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
1106 ; XOPAVX1-NEXT: retq
1108 ; XOPAVX2-LABEL: load_fold_pblendvb:
1110 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
1111 ; XOPAVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
1112 ; XOPAVX2-NEXT: retq
1113 %x = load <16 x i8>, ptr %px, align 16
1114 %select = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
  ret <16 x i8> %select
}

1118 define <16 x i8> @load_fold_pblendvb_commute(ptr %px, <16 x i8> %y) {
1119 ; SSE2-LABEL: load_fold_pblendvb_commute:
1121 ; SSE2-NEXT: movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
1122 ; SSE2-NEXT: movaps %xmm1, %xmm2
1123 ; SSE2-NEXT: andnps %xmm0, %xmm2
1124 ; SSE2-NEXT: andps (%rdi), %xmm1
1125 ; SSE2-NEXT: orps %xmm2, %xmm1
1126 ; SSE2-NEXT: movaps %xmm1, %xmm0
1129 ; SSSE3-LABEL: load_fold_pblendvb_commute:
1131 ; SSSE3-NEXT: movdqa (%rdi), %xmm1
1132 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[2],zero,xmm0[4,5,6],zero,zero,zero,xmm0[10],zero,xmm0[12,13,14],zero
1133 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3],zero,zero,zero,xmm1[7,8,9],zero,xmm1[11],zero,zero,zero,xmm1[15]
1134 ; SSSE3-NEXT: por %xmm1, %xmm0
1137 ; SSE41-LABEL: load_fold_pblendvb_commute:
1139 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1140 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
1141 ; SSE41-NEXT: pblendvb %xmm0, (%rdi), %xmm1
1142 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1145 ; AVX1-LABEL: load_fold_pblendvb_commute:
1147 ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
1148 ; AVX1-NEXT: # xmm1 = mem[0,0]
1149 ; AVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
1152 ; AVX2-LABEL: load_fold_pblendvb_commute:
1154 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
1155 ; AVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
1158 ; AVX512VL-LABEL: load_fold_pblendvb_commute:
1159 ; AVX512VL: # %bb.0:
1160 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1
1161 ; AVX512VL-NEXT: movw $29812, %ax # imm = 0x7474
1162 ; AVX512VL-NEXT: kmovd %eax, %k1
1163 ; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
1164 ; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
1165 ; AVX512VL-NEXT: retq
1167 ; XOPAVX1-LABEL: load_fold_pblendvb_commute:
1169 ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
1170 ; XOPAVX1-NEXT: # xmm1 = mem[0,0]
1171 ; XOPAVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
1172 ; XOPAVX1-NEXT: retq
1174 ; XOPAVX2-LABEL: load_fold_pblendvb_commute:
1176 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
1177 ; XOPAVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
1178 ; XOPAVX2-NEXT: retq
1179 %x = load <16 x i8>, ptr %px, align 16
1180 %select = shufflevector <16 x i8> %y, <16 x i8> %x, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
  ret <16 x i8> %select
}

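; Taking bytes 0,4,8,12 truncates each i32 lane to a byte: SSE2 masks and packs
; twice with packuswb, SSSE3/SSE4.1/AVX use a single pshufb, and AVX-512VL uses vpmovdb.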
1184 define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
1185 ; SSE2-LABEL: trunc_v4i32_shuffle:
1187 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1188 ; SSE2-NEXT: packuswb %xmm0, %xmm0
1189 ; SSE2-NEXT: packuswb %xmm0, %xmm0
1192 ; SSSE3-LABEL: trunc_v4i32_shuffle:
1194 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
1197 ; SSE41-LABEL: trunc_v4i32_shuffle:
1199 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
1202 ; AVX1OR2-LABEL: trunc_v4i32_shuffle:
1204 ; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
1205 ; AVX1OR2-NEXT: retq
1207 ; AVX512VL-LABEL: trunc_v4i32_shuffle:
1208 ; AVX512VL: # %bb.0:
1209 ; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
1210 ; AVX512VL-NEXT: retq
1211 %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %shuffle
}

1215 define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) {
1216 ; We don't have anything useful to check here. This generates 100s of
1217 ; instructions. Instead, just make sure we survived codegen.
; ALL-LABEL: stress_test0:
; ALL: retq
1221 %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6>
1222 %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28>
1223 %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8>
1224 %s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29>
1225 %s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29>
1226 %s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17>
1227 %s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23>
1228 %s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17>
1229 %s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
1230 %s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10>
1231 ret <16 x i8> %s.16.0
1234 define <16 x i8> @undef_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
1235 ; There is nothing interesting to check about these instructions other than
1236 ; that they survive codegen. However, we actually do better and delete all of
1237 ; them because the result is 'undef'.
1239 ; ALL-LABEL: undef_test1:
1240 ; ALL: # %bb.0: # %entry
1243 %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0>
1244 %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22>
1245 %s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9>
1246 %s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11>
1247 %s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29>
1248 %s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef>
1249 %s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 9, i32 17, i32 10>
1250 %s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef>
1251 %s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1252 %s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1253 %s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5>
1254 %s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1255 %s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef>
1257 ret <16 x i8> %s.12.4
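
; PR20540: padding an <8 x i8> value out to <16 x i8> with zeros should fold to a
; single (v)movq, i.e. a zero-extension of the low 8 bytes.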
1260 define <16 x i8> @PR20540(<8 x i8> %a) {
1261 ; SSE-LABEL: PR20540:
1263 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
1266 ; AVX-LABEL: PR20540:
1268 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1270 %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
1271 ret <16 x i8> %shuffle
1274 define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
1275 ; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1277 ; SSE-NEXT: movzbl %dil, %eax
1278 ; SSE-NEXT: movd %eax, %xmm0
1281 ; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1283 ; AVX-NEXT: movzbl %dil, %eax
1284 ; AVX-NEXT: vmovd %eax, %xmm0
1286 %a = insertelement <16 x i8> undef, i8 %i, i32 0
1287 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1288 ret <16 x i8> %shuffle
1291 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
1292 ; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1294 ; SSE2-NEXT: shll $8, %edi
1295 ; SSE2-NEXT: pxor %xmm0, %xmm0
1296 ; SSE2-NEXT: pinsrw $2, %edi, %xmm0
1299 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1301 ; SSSE3-NEXT: shll $8, %edi
1302 ; SSSE3-NEXT: pxor %xmm0, %xmm0
1303 ; SSSE3-NEXT: pinsrw $2, %edi, %xmm0
1306 ; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1308 ; SSE41-NEXT: pxor %xmm0, %xmm0
1309 ; SSE41-NEXT: pinsrb $5, %edi, %xmm0
1312 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1314 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
1315 ; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
1317 %a = insertelement <16 x i8> undef, i8 %i, i32 0
1318 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
1319 ret <16 x i8> %shuffle
1322 define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
1323 ; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
1325 ; SSE2-NEXT: shll $8, %edi
1326 ; SSE2-NEXT: pxor %xmm0, %xmm0
1327 ; SSE2-NEXT: pinsrw $7, %edi, %xmm0
1330 ; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
1332 ; SSSE3-NEXT: shll $8, %edi
1333 ; SSSE3-NEXT: pxor %xmm0, %xmm0
1334 ; SSSE3-NEXT: pinsrw $7, %edi, %xmm0
1337 ; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
1339 ; SSE41-NEXT: pxor %xmm0, %xmm0
1340 ; SSE41-NEXT: pinsrb $15, %edi, %xmm0
1343 ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
1345 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
1346 ; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
1348 %a = insertelement <16 x i8> undef, i8 %i, i32 0
1349 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
1350 ret <16 x i8> %shuffle
1353 define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
1354 ; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1356 ; SSE2-NEXT: movzbl %dil, %eax
1357 ; SSE2-NEXT: pxor %xmm0, %xmm0
1358 ; SSE2-NEXT: pinsrw $1, %eax, %xmm0
1361 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1363 ; SSSE3-NEXT: movzbl %dil, %eax
1364 ; SSSE3-NEXT: pxor %xmm0, %xmm0
1365 ; SSSE3-NEXT: pinsrw $1, %eax, %xmm0
1368 ; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1370 ; SSE41-NEXT: pxor %xmm0, %xmm0
1371 ; SSE41-NEXT: pinsrb $2, %edi, %xmm0
1374 ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1376 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
1377 ; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0
1379 %a = insertelement <16 x i8> undef, i8 %i, i32 3
1380 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1381 ret <16 x i8> %shuffle
1384 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) {
1385 ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
1387 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1390 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
1392 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1394 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef>
1395 ret <16 x i8> %shuffle
1398 define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
1399 ; SSE-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1401 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1404 ; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1406 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1408 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0>
1409 ret <16 x i8> %shuffle
1412 define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
1413 ; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1415 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1416 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1417 ; SSE2-NEXT: por %xmm1, %xmm0
1420 ; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1422 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1425 ; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1427 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1430 ; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1432 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1434 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
1435 ret <16 x i8> %shuffle
1438 define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
1439 ; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1441 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1442 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1443 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1444 ; SSE2-NEXT: por %xmm1, %xmm0
1447 ; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1449 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1452 ; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1454 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1457 ; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1459 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1461 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
1462 ret <16 x i8> %shuffle
1465 define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) {
1466 ; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
1468 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
1469 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
1470 ; SSE2-NEXT: por %xmm1, %xmm0
1473 ; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
1475 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1478 ; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
1480 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1483 ; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
1485 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1487 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
1488 ret <16 x i8> %shuffle
1491 define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) {
1492 ; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
1494 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
1495 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1496 ; SSE2-NEXT: por %xmm1, %xmm0
1499 ; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
1501 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
1502 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1505 ; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
1507 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
1508 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1511 ; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
1513 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
1515 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
1516 ret <16 x i8> %shuffle
1519 define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) {
1520 ; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
1522 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1523 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
1524 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
1525 ; SSE2-NEXT: por %xmm1, %xmm0
1528 ; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
1530 ; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
1533 ; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
1535 ; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
1538 ; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
1540 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
1542 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
1543 ret <16 x i8> %shuffle
1546 define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) {
1547 ; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
1549 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1550 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1551 ; SSE2-NEXT: por %xmm1, %xmm0
1554 ; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
1556 ; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1557 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1560 ; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
1562 ; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1563 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1566 ; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
1568 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1570 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
1571 ret <16 x i8> %shuffle
1575 define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(<16 x i8> %val1, <16 x i8> %val2) {
1576 ; SSE-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1578 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1579 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1582 ; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1584 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1585 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1587 %shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23>
1588 ret <16 x i8> %shuffle
1591 define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) {
1592 ; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1594 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1595 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1596 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1599 ; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1601 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,u,u,u,u,u,u,1,u,u,u,u,u,u,u]
1604 ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1606 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1609 ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1611 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1613 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1614 ret <16 x i8> %shuffle
1617 define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
1618 ; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1620 ; SSE2-NEXT: pxor %xmm1, %xmm1
1621 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1622 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1623 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1626 ; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1628 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1631 ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1633 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1636 ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1638 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1640 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1641 ret <16 x i8> %shuffle
1644 define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) {
1645 ; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1647 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1648 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1651 ; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1653 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1654 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1657 ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1659 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1662 ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1664 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1666 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
1667 ret <16 x i8> %shuffle
1670 define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
1671 ; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1673 ; SSE2-NEXT: pxor %xmm1, %xmm1
1674 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1675 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1678 ; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1680 ; SSSE3-NEXT: pxor %xmm1, %xmm1
1681 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1682 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1685 ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1687 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1690 ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1692 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1694 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
1695 ret <16 x i8> %shuffle
1698 define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
1699 ; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1701 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1704 ; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1706 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1709 ; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1711 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1714 ; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1716 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1718 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
1719 ret <16 x i8> %shuffle
1722 define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
1723 ; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1725 ; SSE2-NEXT: pxor %xmm1, %xmm1
1726 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1729 ; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1731 ; SSSE3-NEXT: pxor %xmm1, %xmm1
1732 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1735 ; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1737 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1740 ; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1742 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1744 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
1745 ret <16 x i8> %shuffle
1748 define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) {
1749 ; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1750 ; SSE2: # %bb.0: # %entry
1751 ; SSE2-NEXT: pxor %xmm2, %xmm2
1752 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1753 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1754 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[1,3,2,0,4,5,6,7]
1755 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1]
1756 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
1757 ; SSE2-NEXT: pand %xmm5, %xmm4
1758 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1759 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,0,1]
1760 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
1761 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
1762 ; SSE2-NEXT: pandn %xmm2, %xmm5
1763 ; SSE2-NEXT: por %xmm4, %xmm5
1764 ; SSE2-NEXT: psrlq $16, %xmm0
1765 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
1766 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,1,3]
1767 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
1768 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4]
1769 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
1770 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1771 ; SSE2-NEXT: packuswb %xmm5, %xmm2
1772 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1773 ; SSE2-NEXT: pand %xmm0, %xmm2
1774 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
1775 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1776 ; SSE2-NEXT: pandn %xmm1, %xmm0
1777 ; SSE2-NEXT: por %xmm2, %xmm0
1780 ; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1781 ; SSSE3: # %bb.0: # %entry
1782 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1783 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1784 ; SSSE3-NEXT: por %xmm1, %xmm0
1787 ; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1788 ; SSE41: # %bb.0: # %entry
1789 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1790 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1791 ; SSE41-NEXT: por %xmm1, %xmm0
1794 ; AVX1-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1795 ; AVX1: # %bb.0: # %entry
1796 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1797 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1798 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1801 ; AVX2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1802 ; AVX2: # %bb.0: # %entry
1803 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1804 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1805 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1808 ; AVX512VLBW-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1809 ; AVX512VLBW: # %bb.0: # %entry
1810 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1811 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1812 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1813 ; AVX512VLBW-NEXT: retq
1815 ; AVX512VLVBMI-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1816 ; AVX512VLVBMI: # %bb.0: # %entry
1817 ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = <u,10,2,7,22,14,7,2,18,3,1,14,18,9,11,0>
1818 ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
1819 ; AVX512VLVBMI-NEXT: retq
1821 ; XOP-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1822 ; XOP: # %bb.0: # %entry
1823 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[u,10,2,7],xmm1[6],xmm0[14,7,2],xmm1[2],xmm0[3,1,14],xmm1[2],xmm0[9,11,0]
1826 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
1828 ret <16 x i8> %shuffle
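
; The even bytes of the two right-shifted inputs are just the high bytes of the
; original words, so this should stay as two psrlw $8 plus one packuswb (or a
; single vpperm on XOP).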
1831 define <16 x i8> @shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30(<8 x i16> %a0, <8 x i16> %a1) {
1832 ; SSE-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1834 ; SSE-NEXT: psrlw $8, %xmm0
1835 ; SSE-NEXT: psrlw $8, %xmm1
1836 ; SSE-NEXT: packuswb %xmm1, %xmm0
1839 ; AVX1-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1841 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1842 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
1843 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1846 ; AVX2OR512VL-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1847 ; AVX2OR512VL: # %bb.0:
1848 ; AVX2OR512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
1849 ; AVX2OR512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
1850 ; AVX2OR512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1851 ; AVX2OR512VL-NEXT: retq
1853 ; XOP-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1855 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
1857 %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1858 %2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1859 %3 = bitcast <8 x i16> %1 to <16 x i8>
1860 %4 = bitcast <8 x i16> %2 to <16 x i8>
1861 %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
1865 define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) {
1866 ; Nothing interesting to test here. Just make sure we didn't crash.
1867 ; ALL-LABEL: stress_test2:
1870 %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5>
1871 %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22>
1872 %s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19>
1874 ret <16 x i8> %s.2.0
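
; Shuffling an undef/zero constant with a bitcast zero vector should still select
; an all-zeros constant, so both stores below come from the same zeroed register.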
1877 define void @constant_gets_selected(ptr %ptr1, ptr %ptr2) {
1878 ; SSE-LABEL: constant_gets_selected:
1879 ; SSE: # %bb.0: # %entry
1880 ; SSE-NEXT: xorps %xmm0, %xmm0
1881 ; SSE-NEXT: movaps %xmm0, (%rdi)
1882 ; SSE-NEXT: movaps %xmm0, (%rsi)
1885 ; AVX-LABEL: constant_gets_selected:
1886 ; AVX: # %bb.0: # %entry
1887 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1888 ; AVX-NEXT: vmovaps %xmm0, (%rdi)
1889 ; AVX-NEXT: vmovaps %xmm0, (%rsi)
1892 %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8>
1893 %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
1894 %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32>
1895 store <4 x i32> %weirder_zero, ptr %ptr1, align 16
1896 store <4 x i32> zeroinitializer, ptr %ptr2, align 16
1901 ; Shuffle to logical bit shifts
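; Each mask below only mixes zeros into the vector at fixed byte offsets, so it
; should lower to a single psll/psrl of the matching 16/32/64-bit element width.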
1904 define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) {
1905 ; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1907 ; SSE-NEXT: psllw $8, %xmm0
1910 ; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1912 ; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
1914 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
1915 ret <16 x i8> %shuffle
1918 define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) {
1919 ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1921 ; SSE-NEXT: pslld $24, %xmm0
1924 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1926 ; AVX-NEXT: vpslld $24, %xmm0, %xmm0
1928 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
1929 ret <16 x i8> %shuffle
1932 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) {
1933 ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
1935 ; SSE-NEXT: psllq $56, %xmm0
1938 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
1940 ; AVX-NEXT: vpsllq $56, %xmm0, %xmm0
1942 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8>
1943 ret <16 x i8> %shuffle
1946 define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
1947 ; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
1949 ; SSE-NEXT: psllq $8, %xmm0
1952 ; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
1954 ; AVX-NEXT: vpsllq $8, %xmm0, %xmm0
1956 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 undef, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 16, i32 8, i32 9, i32 undef, i32 11, i32 12, i32 13, i32 14>
1957 ret <16 x i8> %shuffle
1960 define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) {
1961 ; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
1963 ; SSE-NEXT: psrlw $8, %xmm0
1966 ; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
1968 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
1970 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 undef, i32 16, i32 undef, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
1971 ret <16 x i8> %shuffle
1974 define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) {
1975 ; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
1977 ; SSE-NEXT: psrld $16, %xmm0
1980 ; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
1982 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
1984 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 15, i32 16, i32 16>
1985 ret <16 x i8> %shuffle
1988 define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) {
1989 ; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
1991 ; SSE-NEXT: psrlq $56, %xmm0
1994 ; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
1996 ; AVX-NEXT: vpsrlq $56, %xmm0, %xmm0
1998 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16>
1999 ret <16 x i8> %shuffle
2002 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
2003 ; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
2005 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
2006 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2007 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
2010 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
2012 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero
2015 ; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
2017 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero
2020 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
2022 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero
2024 %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
2025 ret <16 x i8> %shuffle
2028 define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
2029 ; SSE-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
2031 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
2032 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2035 ; AVX1-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
2037 ; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
2038 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2041 ; AVX2-SLOW-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
2042 ; AVX2-SLOW: # %bb.0:
2043 ; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
2044 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2045 ; AVX2-SLOW-NEXT: retq
2047 ; AVX2-FAST-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
2048 ; AVX2-FAST: # %bb.0:
2049 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2050 ; AVX2-FAST-NEXT: retq
2052 ; AVX512VL-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
2053 ; AVX512VL: # %bb.0:
2054 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2055 ; AVX512VL-NEXT: retq
2057 ; XOP-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
2059 ; XOP-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
2060 ; XOP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2062 %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
2063 ret <16 x i8> %shuffle
2066 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06(<16 x i8> %a) {
2067 ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
2069 ; SSE-NEXT: psrlq $8, %xmm0
2070 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
2073 ; AVX1-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
2075 ; AVX1-NEXT: vpsrlq $8, %xmm0, %xmm0
2076 ; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
2079 ; AVX2-SLOW-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
2080 ; AVX2-SLOW: # %bb.0:
2081 ; AVX2-SLOW-NEXT: vpsrlq $8, %xmm0, %xmm0
2082 ; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
2083 ; AVX2-SLOW-NEXT: retq
2085 ; AVX2-FAST-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
2086 ; AVX2-FAST: # %bb.0:
2087 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6]
2088 ; AVX2-FAST-NEXT: retq
2090 ; AVX512VL-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
2091 ; AVX512VL: # %bb.0:
2092 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6]
2093 ; AVX512VL-NEXT: retq
2095 ; XOP-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
2097 ; XOP-NEXT: vpsrlq $8, %xmm0, %xmm0
2098 ; XOP-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
2100 %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
2101 ret <16 x i8> %shuffle
2104 define <16 x i8> @shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14(<16 x i8> %a) {
2105 ; SSE2-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
2107 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2108 ; SSE2-NEXT: psrld $24, %xmm1
2109 ; SSE2-NEXT: pslld $8, %xmm0
2110 ; SSE2-NEXT: por %xmm1, %xmm0
2113 ; SSSE3-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
2115 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
2118 ; SSE41-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
2120 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
2123 ; AVX1-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
2125 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
2128 ; AVX2-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
2130 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
2133 ; AVX512VL-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
2134 ; AVX512VL: # %bb.0:
2135 ; AVX512VL-NEXT: vprold $8, %xmm0, %xmm0
2136 ; AVX512VL-NEXT: retq
2138 ; XOP-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
2140 ; XOP-NEXT: vprotd $8, %xmm0, %xmm0
2142 %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 11, i32 8, i32 9, i32 10, i32 15, i32 12, i32 13, i32 14>
2143 ret <16 x i8> %shuffle
2147 define <16 x i8> @shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09(<16 x i8> %a) {
2148 ; SSE-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
2150 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2151 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
2154 ; AVX1-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
2156 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2157 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
2160 ; AVX2-SLOW-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
2161 ; AVX2-SLOW: # %bb.0:
2162 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2163 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
2164 ; AVX2-SLOW-NEXT: retq
2166 ; AVX2-FAST-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
2167 ; AVX2-FAST: # %bb.0:
2168 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9]
2169 ; AVX2-FAST-NEXT: retq
2171 ; AVX512VL-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
2172 ; AVX512VL: # %bb.0:
2173 ; AVX512VL-NEXT: vprolq $48, %xmm0, %xmm0
2174 ; AVX512VL-NEXT: retq
2176 ; XOP-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
2178 ; XOP-NEXT: vprotq $48, %xmm0, %xmm0
2180 %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9>
2181 ret <16 x i8> %shuffle
2184 define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
2185 ; SSE-LABEL: PR12412:
2186 ; SSE: # %bb.0: # %entry
2187 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2188 ; SSE-NEXT: pand %xmm2, %xmm1
2189 ; SSE-NEXT: pand %xmm2, %xmm0
2190 ; SSE-NEXT: packuswb %xmm1, %xmm0
2193 ; AVX1-LABEL: PR12412:
2194 ; AVX1: # %bb.0: # %entry
2195 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2196 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
2197 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2198 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2201 ; AVX2-LABEL: PR12412:
2202 ; AVX2: # %bb.0: # %entry
2203 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2204 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
2205 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
2206 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2209 ; AVX512VL-LABEL: PR12412:
2210 ; AVX512VL: # %bb.0: # %entry
2211 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2212 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2213 ; AVX512VL-NEXT: vpmovwb %ymm0, %xmm0
2214 ; AVX512VL-NEXT: vzeroupper
2215 ; AVX512VL-NEXT: retq
2217 ; XOP-LABEL: PR12412:
2218 ; XOP: # %bb.0: # %entry
2219 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
2222 %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
2226 define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(<16 x i8> %a) {
2227 ; SSE-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
2229 ; SSE-NEXT: psrld $8, %xmm0
2232 ; AVX-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
2234 ; AVX-NEXT: vpsrld $8, %xmm0, %xmm0
2236 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16>
2237 ret <16 x i8> %shuffle
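
; The f32 and i16 element reverses below undo the byte reversal in the first
; shuffle, so the whole chain should combine back into a single punpcklbw.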
2240 define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
2241 ; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
2243 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2246 ; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
2248 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2250 %shuffle8 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
2251 %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float>
2252 %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2253 %bitcast16 = bitcast <4 x float> %shuffle32 to <8 x i16>
2254 %shuffle16 = shufflevector <8 x i16> %bitcast16, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
2255 %bitcast8 = bitcast <8 x i16> %shuffle16 to <16 x i8>
2256 ret <16 x i8> %bitcast8
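
; Splatting a byte that comes from a scalar load should become a (v)pbroadcastb on
; AVX2/AVX512 targets (folding the load where possible) and a movd plus shuffle
; sequence on older targets.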
2259 define <16 x i8> @insert_dup_mem_v16i8_i32(ptr %ptr) {
2260 ; SSE2-LABEL: insert_dup_mem_v16i8_i32:
2262 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2263 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2264 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2265 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2268 ; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
2270 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2271 ; SSSE3-NEXT: pxor %xmm1, %xmm1
2272 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
2275 ; SSE41-LABEL: insert_dup_mem_v16i8_i32:
2277 ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2278 ; SSE41-NEXT: pxor %xmm1, %xmm1
2279 ; SSE41-NEXT: pshufb %xmm1, %xmm0
2282 ; AVX1-LABEL: insert_dup_mem_v16i8_i32:
2284 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2285 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2286 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
2289 ; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_i32:
2290 ; AVX2OR512VL: # %bb.0:
2291 ; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
2292 ; AVX2OR512VL-NEXT: retq
2294 ; XOPAVX1-LABEL: insert_dup_mem_v16i8_i32:
2296 ; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2297 ; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2298 ; XOPAVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
2299 ; XOPAVX1-NEXT: retq
2301 ; XOPAVX2-LABEL: insert_dup_mem_v16i8_i32:
2303 ; XOPAVX2-NEXT: vpbroadcastb (%rdi), %xmm0
2304 ; XOPAVX2-NEXT: retq
2305 %tmp = load i32, ptr %ptr, align 4
2306 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
2307 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
2308 %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
2312 define <16 x i8> @insert_dup_mem_v16i8_sext_i8(ptr %ptr) {
2313 ; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8:
2315 ; SSE2-NEXT: movzbl (%rdi), %eax
2316 ; SSE2-NEXT: movd %eax, %xmm0
2317 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2318 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2319 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2322 ; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
2324 ; SSSE3-NEXT: movzbl (%rdi), %eax
2325 ; SSSE3-NEXT: movd %eax, %xmm0
2326 ; SSSE3-NEXT: pxor %xmm1, %xmm1
2327 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
2330 ; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8:
2332 ; SSE41-NEXT: movzbl (%rdi), %eax
2333 ; SSE41-NEXT: movd %eax, %xmm0
2334 ; SSE41-NEXT: pxor %xmm1, %xmm1
2335 ; SSE41-NEXT: pshufb %xmm1, %xmm0
2338 ; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8:
2340 ; AVX1-NEXT: movzbl (%rdi), %eax
2341 ; AVX1-NEXT: vmovd %eax, %xmm0
2342 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2343 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
2346 ; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_sext_i8:
2347 ; AVX2OR512VL: # %bb.0:
2348 ; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
2349 ; AVX2OR512VL-NEXT: retq
2351 ; XOPAVX1-LABEL: insert_dup_mem_v16i8_sext_i8:
2353 ; XOPAVX1-NEXT: movzbl (%rdi), %eax
2354 ; XOPAVX1-NEXT: vmovd %eax, %xmm0
2355 ; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2356 ; XOPAVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
2357 ; XOPAVX1-NEXT: retq
2359 ; XOPAVX2-LABEL: insert_dup_mem_v16i8_sext_i8:
2361 ; XOPAVX2-NEXT: vpbroadcastb (%rdi), %xmm0
2362 ; XOPAVX2-NEXT: retq
2363 %tmp = load i8, ptr %ptr, align 1
2364 %tmp1 = sext i8 %tmp to i32
2365 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
2366 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
2367 %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer
define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(ptr %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v16i8_i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v16i8_i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v16i8_i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i8_i32:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb 1(%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i8_i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i8_i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb 1(%rdi), %xmm0
; XOPAVX2-NEXT: retq
%tmp = load i32, ptr %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i8> %tmp3
}
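; Same as above but splatting byte 2, which should become vpbroadcastb 2(%rdi) on
; AVX2/AVX512 targets.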
define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(ptr %ptr) {
; SSE2-LABEL: insert_dup_elt2_mem_v16i8_i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt2_mem_v16i8_i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt2_mem_v16i8_i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt2_mem_v16i8_i32:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb 2(%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: insert_dup_elt2_mem_v16i8_i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_elt2_mem_v16i8_i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb 2(%rdi), %xmm0
; XOPAVX2-NEXT: retq
%tmp = load i32, ptr %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <16 x i8> %tmp3
}
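; Splat of byte 1 of a sign-extended i8 load: byte 1 holds sign bits, so the broadcast
; cannot be folded into a load; the scalar is sign-extended, shifted right by 8, then
; broadcast from a register on AVX2/AVX512.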
define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(ptr %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movsbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movsbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; AVX1: # %bb.0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; AVX2: # %bb.0:
; AVX2-NEXT: movsbl (%rdi), %eax
; AVX2-NEXT: shrl $8, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movsbl (%rdi), %eax
; AVX512VL-NEXT: shrl $8, %eax
; AVX512VL-NEXT: vpbroadcastb %eax, %xmm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: movsbl (%rdi), %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm0
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: movsbl (%rdi), %eax
; XOPAVX2-NEXT: shrl $8, %eax
; XOPAVX2-NEXT: vmovd %eax, %xmm0
; XOPAVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; XOPAVX2-NEXT: retq
%tmp = load i8, ptr %ptr, align 1
%tmp1 = sext i8 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
%tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i8> %tmp4
}
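; Splat of byte 2 of a sign-extended i8 load: as above, but the scalar is shifted right
; by 16 before the broadcast.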
define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(ptr %ptr) {
; SSE2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movsbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movsbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; AVX1: # %bb.0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; AVX2: # %bb.0:
; AVX2-NEXT: movsbl (%rdi), %eax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movsbl (%rdi), %eax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: vpbroadcastb %eax, %xmm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: movsbl (%rdi), %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm0
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: movsbl (%rdi), %eax
; XOPAVX2-NEXT: shrl $16, %eax
; XOPAVX2-NEXT: vmovd %eax, %xmm0
; XOPAVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; XOPAVX2-NEXT: retq
%tmp = load i8, ptr %ptr, align 1
%tmp1 = sext i8 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
%tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <16 x i8> %tmp4
}
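; PR31364: lane 7 of the shuffle reads element 3 of the insertelement chain, which is a
; known zero, so the result must keep that lane zeroed (note the explicit zero in the
; pshufb masks below).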
define <16 x i8> @PR31364(ptr nocapture readonly %a, ptr nocapture readonly %b) {
; SSE2-LABEL: PR31364:
; SSE2: # %bb.0:
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movzbl (%rsi), %ecx
; SSE2-NEXT: shll $8, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR31364:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movzbl (%rsi), %ecx
; SSSE3-NEXT: shll $8, %ecx
; SSSE3-NEXT: orl %eax, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR31364:
; SSE41: # %bb.0:
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pinsrb $1, (%rsi), %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: PR31364:
; AVX: # %bb.0:
; AVX-NEXT: movzbl (%rdi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
; AVX-NEXT: retq
%v0 = load i8, ptr %a, align 1
%vecins = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %v0, i32 0
%v1 = load i8, ptr %b, align 1
%vecins2 = insertelement <16 x i8> %vecins, i8 %v1, i32 1
%result = shufflevector <16 x i8> %vecins2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 3, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
ret <16 x i8> %result
}
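; PR31301: splat one byte from each pointer and interleave the two splats; AVX2/AVX512
; targets use two vpbroadcastb loads plus vpunpcklbw, while XOPAVX1 does the interleave
; with a single vpperm.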
define <16 x i8> @PR31301(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; SSE2-LABEL: PR31301:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: movzbl (%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR31301:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movzbl (%rsi), %eax
; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR31301:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: movzbl (%rsi), %eax
; SSE41-NEXT: movd %eax, %xmm2
; SSE41-NEXT: pshufb %xmm1, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: PR31301:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsi), %eax
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: PR31301:
; AVX2OR512VL: # %bb.0: # %entry
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2OR512VL-NEXT: vpbroadcastb (%rsi), %xmm1
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: PR31301:
; XOPAVX1: # %bb.0: # %entry
; XOPAVX1-NEXT: movzbl (%rdi), %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm0
; XOPAVX1-NEXT: movzbl (%rsi), %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: PR31301:
; XOPAVX2: # %bb.0: # %entry
; XOPAVX2-NEXT: vpbroadcastb (%rdi), %xmm0
; XOPAVX2-NEXT: vpbroadcastb (%rsi), %xmm1
; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; XOPAVX2-NEXT: retq
entry:
%0 = load i8, ptr %x, align 1
%1 = insertelement <16 x i8> undef, i8 %0, i32 0
%lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%2 = load i8, ptr %y, align 1
%3 = insertelement <16 x i8> undef, i8 %2, i32 0
%lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%vzip.i = shufflevector <16 x i8> %lane, <16 x i8> %lane3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
ret <16 x i8> %vzip.i
}