; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2

declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)

;
; Variable Shifts
;
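; Every test below passes %x as both value operands of llvm.fshl, so each call
; is a variable rotate-left of %x by %amt (modulo the element width). For
; i32/i64 elements AVX512F provides the vprolvd/vprolvq variable rotates
; directly.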
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %amt)
  ret <8 x i64> %res
}
define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %amt)
  ret <16 x i32> %res
}
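; There is no 16-bit variable shift in base AVX512F, so the v32i16 rotate is
; widened: each ymm half is zero-extended to 32-bit lanes, shifted with
; vpsllvd/vpsrlvd, and truncated back with vpmovdw. AVX512BW shifts words
; natively (vpsllvw/vpsrlvw), and AVX512VBMI2 matches the whole pattern to a
; single vpshldvw.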
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpsllvd %zmm4, %zmm2, %zmm4
; AVX512F-NEXT: vpmovdw %zmm4, %ymm4
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
; AVX512F-NEXT: vpsrlvd %zmm5, %zmm2, %zmm2
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm2, %zmm4
; AVX512VL-NEXT: vpmovdw %zmm4, %ymm4
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm3
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm2, %zmm2
; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpsubw %zmm1, %zmm4, %zmm1
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm4, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
  ret <32 x i16> %res
}
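; Bytes have no hardware shifts at all: the AVX512F/AVX512VL lowering builds
; the rotate from bit-at-a-time shifts selected with vpblendvb, while the
; AVX512BW-based targets drive the same per-bit steps from mask registers
; (vpmovb2m + vpblendmb/vmovdqu8). VBMI2 has no byte funnel shift, so vpshldvw
; does not help here.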
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512F-NEXT: vpsubb %ymm5, %ymm6, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5
; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm8
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8
; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm8
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm10, %ymm8, %ymm8
; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm5
; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm5
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5
; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6
; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6
; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm6
; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm4
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512VL-NEXT: vpsubb %ymm5, %ymm6, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm8
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8
; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm8
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-NEXT: vpand %ymm10, %ymm8, %ymm8
; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm5
; AVX512VL-NEXT: vpand %ymm4, %ymm5, %ymm4
; AVX512VL-NEXT: vpsubb %ymm1, %ymm6, %ymm5
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6
; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm4
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vpmovb2m %zmm2, %k2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512BW-NEXT: vpsrlw $2, %zmm2, %zmm5
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm5
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm3
; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k2
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512VLBW-NEXT: vpsrlw $2, %zmm2, %zmm5
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpsrlw $1, %zmm2, %zmm5
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k2
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512VBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3
; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k1
; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k2
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1
; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k2
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k2
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
  ret <64 x i8> %res
}

;
; Uniform Variable Shifts
;
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %splat)
  ret <8 x i64> %res
}
define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %splat)
  ret <16 x i32> %res
}
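; With a splatted amount only one shift count is needed. On AVX512BW targets
; the word rotate becomes vpsllw/vpsrlw with the single count in an xmm
; register; without BW the left shift can still use vpsllw per ymm half, but
; the right shift has to go through the 32-bit widening again.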
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpsllw %xmm3, %ymm4, %ymm5
; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsrld %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512F-NEXT: vpsrld %xmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpsllw %xmm3, %ymm4, %ymm5
; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512VL-NEXT: vpsrld %xmm1, %zmm2, %zmm1
; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpsubw %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm4, %xmm1
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
  ret <32 x i16> %res
}
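; The splatted byte rotate shifts whole words by the single count and then
; masks off the bits that crossed a byte boundary; the mask itself is built by
; shifting an all-ones vector by the same count. In every variant the final
; mask-and-or folds into a single vpternlogq.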
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm4
; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6
; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm10
; AVX512F-NEXT: vpblendvb %ymm10, %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm6
; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm5
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm5
; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5
; AVX512F-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512VL-NEXT: vpsubb %ymm3, %ymm6, %ymm3
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm4
; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm10
; AVX512VL-NEXT: vpblendvb %ymm10, %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm6
; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm5
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512VL-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512BW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512BW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm2
; AVX512VBMI2-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512VBMI2-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm2
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
  ret <64 x i8> %res
}

;
; Constant Shifts
;
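; With constant amounts the i64/i32 cases stay single vprolvq/vprolvd
; instructions, taking the per-element rotate counts straight from the
; constant pool as a memory operand.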
define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x) nounwind {
; AVX512-LABEL: constant_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
  ret <8 x i64> %res
}
define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x) nounwind {
; AVX512-LABEL: constant_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
  ret <16 x i32> %res
}
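; Without AVX512BW the per-word constant rotate is built from multiplies:
; vpmulhuw by 2^c produces the right-shifted half and vpmullw by 2^c the
; left-shifted half. The multiply-high constant cannot express the c=0 lane,
; so lane 0 is blended back from the original vector before the final OR.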
define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
  ret <32 x i16> %res
}
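; For constant byte rotates the left half reuses the shift-and-blend chain
; (keyed off the precomputed 8192,24640,... selector constant), while the
; right half unpacks the bytes to words, shifts them there (vpmullw by a
; power of two on AVX512F/VL, vpsllvw on AVX512BW and later), and repacks the
; high bytes with vpackuswb.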
808 define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
809 ; AVX512F-LABEL: constant_funnnel_v64i8:
811 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
812 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
813 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
814 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
815 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
816 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
817 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
818 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
819 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
820 ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
821 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
822 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
823 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
824 ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
825 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
826 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
827 ; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3
828 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
829 ; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
830 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
831 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
832 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
833 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
834 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
835 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
836 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
837 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
838 ; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
839 ; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
840 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
841 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
842 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
843 ; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
844 ; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
845 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
846 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
847 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
848 ; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
849 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
850 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
851 ; AVX512F-NEXT: vpmullw %ymm6, %ymm0, %ymm0
852 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
853 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
854 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
855 ; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
858 ; AVX512VL-LABEL: constant_funnnel_v64i8:
860 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
861 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
862 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
863 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
864 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
865 ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
866 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
867 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
868 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
869 ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
870 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
871 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
872 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
873 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
874 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
875 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
876 ; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3
877 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
878 ; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
879 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
880 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
881 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
882 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
883 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
884 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
885 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
886 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
887 ; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
888 ; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
889 ; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
890 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
891 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
892 ; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
893 ; AVX512VL-NEXT: vpmullw %ymm6, %ymm1, %ymm1
894 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
895 ; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
896 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
897 ; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
898 ; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
899 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
900 ; AVX512VL-NEXT: vpmullw %ymm6, %ymm0, %ymm0
901 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
902 ; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
903 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
904 ; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
905 ; AVX512VL-NEXT: retq
907 ; AVX512BW-LABEL: constant_funnnel_v64i8:
909 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
910 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
911 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
912 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
913 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
914 ; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
915 ; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3
916 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
917 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
918 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
919 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
920 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
921 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
922 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
923 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
924 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
925 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
926 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
927 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
928 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
929 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
930 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
931 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
932 ; AVX512BW-NEXT: retq
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3
; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
; AVX512VBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
; AVX512VBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VLVBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
; AVX512VLVBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
ret <64 x i8> %res
}

;
; Uniform Constant Shifts
;

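; Note: with a uniform constant amount, the i64/i32 rotates below lower to a
; single vprolq/vprold on every subtarget; i16 needs AVX512VBMI2 for a
; one-instruction form (vpshldw), and i8 is expanded to a vpsllw/vpsrlw pair
; merged with vpternlogq.
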
define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x) nounwind {
; AVX512-LABEL: splatconstant_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512-NEXT: retq
%res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
ret <8 x i64> %res
}

define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind {
; AVX512-LABEL: splatconstant_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprold $4, %zmm0, %zmm0
; AVX512-NEXT: retq
%res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
ret <16 x i32> %res
}

define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsrlw $9, %ymm2, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpsrlw $9, %ymm2, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm1
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
%res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
ret <32 x i16> %res
}

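; AVX-512 has no byte-granularity rotate or funnel-shift instruction, so even
; the VBMI2 subtargets expand the v64i8 splat rotate below into a left/right
; word-shift pair recombined with vpternlogq.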
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
ret <64 x i8> %res
}