; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW

declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)

;
; Variable Shifts
;
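
; Note: llvm.fshl(%x, %x, %amt) with both value operands the same is a
; rotate-left of %x by %amt modulo the element width, so every case in this
; file is expected to lower to a vector rotate rather than a true funnel shift.
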
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %amt)
  ret <8 x i64> %res
}

define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %amt)
  ret <16 x i32> %res
}

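; The i64/i32 cases are single instructions on every run line: AVX-512F
; already provides variable rotates for 32- and 64-bit elements (vprolvd /
; vprolvq). No rotate instruction exists for 16- or 8-bit elements, which is
; what the expansions below work around.
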
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpsllvd %zmm5, %zmm2, %zmm5
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT:    vpsubw %ymm3, %ymm6, %ymm3
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT:    vpsrlvd %zmm3, %zmm2, %zmm2
; AVX512F-NEXT:    vpord %zmm2, %zmm5, %zmm2
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpsllvd %zmm3, %zmm0, %zmm3
; AVX512F-NEXT:    vpsubw %ymm1, %ymm6, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpord %zmm0, %zmm3, %zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT:    vpsllvd %zmm5, %zmm2, %zmm5
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT:    vpsubw %ymm3, %ymm6, %ymm3
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT:    vpsrlvd %zmm3, %zmm2, %zmm2
; AVX512VL-NEXT:    vpord %zmm2, %zmm5, %zmm2
; AVX512VL-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512VL-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpsllvd %zmm3, %zmm0, %zmm3
; AVX512VL-NEXT:    vpsubw %ymm1, %ymm6, %ymm1
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT:    vpord %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT:    vpsubw %zmm1, %zmm4, %zmm1
; AVX512BW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT:    vpandq %zmm2, %zmm1, %zmm3
; AVX512VLBW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT:    vpsubw %zmm1, %zmm4, %zmm1
; AVX512VLBW-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT:    retq
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
  ret <32 x i16> %res
}

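; With AVX-512BW the word rotate needs only three value-path instructions:
; vpsllvw by (amt & 15), vpsrlvw by ((-amt) & 15), and vporq to merge the two
; halves. Without BW there are no per-word variable shifts, so each 256-bit
; half is zero-extended to 32-bit elements (vpmovzxwd), shifted with
; vpsllvd/vpsrlvd, and truncated back with vpmovdw.
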
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpsrlw $4, %ymm3, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT:    vpandn %ymm4, %ymm5, %ymm4
; AVX512F-NEXT:    vpsllw $4, %ymm3, %ymm6
; AVX512F-NEXT:    vpand %ymm5, %ymm6, %ymm6
; AVX512F-NEXT:    vpor %ymm4, %ymm6, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vpsrlw $6, %ymm3, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT:    vpandn %ymm4, %ymm7, %ymm4
; AVX512F-NEXT:    vpsllw $2, %ymm3, %ymm8
; AVX512F-NEXT:    vpand %ymm7, %ymm8, %ymm8
; AVX512F-NEXT:    vpor %ymm4, %ymm8, %ymm4
; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT:    vpsrlw $7, %ymm3, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT:    vpand %ymm8, %ymm4, %ymm4
; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm9
; AVX512F-NEXT:    vpor %ymm4, %ymm9, %ymm4
; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm3
; AVX512F-NEXT:    vpandn %ymm3, %ymm5, %ymm3
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT:    vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT:    vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm3
; AVX512F-NEXT:    vpandn %ymm3, %ymm7, %ymm3
; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm4
; AVX512F-NEXT:    vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT:    vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX512F-NEXT:    vpand %ymm8, %ymm3, %ymm3
; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
; AVX512F-NEXT:    vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-NEXT:    vpsrlw $4, %ymm3, %ymm4
; AVX512VL-NEXT:    vpsllw $4, %ymm3, %ymm5
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT:    vpternlogq $226, %ymm4, %ymm6, %ymm5
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT:    vpsrlw $6, %ymm3, %ymm5
; AVX512VL-NEXT:    vpsllw $2, %ymm3, %ymm7
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT:    vpternlogq $226, %ymm5, %ymm8, %ymm7
; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm7, %ymm3, %ymm3
; AVX512VL-NEXT:    vpsrlw $7, %ymm3, %ymm5
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT:    vpand %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm9
; AVX512VL-NEXT:    vpor %ymm5, %ymm9, %ymm5
; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm5, %ymm3, %ymm2
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm5
; AVX512VL-NEXT:    vpternlogq $226, %ymm3, %ymm6, %ymm5
; AVX512VL-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm4
; AVX512VL-NEXT:    vpternlogq $226, %ymm3, %ymm8, %ymm4
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX512VL-NEXT:    vpand %ymm7, %ymm3, %ymm3
; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
; AVX512VL-NEXT:    vpor %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsubb %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsllw $5, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm4
; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
; AVX512BW-NEXT:    vpmovb2m %zmm2, %k2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT:    vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512BW-NEXT:    vpsrlw $2, %zmm2, %zmm5
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512BW-NEXT:    vpsrlw $1, %zmm2, %zmm5
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512BW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512BW-NEXT:    vpandq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm3
; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k2
; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm1
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT:    vpaddb %zmm3, %zmm3, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT:    vporq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpsubb %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
; AVX512VLBW-NEXT:    vpsllw $5, %zmm2, %zmm2
; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm4
; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT:    vpmovb2m %zmm2, %k2
; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT:    vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512VLBW-NEXT:    vpsrlw $2, %zmm2, %zmm5
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLBW-NEXT:    vpsrlw $1, %zmm2, %zmm5
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512VLBW-NEXT:    vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLBW-NEXT:    vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm3
; AVX512VLBW-NEXT:    vpmovb2m %zmm3, %k1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k2
; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VLBW-NEXT:    vpsllw $2, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT:    vpaddb %zmm3, %zmm3, %zmm1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512VLBW-NEXT:    vporq %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT:    retq
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
  ret <64 x i8> %res
}

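; No AVX-512 level has a variable byte shift. The byte rotate is therefore
; built from conditional constant shifts: the three amount bits are moved into
; each byte's sign-bit position with vpsllw $5, and results pre-rotated by 4,
; 2, and 1 are selected per byte, with vpblendvb on 256-bit halves
; (AVX512F/VL) or with vpmovb2m masks and masked moves (AVX512BW/VLBW).
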
;
; Uniform Variable Shifts
;

define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq %xmm1, %zmm1
; AVX512-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %splat)
  ret <8 x i64> %res
}

define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %xmm1, %zmm1
; AVX512-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %splat)
  ret <16 x i32> %res
}

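; Splatted i64/i32 amounts just rebroadcast the scalar amount and reuse the
; variable-rotate instructions.
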
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT:    vpbroadcastw %xmm1, %xmm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT:    vpsllw %xmm3, %ymm2, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT:    vpbroadcastw %xmm1, %xmm1
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT:    vpsllw %xmm3, %ymm2, %ymm4
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm0, %ymm3, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastw %xmm1, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512BW-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %xmm1
; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512VLBW-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT:    retq
  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
  ret <32 x i16> %res
}

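; A uniform word amount can use the legacy shift-by-scalar forms: the masked
; amount is zero-extended into the low quadword of an XMM (vpmovzxwq) and fed
; to vpsllw/vpsrlw, which shift every word element by that one count.
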
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpsllw %xmm3, %ymm2, %ymm4
; AVX512F-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512F-NEXT:    vpsllw %xmm3, %xmm5, %xmm6
; AVX512F-NEXT:    vpbroadcastb %xmm6, %ymm6
; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT:    vpsubb %xmm1, %xmm7, %xmm1
; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512F-NEXT:    vpsrlw %xmm1, %xmm5, %xmm5
; AVX512F-NEXT:    vpsrlw $8, %xmm5, %xmm5
; AVX512F-NEXT:    vpbroadcastb %xmm5, %ymm5
; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT:    vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    vpsllw %xmm3, %ymm2, %ymm4
; AVX512VL-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VL-NEXT:    vpsllw %xmm3, %xmm5, %xmm6
; AVX512VL-NEXT:    vpbroadcastb %xmm6, %ymm6
; AVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT:    vpsubb %xmm1, %xmm7, %xmm1
; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm5, %xmm5
; AVX512VL-NEXT:    vpsrlw $8, %xmm5, %xmm5
; AVX512VL-NEXT:    vpbroadcastb %xmm5, %ymm5
; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT:    vpsllw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm0, %ymm3, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
; AVX512BW-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512BW-NEXT:    vpsllw %xmm3, %xmm5, %xmm3
; AVX512BW-NEXT:    vpbroadcastb %xmm3, %zmm3
; AVX512BW-NEXT:    vpandq %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm5, %xmm1
; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT:    vpsllw %xmm3, %zmm0, %zmm4
; AVX512VLBW-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VLBW-NEXT:    vpsllw %xmm3, %xmm5, %xmm3
; AVX512VLBW-NEXT:    vpbroadcastb %xmm3, %zmm3
; AVX512VLBW-NEXT:    vpandq %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpsrlw %xmm1, %xmm5, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512VLBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT:    retq
  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
  ret <64 x i8> %res
}

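; Bytes lack a shift-by-scalar form as well, so the splat case shifts whole
; words with vpsllw/vpsrlw and masks away the bits that crossed a byte
; boundary. The mask is built by applying the same shift to an all-ones
; register (vpcmpeqd) and rebroadcasting the resulting byte.
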
;
; Constant Shifts
;

define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x) nounwind {
; AVX512-LABEL: constant_funnnel_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
  ret <8 x i64> %res
}

define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x) nounwind {
; AVX512-LABEL: constant_funnnel_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
  ret <16 x i32> %res
}

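; Per-element constant amounts still use the variable-rotate instructions; the
; amount vector is folded into a load, shown here as {{.*}}(%rip).
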
define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
; AVX512F-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512VL-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_funnnel_v32i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    retq
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
  ret <32 x i16> %res
}

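; With BW the constant word rotate is vpsllvw/vpsrlvw with both shift vectors
; loaded from memory. Pre-BW the shifts become multiplies: vpmullw by a power
; of two is the left shift and vpmulhuw the logical right shift, with a
; vpblendw/vpblendd pair passing the amount-0 words through unchanged.
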
define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm5
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT:    vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
; AVX512F-NEXT:    vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm5
; AVX512F-NEXT:    vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT:    vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
; AVX512F-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512F-NEXT:    # ymm10 = mem[0,1,0,1]
; AVX512F-NEXT:    vpmullw %ymm10, %ymm9, %ymm9
; AVX512F-NEXT:    vpsrlw $8, %ymm9, %ymm9
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512F-NEXT:    # ymm11 = mem[0,1,0,1]
; AVX512F-NEXT:    vpmullw %ymm11, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT:    vpackuswb %ymm9, %ymm1, %ymm1
; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm3
; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
; AVX512F-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
; AVX512F-NEXT:    vpmullw %ymm10, %ymm3, %ymm3
; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
; AVX512F-NEXT:    vpmullw %ymm11, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm5
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm7
; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT:    # ymm9 = mem[0,1,0,1]
; AVX512VL-NEXT:    vpmullw %ymm9, %ymm5, %ymm5
; AVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512VL-NEXT:    # ymm10 = mem[0,1,0,1]
; AVX512VL-NEXT:    vpmullw %ymm10, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vpackuswb %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm3
; AVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
; AVX512VL-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT:    vpmullw %ymm9, %ymm3, %ymm3
; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmullw %ymm10, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT:    vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT:    vpsllw $2, %zmm2, %zmm3
; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2 {%k1}
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VLBW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT:    vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
; AVX512VLBW-NEXT:    vpsllw $2, %zmm2, %zmm3
; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512VLBW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT:    vpaddb %zmm2, %zmm2, %zmm2 {%k1}
; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT:    retq
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
  ret <64 x i8> %res
}

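; For constant byte amounts the right-shift half is done at word granularity:
; the bytes are unpacked to words, multiplied or shifted against per-lane
; power-of-two constants, corrected with vpsrlw $8, and repacked with
; vpackuswb, while the left-shift half reuses the blend/mask ladder from the
; variable case, keyed off the [8192,24640,...] control vector.
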
;
; Uniform Constant Shifts
;

define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x) nounwind {
; AVX512-LABEL: splatconstant_funnnel_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolq $14, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
  ret <8 x i64> %res
}

define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind {
; AVX512-LABEL: splatconstant_funnnel_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
  ret <16 x i32> %res
}

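; A uniform constant amount selects the immediate forms vprolq/vprold instead
; of the variable vprolvq/vprolvd.
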
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpsrlw $9, %ymm1, %ymm2
; AVX512F-NEXT:    vpsllw $7, %ymm1, %ymm1
; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $9, %ymm0, %ymm2
; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vpsrlw $9, %ymm1, %ymm2
; AVX512VL-NEXT:    vpsllw $7, %ymm1, %ymm1
; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsrlw $9, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $9, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsllw $7, %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlw $9, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpsllw $7, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    retq
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <32 x i16> %res
}

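; Even AVX-512BW has no word rotate instruction, so rotate-by-7 is always
; vpsllw $7 + vpsrlw $9 + or; BW merely avoids splitting into 256-bit halves.
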
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT:    vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpsllw $4, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT:    vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm2
; AVX512VL-NEXT:    vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT:    vpternlogq $226, %ymm2, %ymm3, %ymm1
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpternlogq $226, %ymm2, %ymm3, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512VLBW-NEXT:    retq
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <64 x i8> %res
}
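
; On BW targets the two shifted copies are merged by a single vpternlogq whose
; immediate encodes a bitwise select against the 0xf0 mask constant loaded
; from memory, replacing the separate and/andn/or sequence of the AVX512F
; version.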