; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
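; AVX512 is the check prefix shared by all four configurations; the
; feature-specific prefixes (AVX512F/AVX512VL/AVX512BW/AVX512VLBW) carry the
; checks that differ between them.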

declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)

;
; Variable Shifts
;
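; Every test below calls fshl with both value operands equal, which by
; definition is a rotate left, so what is being checked is the rotate
; lowering: i64/i32 have native AVX512 variable rotates (vprolvq/vprolvd),
; while i16/i8 have to be emulated with shift/blend sequences.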
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %amt)
  ret <8 x i64> %res
}

define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %amt)
  ret <16 x i32> %res
}

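; v32i16: AVX512F/AVX512VL have no 16-bit variable shifts, so each 256-bit
; half is zero-extended to 32 bits, shifted with vpsllvd/vpsrlvd, and
; truncated back with vpmovdw. AVX512BW can use vpsllvw/vpsrlvw directly,
; masking the amount to [0,15] and shifting right by the negated amount
; masked the same way.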
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm0, %zmm5, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm0, %zmm5, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpsubw %zmm1, %zmm4, %zmm1
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm4, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
  ret <32 x i16> %res
}

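; v64i8: with no byte-granularity shifts, AVX512F/AVX512VL build the rotate
; from shifts by 4, 2 and 1 bit, selecting per byte with a vpblendvb ladder
; keyed off the amount (vpsllw $5 moves the relevant amount bit into the sign
; bit each round). AVX512BW does the same dance with k-register blends
; (vpmovb2m + vpblendmb/vmovdqu8).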
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpandn %ymm4, %ymm7, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm8
; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm9
; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $6, %ymm1, %ymm2
; AVX512F-NEXT: vpandn %ymm2, %ymm7, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm4
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
; AVX512F-NEXT: vpand %ymm8, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm4
; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpandn %ymm4, %ymm7, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm8
; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm8
; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm9
; AVX512VL-NEXT: vpor %ymm4, %ymm9, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vpandn %ymm2, %ymm5, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm4
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $6, %ymm1, %ymm2
; AVX512VL-NEXT: vpandn %ymm2, %ymm7, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm4
; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
; AVX512VL-NEXT: vpand %ymm8, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm4
; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vpmovb2m %zmm2, %k2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512BW-NEXT: vpsrlw $2, %zmm2, %zmm5
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm5
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm3
; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k2
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
; AVX512VLBW-NEXT: vpsrlw $2, %zmm2, %zmm5
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpsrlw $1, %zmm2, %zmm5
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
  ret <64 x i8> %res
}

;
; Uniform Variable Shifts
;
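; A splatted amount lets the i16/i8 cases shift whole vectors by a single
; scalar count kept in an xmm register (the vpsllw/vpsrlw forms with an xmm
; count operand) instead of doing per-element variable shifts.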
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %splat)
  ret <8 x i64> %res
}

define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %splat)
  ret <16 x i32> %res
}

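; The splatted i16 count is masked to [0,15]; the matching right-shift count
; is formed as 16-amt (AVX512F/AVX512VL) or as the masked negated amount
; (AVX512BW), and both are zero-extended for the scalar-count shift forms.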
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm2
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm2
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpsubw %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm4, %xmm1
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
  ret <32 x i16> %res
}

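; For splatted i8 amounts the bytes are shifted with the word-sized
; vpsllw/vpsrlw, and the bits that bleed across byte boundaries are stripped
; with a mask built by applying the same count to an all-ones vector
; (vpcmpeqd) and broadcasting the resulting byte.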
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512BW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm1
; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm1
; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
  ret <64 x i8> %res
}

;
; Constant Shifts
;
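; Constant per-element amounts let the rotate vector be folded into a memory
; operand (vprolvq/vprolvd with a (%rip) source). Without AVX512BW, the i16
; shifts are synthesized from multiplies: vpmulhuw by powers of two produces
; the right-shift half and vpmullw the left-shift half.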
define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x) nounwind {
; AVX512-LABEL: constant_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
  ret <8 x i64> %res
}

define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x) nounwind {
; AVX512-LABEL: constant_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
  ret <16 x i32> %res
}

define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
; AVX512VL-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
  ret <32 x i16> %res
}

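; v64i8 constant rotates: the left-shift half reuses the blend ladder with a
; precomputed control vector, while the right-shift half widens bytes to
; words (vpunpcklbw/vpunpckhbw), multiplies by per-lane power-of-two
; constants (or uses vpsllvw on AVX512BW), and repacks with vpackuswb.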
define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm9, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
  ret <64 x i8> %res
}

;
; Uniform Constant Shifts
;
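; A uniform constant amount uses the immediate rotate forms vprolq/vprold for
; i64/i32; for i16/i8 it becomes a fixed shift-left/shift-right-by-immediate
; pair OR'd together.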
define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x) nounwind {
; AVX512-LABEL: splatconstant_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
  ret <8 x i64> %res
}

define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind {
; AVX512-LABEL: splatconstant_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprold $4, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
  ret <16 x i32> %res
}

define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm1
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <32 x i16> %res
}

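; Rotating bytes by 4 swaps nibbles: shift right and left by 4 and recombine,
; with 0xF0 masks (vpand/vpandn) clearing the bits shifted across byte
; boundaries.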
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <64 x i8> %res
}