; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2

declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
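; The semantics being tested (summarizing the LangRef): for an element width
; of BW bits, fshl concatenates %x (as the high half) with %y (as the low
; half), shifts the double-wide value left by %amt modulo BW, and returns
; the high half:
;
;   fshl(%x, %y, %amt) = (%x << (%amt mod BW)) | (%y >> (BW - (%amt mod BW)))
;
; When %amt mod BW == 0 the result is %x unchanged; the (BW - 0)-bit right
; shift in the naive expansion would be out of range, which is why the
; non-VBMI2 lowerings below end with a test-for-zero and a masked select
; of %x.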
;
; Variable Shifts
;

define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
; AVX512BW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
; AVX512VLBW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v8i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
  ret <8 x i64> %res
}
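; Lane-by-lane, the pattern above computes (x << amt) | (y >> (64 - amt));
; vptestnmq then sets a %k1 bit for every lane whose masked amount is zero,
; and the merge-masked vmovdqa64 copies %x over those lanes. Targets with
; AVX512VBMI2 instead use the native funnel-shift instruction vpshldvq
; directly, with no zero-amount fixup needed.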
define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512VL-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512VLBW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt)
  ret <16 x i32> %res
}
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512F-NEXT: vpsllvd %zmm7, %zmm8, %zmm7
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %ymm5, %ymm8, %ymm9
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpsrlvd %zmm9, %zmm3, %zmm3
; AVX512F-NEXT: vpord %zmm3, %zmm7, %zmm3
; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
; AVX512F-NEXT: vpsubw %ymm2, %ymm8, %ymm5
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm5, %zmm1, %zmm1
; AVX512F-NEXT: vpord %zmm1, %zmm4, %zmm1
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512VL-NEXT: vpsllvd %zmm7, %zmm8, %zmm7
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %ymm5, %ymm8, %ymm9
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm9, %zmm3, %zmm3
; AVX512VL-NEXT: vpord %zmm3, %zmm7, %zmm3
; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
; AVX512VL-NEXT: vpsubw %ymm2, %ymm8, %ymm5
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm1, %zmm1
; AVX512VL-NEXT: vpord %zmm1, %zmm4, %zmm1
; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt)
  ret <32 x i16> %res
}
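; Note on the v32i16 case: plain AVX512F has no variable 16-bit shifts
; (vpsllvw/vpsrlvw require AVX512BW), so each 256-bit half is zero-extended
; to 32-bit lanes with vpmovzxwd, shifted with vpsllvd/vpsrlvd, and
; truncated back with vpmovdw. AVX512BW shifts the words directly, and
; AVX512VBMI2 again reduces the whole function to vpshldvw.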
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm8
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm6
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm9
; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm10
; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm3, %ymm7
; AVX512F-NEXT: vpsllw $2, %ymm7, %ymm11
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm6, %ymm11, %ymm11
; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm7
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm11
; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm10
; AVX512F-NEXT: vpsrlw $4, %ymm8, %ymm11
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm7, %ymm11, %ymm11
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %ymm9, %ymm12, %ymm13
; AVX512F-NEXT: vpsllw $5, %ymm13, %ymm13
; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
; AVX512F-NEXT: vpsrlw $2, %ymm8, %ymm11
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm14, %ymm11, %ymm11
; AVX512F-NEXT: vpaddb %ymm13, %ymm13, %ymm13
; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
; AVX512F-NEXT: vpsrlw $1, %ymm8, %ymm11
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm15, %ymm11, %ymm11
; AVX512F-NEXT: vpaddb %ymm13, %ymm13, %ymm13
; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
; AVX512F-NEXT: vpor %ymm8, %ymm10, %ymm8
; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10
; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm9, %ymm9
; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm8
; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm5
; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm8
; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm6
; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm5
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpsubb %ymm2, %ymm12, %ymm6
; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm5
; AVX512F-NEXT: vpand %ymm14, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm5
; AVX512F-NEXT: vpand %ymm15, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5
; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm9
; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm6
; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm10
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm11, %ymm10, %ymm10
; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm10
; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm9
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %ymm5, %ymm12, %ymm13
; AVX512VL-NEXT: vpsllw $5, %ymm13, %ymm13
; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm9
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512VL-NEXT: vpand %ymm14, %ymm9, %ymm9
; AVX512VL-NEXT: vpaddb %ymm13, %ymm13, %ymm13
; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm9
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-NEXT: vpand %ymm15, %ymm9, %ymm9
; AVX512VL-NEXT: vpaddb %ymm13, %ymm13, %ymm13
; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpand %ymm8, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm7
; AVX512VL-NEXT: vpand %ymm11, %ymm7, %ymm7
; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm10, %ymm5, %ymm5
; AVX512VL-NEXT: vpsubb %ymm2, %ymm12, %ymm7
; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm14, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm15, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %zmm2, %zmm3, %zmm3
; AVX512BW-NEXT: vpsllw $5, %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vpmovb2m %zmm3, %k2
; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k2
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpsllw $5, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k2
; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k2
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
  ret <64 x i8> %res
}
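; Note on the v64i8 case: x86 has no byte-granularity shifts, so the
; AVX512F/AVX512VL lowering shifts by 4, 2 and 1 with word shifts, masks off
; the bits that crossed a byte boundary (the 240/252/15/63/127 constants),
; and uses vpsllw $5 plus vpblendvb so each step applies only to bytes whose
; amount has the corresponding bit set (the amount byte is doubled with
; vpaddb to expose the next bit in the sign position). AVX512BW replaces the
; byte blends with vpmovb2m mask extraction and masked moves; even VBMI2
; cannot shortcut this one, since vpshldv* has no byte form.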
;
; Uniform Variable Shifts
;
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v8i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %zmm2
; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm3
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %splat)
  ret <8 x i64> %res
}
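; With a splatted amount the compiler switches from per-lane variable shifts
; to the uniform-shift forms that take a single count from the low quadword
; of an XMM register (vpsllq/vpsrlq with an %xmm count), which are cheaper
; than vpsllvq/vpsrlvq on most implementations.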
define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
; AVX512F-NEXT: vpslld %xmm3, %zmm0, %zmm3
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
; AVX512VL-NEXT: vpslld %xmm3, %zmm0, %zmm3
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v16i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
; AVX512BW-NEXT: vpslld %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpsrld %xmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v16i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
; AVX512VLBW-NEXT: vpslld %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpsrld %xmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %splat)
  ret <16 x i32> %res
}
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm5, %ymm4, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm2, %xmm7, %xmm7
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm7, %ymm3, %ymm3
; AVX512F-NEXT: vpor %ymm3, %ymm6, %ymm3
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm4
; AVX512F-NEXT: vpsrlw %xmm7, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm5, %ymm4, %ymm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm2, %xmm7, %xmm7
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm7, %ymm3, %ymm3
; AVX512VL-NEXT: vpor %ymm3, %ymm6, %ymm3
; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm4
; AVX512VL-NEXT: vpsrlw %xmm7, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %zmm2
; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %splat)
  ret <32 x i16> %res
}
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm9
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm5, %ymm4, %ymm6
; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
; AVX512F-NEXT: vpsllw %xmm5, %xmm8, %xmm7
; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm10
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm3, %ymm9, %ymm9
; AVX512F-NEXT: vpsrlw %xmm3, %xmm8, %xmm6
; AVX512F-NEXT: vpsrlw $8, %xmm6, %xmm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm8
; AVX512F-NEXT: vpor %ymm8, %ymm10, %ymm8
; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9
; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm5
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm9
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm5, %ymm4, %ymm6
; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
; AVX512VL-NEXT: vpsllw %xmm5, %xmm8, %xmm7
; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm10
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm9, %ymm9
; AVX512VL-NEXT: vpsrlw %xmm3, %xmm8, %xmm6
; AVX512VL-NEXT: vpsrlw $8, %xmm6, %xmm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8
; AVX512VL-NEXT: vpor %ymm8, %ymm10, %ymm8
; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm5
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512BW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
; AVX512BW-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
; AVX512VBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
; AVX512VLBW-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLBW-NEXT: vpandq %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %splat)
  ret <64 x i8> %res
}
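; The splatted v64i8 case shifts whole words and then repairs the byte
; lanes: vpcmpeqd on a register with itself materializes all-ones, the same
; word shift is applied to it, and vpbroadcastb of the low byte (or, for the
; right shift, the high byte extracted with vpsrlw $8) yields the mask that
; clears bits dragged in from the neighboring byte.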
;
; Constant Shifts
;

define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v8i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldvq {{.*}}(%rip), %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v8i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvq {{.*}}(%rip), %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
  ret <8 x i64> %res
}
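; With compile-time constant amounts (all non-zero here) the zero-amount
; select disappears entirely: both the left-shift counts and the
; complementary right-shift counts are folded into constant-pool vectors,
; leaving just two variable shifts and an OR.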
define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v16i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v16i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v16i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldvd {{.*}}(%rip), %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v16i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvd {{.*}}(%rip), %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
  ret <16 x i32> %res
}
define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm5
; AVX512F-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX512F-NEXT: vpmulhuw %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm3
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm5
; AVX512VL-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm3
; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm2
; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldvw {{.*}}(%rip), %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm2
; AVX512VLBW-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512VLBW-NEXT: movl $65537, %eax # imm = 0x10001
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvw {{.*}}(%rip), %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
  ret <32 x i16> %res
}

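; x86 has no per-byte shifts, so the v64i8 lowering below synthesizes the
; left shift of %x with a blend ladder keyed off the bits of the shift amount
; (vpblendvb on AVX512F/VL, vpmovb2m plus masked moves on the BW paths), and
; the right shift of %y by unpacking each half to 16-bit lanes, multiplying
; by a power of two, and repacking with vpackuswb. Bytes whose amount is
; 0 mod 8 (every eighth byte: amounts 0 and 8) must remain %x, hence the
; final blend that reselects %x in every eighth byte (k-mask imm
; 0x0101010101010101 on the BW paths).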
define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm7
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm8, %ymm7, %ymm7
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm9
; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm10
; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15],ymm2[24],ymm7[24],ymm2[25],ymm7[25],ymm2[26],ymm7[26],ymm2[27],ymm7[27],ymm2[28],ymm7[28],ymm2[29],ymm7[29],ymm2[30],ymm7[30],ymm2[31],ymm7[31]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512F-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm12, %ymm11, %ymm11
; AVX512F-NEXT: vpsrlw $8, %ymm11, %ymm11
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512F-NEXT: # ymm13 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm13, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm11, %ymm2, %ymm2
; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm3
; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm5
; AVX512F-NEXT: vpand %ymm8, %ymm5, %ymm5
; AVX512F-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm5
; AVX512F-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15],ymm1[24],ymm7[24],ymm1[25],ymm7[25],ymm1[26],ymm7[26],ymm1[27],ymm7[27],ymm1[28],ymm7[28],ymm1[29],ymm7[29],ymm1[30],ymm7[30],ymm1[31],ymm7[31]
; AVX512F-NEXT: vpmullw %ymm12, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[16],ymm7[16],ymm1[17],ymm7[17],ymm1[18],ymm7[18],ymm1[19],ymm7[19],ymm1[20],ymm7[20],ymm1[21],ymm7[21],ymm1[22],ymm7[22],ymm1[23],ymm7[23]
; AVX512F-NEXT: vpmullw %ymm13, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm7
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm8, %ymm7, %ymm7
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm9
; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm10
; AVX512VL-NEXT: vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm7, %ymm7
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm11, %ymm7, %ymm7
; AVX512VL-NEXT: vpsrlw $8, %ymm7, %ymm7
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512VL-NEXT: # ymm12 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm12, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT: vpackuswb %ymm7, %ymm2, %ymm2
; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5
; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm5
; AVX512VL-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vpmullw %ymm11, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpmullw %ymm12, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
; AVX512VBMI2-NEXT: kmovq %rax, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VLBW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
; AVX512VLBW-NEXT: kmovq %rax, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VLVBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLVBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
; AVX512VLVBMI2-NEXT: kmovq %rax, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
  ret <64 x i8> %res
}

;
; Uniform Constant Shifts
;

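; With a uniform (splat) constant amount the expansion folds to two immediate
; shifts plus an OR: fshl(%x, %y, C) == (%x << C) | (%y lshr (bits - C)),
; e.g. C=14 on i64 lanes pairs vpsllq $14 with vpsrlq $50. VBMI2 targets
; instead emit a single funnel-shift instruction, vpshld{q,d,w} $C.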
define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlq $50, %zmm1, %zmm1
; AVX512F-NEXT: vpsllq $14, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $50, %zmm1, %zmm1
; AVX512VL-NEXT: vpsllq $14, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlq $50, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllq $14, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldq $14, %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlq $50, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllq $14, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldq $14, %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
  ret <8 x i64> %res
}

define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrld $28, %zmm1, %zmm1
; AVX512F-NEXT: vpslld $4, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v16i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $28, %zmm1, %zmm1
; AVX512VL-NEXT: vpslld $4, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrld $28, %zmm1, %zmm1
; AVX512BW-NEXT: vpslld $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldd $4, %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrld $28, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpslld $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldd $4, %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
  ret <16 x i32> %res
}

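; AVX512F/AVX512VL provide no 512-bit shifts at 16-bit granularity, so the
; v32i16 case below is split into two 256-bit halves with
; vextracti64x4/vinserti64x4; the BW and VBMI2 variants operate on the full
; zmm register directly.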
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vpsrlw $9, %ymm3, %ymm3
; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vpsrlw $9, %ymm3, %ymm3
; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm2
; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $9, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpshldw $7, %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $9, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $7, %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <32 x i16> %res
}

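; Even VBMI2 has no byte-granularity funnel shift (vpshldw is the narrowest),
; so every v64i8 variant below emulates the byte shifts with 16-bit shifts
; plus masking: vpand with 0xF0 keeps the valid bits of %x << 4, and vpandn
; keeps the low nibble of %y >> 4.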
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpandn %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2
; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vpandn %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: retq
  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <64 x i8> %res
}