1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; Declarations of the funnel-shift-left intrinsic (llvm.fshl) at each 512-bit
; vector element width exercised by the test functions below (i64/i32/i16/i8
; lanes). Per LangRef, fshl concatenates the first two operands and shifts
; left by the third operand modulo the element bit width.
; NOTE(review): every line in this chunk carries a stray leading number
; (e.g. "9 ") that is not valid LLVM IR -- looks like an extraction artifact;
; confirm against the checked-in test file before committing any change here.
9 declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
10 declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
11 declare <32 x i16> @llvm.fshl.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
12 declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
18 define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
19 ; AVX512F-LABEL: var_funnnel_v8i64:
21 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
22 ; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
23 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
24 ; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm4
25 ; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
26 ; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
27 ; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
28 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
29 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
32 ; AVX512VL-LABEL: var_funnnel_v8i64:
34 ; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
35 ; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
36 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
37 ; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm4
38 ; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
39 ; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
40 ; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
41 ; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
42 ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
45 ; AVX512BW-LABEL: var_funnnel_v8i64:
47 ; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
48 ; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
49 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
50 ; AVX512BW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
51 ; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
52 ; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
53 ; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
54 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
55 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
58 ; AVX512VBMI2-LABEL: var_funnnel_v8i64:
59 ; AVX512VBMI2: # %bb.0:
60 ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
61 ; AVX512VBMI2-NEXT: retq
63 ; AVX512VLBW-LABEL: var_funnnel_v8i64:
64 ; AVX512VLBW: # %bb.0:
65 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
66 ; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
67 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
68 ; AVX512VLBW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
69 ; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
70 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
71 ; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
72 ; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
73 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
74 ; AVX512VLBW-NEXT: retq
76 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i64:
77 ; AVX512VLVBMI2: # %bb.0:
78 ; AVX512VLVBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
79 ; AVX512VLVBMI2-NEXT: retq
80 %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
84 define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
85 ; AVX512F-LABEL: var_funnnel_v16i32:
87 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
88 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
89 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
90 ; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4
91 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
92 ; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
93 ; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
94 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
95 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
98 ; AVX512VL-LABEL: var_funnnel_v16i32:
100 ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
101 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
102 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
103 ; AVX512VL-NEXT: vpsubd %zmm2, %zmm4, %zmm4
104 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
105 ; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
106 ; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
107 ; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
108 ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
109 ; AVX512VL-NEXT: retq
111 ; AVX512BW-LABEL: var_funnnel_v16i32:
113 ; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
114 ; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
115 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
116 ; AVX512BW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
117 ; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
118 ; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1
119 ; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
120 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
121 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
122 ; AVX512BW-NEXT: retq
124 ; AVX512VBMI2-LABEL: var_funnnel_v16i32:
125 ; AVX512VBMI2: # %bb.0:
126 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
127 ; AVX512VBMI2-NEXT: retq
129 ; AVX512VLBW-LABEL: var_funnnel_v16i32:
130 ; AVX512VLBW: # %bb.0:
131 ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
132 ; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
133 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
134 ; AVX512VLBW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
135 ; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
136 ; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1
137 ; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
138 ; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
139 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
140 ; AVX512VLBW-NEXT: retq
142 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i32:
143 ; AVX512VLVBMI2: # %bb.0:
144 ; AVX512VLVBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
145 ; AVX512VLVBMI2-NEXT: retq
146 %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt)
150 define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
151 ; AVX512F-LABEL: var_funnnel_v32i16:
153 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
154 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
155 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
156 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
157 ; AVX512F-NEXT: vpsllvd %zmm7, %zmm8, %zmm7
158 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
159 ; AVX512F-NEXT: vpsubw %ymm4, %ymm8, %ymm9
160 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
161 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
162 ; AVX512F-NEXT: vpsrlvd %zmm9, %zmm2, %zmm2
163 ; AVX512F-NEXT: vpord %zmm2, %zmm7, %zmm2
164 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
165 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
166 ; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm4, %ymm4
167 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
168 ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm2
169 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
170 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
171 ; AVX512F-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
172 ; AVX512F-NEXT: vpsubw %ymm2, %ymm8, %ymm5
173 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
174 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
175 ; AVX512F-NEXT: vpsrlvd %zmm5, %zmm3, %zmm3
176 ; AVX512F-NEXT: vpord %zmm3, %zmm4, %zmm3
177 ; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
178 ; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
179 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
182 ; AVX512VL-LABEL: var_funnnel_v32i16:
184 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
185 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
186 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
187 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
188 ; AVX512VL-NEXT: vpsllvd %zmm7, %zmm8, %zmm7
189 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
190 ; AVX512VL-NEXT: vpsubw %ymm4, %ymm8, %ymm9
191 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
192 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
193 ; AVX512VL-NEXT: vpsrlvd %zmm9, %zmm2, %zmm2
194 ; AVX512VL-NEXT: vpord %zmm2, %zmm7, %zmm2
195 ; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
196 ; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7
197 ; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm4, %ymm4
198 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
199 ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm2
200 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
201 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
202 ; AVX512VL-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
203 ; AVX512VL-NEXT: vpsubw %ymm2, %ymm8, %ymm5
204 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
205 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
206 ; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm3, %zmm3
207 ; AVX512VL-NEXT: vpord %zmm3, %zmm4, %zmm3
208 ; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
209 ; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
210 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
211 ; AVX512VL-NEXT: retq
213 ; AVX512BW-LABEL: var_funnnel_v32i16:
215 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
216 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
217 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
218 ; AVX512BW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
219 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
220 ; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
221 ; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
222 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
223 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
224 ; AVX512BW-NEXT: retq
226 ; AVX512VBMI2-LABEL: var_funnnel_v32i16:
227 ; AVX512VBMI2: # %bb.0:
228 ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
229 ; AVX512VBMI2-NEXT: retq
231 ; AVX512VLBW-LABEL: var_funnnel_v32i16:
232 ; AVX512VLBW: # %bb.0:
233 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
234 ; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
235 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
236 ; AVX512VLBW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
237 ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
238 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
239 ; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
240 ; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
241 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
242 ; AVX512VLBW-NEXT: retq
244 ; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
245 ; AVX512VLVBMI2: # %bb.0:
246 ; AVX512VLVBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
247 ; AVX512VLVBMI2-NEXT: retq
248 %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt)
252 define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
253 ; AVX512F-LABEL: var_funnnel_v64i8:
255 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm7
256 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
257 ; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm8
258 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
259 ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm9
260 ; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm10
261 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm8, %ymm0, %ymm8
262 ; AVX512F-NEXT: vpsllw $2, %ymm8, %ymm11
263 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
264 ; AVX512F-NEXT: vpand %ymm4, %ymm11, %ymm11
265 ; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
266 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm8
267 ; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm11
268 ; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
269 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm10
270 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm11
271 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
272 ; AVX512F-NEXT: vpand %ymm8, %ymm11, %ymm11
273 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
274 ; AVX512F-NEXT: vpsubb %ymm9, %ymm12, %ymm13
275 ; AVX512F-NEXT: vpsllw $5, %ymm13, %ymm13
276 ; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm2, %ymm2
277 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm11
278 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
279 ; AVX512F-NEXT: vpand %ymm14, %ymm11, %ymm11
280 ; AVX512F-NEXT: vpaddb %ymm13, %ymm13, %ymm13
281 ; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm2, %ymm2
282 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm11
283 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
284 ; AVX512F-NEXT: vpand %ymm15, %ymm11, %ymm11
285 ; AVX512F-NEXT: vpaddb %ymm13, %ymm13, %ymm13
286 ; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm2, %ymm2
287 ; AVX512F-NEXT: vpor %ymm2, %ymm10, %ymm2
288 ; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10
289 ; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm9, %ymm9
290 ; AVX512F-NEXT: vpblendvb %ymm9, %ymm0, %ymm2, %ymm0
291 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
292 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
293 ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
294 ; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm6
295 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm2
296 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm7
297 ; AVX512F-NEXT: vpand %ymm4, %ymm7, %ymm4
298 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
299 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
300 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm4
301 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
302 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
303 ; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
304 ; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4
305 ; AVX512F-NEXT: vpsubb %ymm5, %ymm12, %ymm6
306 ; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
307 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
308 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm4
309 ; AVX512F-NEXT: vpand %ymm14, %ymm4, %ymm4
310 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
311 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
312 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm4
313 ; AVX512F-NEXT: vpand %ymm15, %ymm4, %ymm4
314 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
315 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
316 ; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
317 ; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm5, %ymm3
318 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
321 ; AVX512VL-LABEL: var_funnnel_v64i8:
323 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
324 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
325 ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
326 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
327 ; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4
328 ; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm9
329 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm6
330 ; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm10
331 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
332 ; AVX512VL-NEXT: vpand %ymm11, %ymm10, %ymm10
333 ; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
334 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
335 ; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm10
336 ; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
337 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
338 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm9
339 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
340 ; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9
341 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
342 ; AVX512VL-NEXT: vpsubb %ymm4, %ymm12, %ymm13
343 ; AVX512VL-NEXT: vpsllw $5, %ymm13, %ymm13
344 ; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm2, %ymm2
345 ; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm9
346 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
347 ; AVX512VL-NEXT: vpand %ymm14, %ymm9, %ymm9
348 ; AVX512VL-NEXT: vpaddb %ymm13, %ymm13, %ymm13
349 ; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm2, %ymm2
350 ; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm9
351 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
352 ; AVX512VL-NEXT: vpand %ymm15, %ymm9, %ymm9
353 ; AVX512VL-NEXT: vpaddb %ymm13, %ymm13, %ymm13
354 ; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm2, %ymm2
355 ; AVX512VL-NEXT: vpor %ymm2, %ymm6, %ymm2
356 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
357 ; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm4
358 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
359 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
360 ; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
361 ; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm4
362 ; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm5
363 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm2
364 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm7
365 ; AVX512VL-NEXT: vpand %ymm11, %ymm7, %ymm7
366 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
367 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
368 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm7
369 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
370 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
371 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm5
372 ; AVX512VL-NEXT: vpand %ymm10, %ymm5, %ymm5
373 ; AVX512VL-NEXT: vpsubb %ymm4, %ymm12, %ymm7
374 ; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
375 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
376 ; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm5
377 ; AVX512VL-NEXT: vpand %ymm14, %ymm5, %ymm5
378 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
379 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
380 ; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm5
381 ; AVX512VL-NEXT: vpand %ymm15, %ymm5, %ymm5
382 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
383 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
384 ; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
385 ; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm3
386 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
387 ; AVX512VL-NEXT: retq
389 ; AVX512BW-LABEL: var_funnnel_v64i8:
391 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
392 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
393 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm3, %zmm3
394 ; AVX512BW-NEXT: vpsllw $5, %zmm3, %zmm3
395 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
396 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
397 ; AVX512BW-NEXT: vpmovb2m %zmm3, %k2
398 ; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm3
399 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
400 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
401 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm3
402 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
403 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
404 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm3
405 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
406 ; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
407 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
408 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
409 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
410 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
411 ; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm4
412 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
413 ; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
414 ; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm5
415 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
416 ; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
417 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
418 ; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
419 ; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
420 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
421 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
422 ; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
423 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
424 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
425 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
426 ; AVX512BW-NEXT: retq
428 ; AVX512VBMI2-LABEL: var_funnnel_v64i8:
429 ; AVX512VBMI2: # %bb.0:
430 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
431 ; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
432 ; AVX512VBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3
433 ; AVX512VBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
434 ; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
435 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
436 ; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k2
437 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
438 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
439 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
440 ; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3
441 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
442 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
443 ; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3
444 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
445 ; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
446 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
447 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
448 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
449 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
450 ; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
451 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
452 ; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
453 ; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
454 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
455 ; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
456 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
457 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
458 ; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
459 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
460 ; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
461 ; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
462 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
463 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
464 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
465 ; AVX512VBMI2-NEXT: retq
467 ; AVX512VLBW-LABEL: var_funnnel_v64i8:
468 ; AVX512VLBW: # %bb.0:
469 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
470 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
471 ; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm3, %zmm3
472 ; AVX512VLBW-NEXT: vpsllw $5, %zmm3, %zmm3
473 ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
474 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
475 ; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k2
476 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm3
477 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
478 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
479 ; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm3
480 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
481 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
482 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm3
483 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
484 ; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
485 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
486 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
487 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
488 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
489 ; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm4
490 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
491 ; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
492 ; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm5
493 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
494 ; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
495 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
496 ; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
497 ; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
498 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
499 ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
500 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
501 ; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
502 ; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
503 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
504 ; AVX512VLBW-NEXT: retq
506 ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
507 ; AVX512VLVBMI2: # %bb.0:
508 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
509 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
510 ; AVX512VLVBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3
511 ; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
512 ; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
513 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
514 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k2
515 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
516 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
517 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
518 ; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3
519 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
520 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
521 ; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3
522 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
523 ; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
524 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
525 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
526 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
527 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
528 ; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
529 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
530 ; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
531 ; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
532 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
533 ; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
534 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
535 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
536 ; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
537 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
538 ; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
539 ; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
540 ; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
541 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
542 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
543 ; AVX512VLVBMI2-NEXT: retq
544 %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
549 ; Uniform Variable Shifts
552 define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
553 ; AVX512F-LABEL: splatvar_funnnel_v8i64:
555 ; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2
556 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
557 ; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm3
558 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
559 ; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
560 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
561 ; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
562 ; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
563 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
564 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
567 ; AVX512VL-LABEL: splatvar_funnnel_v8i64:
569 ; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
570 ; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
571 ; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm3
572 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
573 ; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
574 ; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
575 ; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
576 ; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
577 ; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
578 ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
579 ; AVX512VL-NEXT: retq
581 ; AVX512BW-LABEL: splatvar_funnnel_v8i64:
583 ; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
584 ; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
585 ; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm3
586 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
587 ; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
588 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
589 ; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
590 ; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
591 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
592 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
593 ; AVX512BW-NEXT: retq
595 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i64:
596 ; AVX512VBMI2: # %bb.0:
597 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %zmm2
598 ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
599 ; AVX512VBMI2-NEXT: retq
601 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
602 ; AVX512VLBW: # %bb.0:
603 ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2
604 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
605 ; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm3
606 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
607 ; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
608 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
609 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
610 ; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
611 ; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
612 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
613 ; AVX512VLBW-NEXT: retq
615 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i64:
616 ; AVX512VLVBMI2: # %bb.0:
617 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %zmm2
618 ; AVX512VLVBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
619 ; AVX512VLVBMI2-NEXT: retq
620 %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
621 %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %splat)
625 define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
626 ; AVX512F-LABEL: splatvar_funnnel_v16i32:
628 ; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2
629 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
630 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
631 ; AVX512F-NEXT: vpslld %xmm3, %zmm0, %zmm3
632 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
633 ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
634 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
635 ; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1
636 ; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
637 ; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
638 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
639 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
642 ; AVX512VL-LABEL: splatvar_funnnel_v16i32:
644 ; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2
645 ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
646 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
647 ; AVX512VL-NEXT: vpslld %xmm3, %zmm0, %zmm3
648 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
649 ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
650 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
651 ; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1
652 ; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
653 ; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
654 ; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
655 ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
656 ; AVX512VL-NEXT: retq
658 ; AVX512BW-LABEL: splatvar_funnnel_v16i32:
660 ; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2
661 ; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
662 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
663 ; AVX512BW-NEXT: vpslld %xmm3, %zmm0, %zmm3
664 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
665 ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
666 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
667 ; AVX512BW-NEXT: vpsrld %xmm4, %zmm1, %zmm1
668 ; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1
669 ; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
670 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
671 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
672 ; AVX512BW-NEXT: retq
674 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i32:
675 ; AVX512VBMI2: # %bb.0:
676 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %zmm2
677 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
678 ; AVX512VBMI2-NEXT: retq
680 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
681 ; AVX512VLBW: # %bb.0:
682 ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2
683 ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
684 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
685 ; AVX512VLBW-NEXT: vpslld %xmm3, %zmm0, %zmm3
686 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
687 ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
688 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
689 ; AVX512VLBW-NEXT: vpsrld %xmm4, %zmm1, %zmm1
690 ; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1
691 ; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
692 ; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
693 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
694 ; AVX512VLBW-NEXT: retq
696 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i32:
697 ; AVX512VLVBMI2: # %bb.0:
698 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %zmm2
699 ; AVX512VLVBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
700 ; AVX512VLVBMI2-NEXT: retq
701 %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
702 %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %splat)
706 define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
707 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
709 ; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4
710 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
711 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
712 ; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm6
713 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
714 ; AVX512F-NEXT: vpsubw %xmm4, %xmm7, %xmm7
715 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
716 ; AVX512F-NEXT: vpsrlw %xmm7, %ymm2, %ymm2
717 ; AVX512F-NEXT: vpor %ymm2, %ymm6, %ymm2
718 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
719 ; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm4, %ymm4
720 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
721 ; AVX512F-NEXT: vpsllw %xmm5, %ymm1, %ymm2
722 ; AVX512F-NEXT: vpsrlw %xmm7, %ymm3, %ymm3
723 ; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
724 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
727 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
729 ; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4
730 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
731 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
732 ; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm6
733 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
734 ; AVX512VL-NEXT: vpsubw %xmm4, %xmm7, %xmm7
735 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
736 ; AVX512VL-NEXT: vpsrlw %xmm7, %ymm2, %ymm2
737 ; AVX512VL-NEXT: vpor %ymm2, %ymm6, %ymm2
738 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
739 ; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm4, %ymm4
740 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
741 ; AVX512VL-NEXT: vpsllw %xmm5, %ymm1, %ymm2
742 ; AVX512VL-NEXT: vpsrlw %xmm7, %ymm3, %ymm3
743 ; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
744 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
745 ; AVX512VL-NEXT: retq
747 ; AVX512BW-LABEL: splatvar_funnnel_v32i16:
749 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2
750 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
751 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
752 ; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
753 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
754 ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
755 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
756 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
757 ; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
758 ; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
759 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
760 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
761 ; AVX512BW-NEXT: retq
763 ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16:
764 ; AVX512VBMI2: # %bb.0:
765 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %zmm2
766 ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
767 ; AVX512VBMI2-NEXT: retq
769 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
770 ; AVX512VLBW: # %bb.0:
771 ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2
772 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
773 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
774 ; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
775 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
776 ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
777 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
778 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
779 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
780 ; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
781 ; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
782 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
783 ; AVX512VLBW-NEXT: retq
785 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16:
786 ; AVX512VLVBMI2: # %bb.0:
787 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %zmm2
788 ; AVX512VLVBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
789 ; AVX512VLVBMI2-NEXT: retq
790 %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
791 %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %splat)
795 define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
796 ; AVX512F-LABEL: splatvar_funnnel_v64i8:
798 ; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
799 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
800 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
801 ; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm6
802 ; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
803 ; AVX512F-NEXT: vpsllw %xmm5, %xmm8, %xmm7
804 ; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
805 ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm9
806 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
807 ; AVX512F-NEXT: vpsubb %xmm4, %xmm6, %xmm6
808 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
809 ; AVX512F-NEXT: vpsrlw %xmm6, %ymm2, %ymm10
810 ; AVX512F-NEXT: vpsrlw %xmm6, %xmm8, %xmm2
811 ; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2
812 ; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
813 ; AVX512F-NEXT: vpand %ymm2, %ymm10, %ymm8
814 ; AVX512F-NEXT: vpor %ymm8, %ymm9, %ymm8
815 ; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9
816 ; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4
817 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm8, %ymm0
818 ; AVX512F-NEXT: vpsllw %xmm5, %ymm1, %ymm5
819 ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
820 ; AVX512F-NEXT: vpsrlw %xmm6, %ymm3, %ymm3
821 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
822 ; AVX512F-NEXT: vpor %ymm2, %ymm5, %ymm2
823 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
826 ; AVX512VL-LABEL: splatvar_funnnel_v64i8:
828 ; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
829 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
830 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
831 ; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm6
832 ; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
833 ; AVX512VL-NEXT: vpsllw %xmm5, %xmm8, %xmm7
834 ; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
835 ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm9
836 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
837 ; AVX512VL-NEXT: vpsubb %xmm4, %xmm6, %xmm6
838 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
839 ; AVX512VL-NEXT: vpsrlw %xmm6, %ymm2, %ymm10
840 ; AVX512VL-NEXT: vpsrlw %xmm6, %xmm8, %xmm2
841 ; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2
842 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
843 ; AVX512VL-NEXT: vpand %ymm2, %ymm10, %ymm8
844 ; AVX512VL-NEXT: vpor %ymm8, %ymm9, %ymm8
845 ; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
846 ; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4
847 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm8, %ymm0
848 ; AVX512VL-NEXT: vpsllw %xmm5, %ymm1, %ymm5
849 ; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
850 ; AVX512VL-NEXT: vpsrlw %xmm6, %ymm3, %ymm3
851 ; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
852 ; AVX512VL-NEXT: vpor %ymm2, %ymm5, %ymm2
853 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
854 ; AVX512VL-NEXT: retq
856 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
858 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
859 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
860 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
861 ; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
862 ; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
863 ; AVX512BW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
864 ; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
865 ; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
866 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
867 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
868 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
869 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
870 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
871 ; AVX512BW-NEXT: vpsrlw $8, %xmm4, %xmm4
872 ; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
873 ; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1
874 ; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
875 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
876 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
877 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
878 ; AVX512BW-NEXT: retq
880 ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
881 ; AVX512VBMI2: # %bb.0:
882 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
883 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
884 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
885 ; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
886 ; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
887 ; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
888 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
889 ; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
890 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
891 ; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
892 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
893 ; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
894 ; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
895 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4
896 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
897 ; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
898 ; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
899 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
900 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
901 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
902 ; AVX512VBMI2-NEXT: retq
904 ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
905 ; AVX512VLBW: # %bb.0:
906 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
907 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
908 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
909 ; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
910 ; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
911 ; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
912 ; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
913 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
914 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
915 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
916 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
917 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
918 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
919 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm4, %xmm4
920 ; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
921 ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm1, %zmm1
922 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
923 ; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
924 ; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
925 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
926 ; AVX512VLBW-NEXT: retq
928 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
929 ; AVX512VLVBMI2: # %bb.0:
930 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
931 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
932 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
933 ; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
934 ; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
935 ; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
936 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
937 ; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
938 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
939 ; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
940 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
941 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
942 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
943 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4
944 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
945 ; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
946 ; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
947 ; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
948 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
949 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
950 ; AVX512VLVBMI2-NEXT: retq
951 %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
952 %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %splat)
960 define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
961 ; AVX512F-LABEL: constant_funnnel_v8i64:
963 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
964 ; AVX512F-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
965 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
968 ; AVX512VL-LABEL: constant_funnnel_v8i64:
970 ; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
971 ; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
972 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
973 ; AVX512VL-NEXT: retq
975 ; AVX512BW-LABEL: constant_funnnel_v8i64:
977 ; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
978 ; AVX512BW-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
979 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
980 ; AVX512BW-NEXT: retq
982 ; AVX512VBMI2-LABEL: constant_funnnel_v8i64:
983 ; AVX512VBMI2: # %bb.0:
984 ; AVX512VBMI2-NEXT: vpshldvq {{.*}}(%rip), %zmm1, %zmm0
985 ; AVX512VBMI2-NEXT: retq
987 ; AVX512VLBW-LABEL: constant_funnnel_v8i64:
988 ; AVX512VLBW: # %bb.0:
989 ; AVX512VLBW-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
990 ; AVX512VLBW-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
991 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
992 ; AVX512VLBW-NEXT: retq
994 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i64:
995 ; AVX512VLVBMI2: # %bb.0:
996 ; AVX512VLVBMI2-NEXT: vpshldvq {{.*}}(%rip), %zmm1, %zmm0
997 ; AVX512VLVBMI2-NEXT: retq
998 %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
1002 define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
1003 ; AVX512F-LABEL: constant_funnnel_v16i32:
1005 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1006 ; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
1007 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
1008 ; AVX512F-NEXT: retq
1010 ; AVX512VL-LABEL: constant_funnnel_v16i32:
1011 ; AVX512VL: # %bb.0:
1012 ; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1013 ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
1014 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
1015 ; AVX512VL-NEXT: retq
1017 ; AVX512BW-LABEL: constant_funnnel_v16i32:
1018 ; AVX512BW: # %bb.0:
1019 ; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1020 ; AVX512BW-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
1021 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
1022 ; AVX512BW-NEXT: retq
1024 ; AVX512VBMI2-LABEL: constant_funnnel_v16i32:
1025 ; AVX512VBMI2: # %bb.0:
1026 ; AVX512VBMI2-NEXT: vpshldvd {{.*}}(%rip), %zmm1, %zmm0
1027 ; AVX512VBMI2-NEXT: retq
1029 ; AVX512VLBW-LABEL: constant_funnnel_v16i32:
1030 ; AVX512VLBW: # %bb.0:
1031 ; AVX512VLBW-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1032 ; AVX512VLBW-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
1033 ; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
1034 ; AVX512VLBW-NEXT: retq
1036 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i32:
1037 ; AVX512VLVBMI2: # %bb.0:
1038 ; AVX512VLVBMI2-NEXT: vpshldvd {{.*}}(%rip), %zmm1, %zmm0
1039 ; AVX512VLVBMI2-NEXT: retq
1040 %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
1044 define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
1045 ; AVX512F-LABEL: constant_funnnel_v32i16:
1047 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
1048 ; AVX512F-NEXT: vpmulhuw %ymm4, %ymm2, %ymm2
1049 ; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm5
1050 ; AVX512F-NEXT: vpor %ymm2, %ymm5, %ymm2
1051 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
1052 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
1053 ; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm2
1054 ; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm3
1055 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
1056 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
1057 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1058 ; AVX512F-NEXT: retq
1060 ; AVX512VL-LABEL: constant_funnnel_v32i16:
1061 ; AVX512VL: # %bb.0:
1062 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
1063 ; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm2, %ymm2
1064 ; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm5
1065 ; AVX512VL-NEXT: vpor %ymm2, %ymm5, %ymm2
1066 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
1067 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
1068 ; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm2
1069 ; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm3
1070 ; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
1071 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
1072 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1073 ; AVX512VL-NEXT: retq
1075 ; AVX512BW-LABEL: constant_funnnel_v32i16:
1076 ; AVX512BW: # %bb.0:
1077 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
1078 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm2
1079 ; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1
1080 ; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
1081 ; AVX512BW-NEXT: kmovd %eax, %k1
1082 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
1083 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1084 ; AVX512BW-NEXT: retq
1086 ; AVX512VBMI2-LABEL: constant_funnnel_v32i16:
1087 ; AVX512VBMI2: # %bb.0:
1088 ; AVX512VBMI2-NEXT: vpshldvw {{.*}}(%rip), %zmm1, %zmm0
1089 ; AVX512VBMI2-NEXT: retq
1091 ; AVX512VLBW-LABEL: constant_funnnel_v32i16:
1092 ; AVX512VLBW: # %bb.0:
1093 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
1094 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm2
1095 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm2, %zmm1
1096 ; AVX512VLBW-NEXT: movl $65537, %eax # imm = 0x10001
1097 ; AVX512VLBW-NEXT: kmovd %eax, %k1
1098 ; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
1099 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
1100 ; AVX512VLBW-NEXT: retq
1102 ; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16:
1103 ; AVX512VLVBMI2: # %bb.0:
1104 ; AVX512VLVBMI2-NEXT: vpshldvw {{.*}}(%rip), %zmm1, %zmm0
1105 ; AVX512VLVBMI2-NEXT: retq
1106 %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
1110 define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
1111 ; AVX512F-LABEL: constant_funnnel_v64i8:
1113 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
1114 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1115 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
1116 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1117 ; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
1118 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm4
1119 ; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm7
1120 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
1121 ; AVX512F-NEXT: vpand %ymm8, %ymm7, %ymm7
1122 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm9
1123 ; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
1124 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
1125 ; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm10
1126 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
1127 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
1128 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15],ymm2[24],ymm7[24],ymm2[25],ymm7[25],ymm2[26],ymm7[26],ymm2[27],ymm7[27],ymm2[28],ymm7[28],ymm2[29],ymm7[29],ymm2[30],ymm7[30],ymm2[31],ymm7[31]
1129 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
1130 ; AVX512F-NEXT: # ymm12 = mem[0,1,0,1]
1131 ; AVX512F-NEXT: vpmullw %ymm12, %ymm11, %ymm11
1132 ; AVX512F-NEXT: vpsrlw $8, %ymm11, %ymm11
1133 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23]
1134 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
1135 ; AVX512F-NEXT: # ymm13 = mem[0,1,0,1]
1136 ; AVX512F-NEXT: vpmullw %ymm13, %ymm2, %ymm2
1137 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
1138 ; AVX512F-NEXT: vpackuswb %ymm11, %ymm2, %ymm2
1139 ; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
1140 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
1141 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
1142 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
1143 ; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
1144 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm2
1145 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
1146 ; AVX512F-NEXT: vpand %ymm8, %ymm5, %ymm5
1147 ; AVX512F-NEXT: vpblendvb %ymm9, %ymm5, %ymm2, %ymm2
1148 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
1149 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm5, %ymm2, %ymm2
1150 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15],ymm3[24],ymm7[24],ymm3[25],ymm7[25],ymm3[26],ymm7[26],ymm3[27],ymm7[27],ymm3[28],ymm7[28],ymm3[29],ymm7[29],ymm3[30],ymm7[30],ymm3[31],ymm7[31]
1151 ; AVX512F-NEXT: vpmullw %ymm12, %ymm5, %ymm5
1152 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
1153 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[16],ymm7[16],ymm3[17],ymm7[17],ymm3[18],ymm7[18],ymm3[19],ymm7[19],ymm3[20],ymm7[20],ymm3[21],ymm7[21],ymm3[22],ymm7[22],ymm3[23],ymm7[23]
1154 ; AVX512F-NEXT: vpmullw %ymm13, %ymm3, %ymm3
1155 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
1156 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
1157 ; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
1158 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
1159 ; AVX512F-NEXT: retq
1161 ; AVX512VL-LABEL: constant_funnnel_v64i8:
1162 ; AVX512VL: # %bb.0:
1163 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
1164 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1165 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
1166 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1167 ; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
1168 ; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm4
1169 ; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm7
1170 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
1171 ; AVX512VL-NEXT: vpand %ymm8, %ymm7, %ymm7
1172 ; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm9
1173 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
1174 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
1175 ; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm10
1176 ; AVX512VL-NEXT: vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
1177 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1178 ; AVX512VL-NEXT: vpsrlw $8, %ymm7, %ymm7
1179 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
1180 ; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1]
1181 ; AVX512VL-NEXT: vpmullw %ymm11, %ymm7, %ymm7
1182 ; AVX512VL-NEXT: vpsrlw $8, %ymm7, %ymm7
1183 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1184 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1185 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
1186 ; AVX512VL-NEXT: # ymm12 = mem[0,1,0,1]
1187 ; AVX512VL-NEXT: vpmullw %ymm12, %ymm2, %ymm2
1188 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1189 ; AVX512VL-NEXT: vpackuswb %ymm7, %ymm2, %ymm2
1190 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
1191 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
1192 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
1193 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
1194 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
1195 ; AVX512VL-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm2
1196 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
1197 ; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5
1198 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm5, %ymm2, %ymm2
1199 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
1200 ; AVX512VL-NEXT: vpblendvb %ymm10, %ymm5, %ymm2, %ymm2
1201 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1202 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
1203 ; AVX512VL-NEXT: vpmullw %ymm11, %ymm5, %ymm5
1204 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
1205 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1206 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
1207 ; AVX512VL-NEXT: vpmullw %ymm12, %ymm3, %ymm3
1208 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
1209 ; AVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
1210 ; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
1211 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
1212 ; AVX512VL-NEXT: retq
1214 ; AVX512BW-LABEL: constant_funnnel_v64i8:
1215 ; AVX512BW: # %bb.0:
1216 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1217 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1218 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
1219 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
1220 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1221 ; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
1222 ; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4
1223 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
1224 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1225 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
1226 ; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
1227 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1228 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
1229 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
1230 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1231 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
1232 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
1233 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
1234 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1235 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
1236 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
1237 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
1238 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
1239 ; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
1240 ; AVX512BW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
1241 ; AVX512BW-NEXT: kmovq %rax, %k1
1242 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
1243 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
1244 ; AVX512BW-NEXT: retq
1246 ; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
1247 ; AVX512VBMI2: # %bb.0:
1248 ; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1249 ; AVX512VBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1250 ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
1251 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
1252 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1253 ; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
1254 ; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
1255 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
1256 ; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1257 ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
1258 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
1259 ; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1260 ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
1261 ; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
1262 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1263 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
1264 ; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
1265 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
1266 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1267 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
1268 ; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
1269 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
1270 ; AVX512VBMI2-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
1271 ; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
1272 ; AVX512VBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
1273 ; AVX512VBMI2-NEXT: kmovq %rax, %k1
1274 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
1275 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
1276 ; AVX512VBMI2-NEXT: retq
1278 ; AVX512VLBW-LABEL: constant_funnnel_v64i8:
1279 ; AVX512VLBW: # %bb.0:
1280 ; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1281 ; AVX512VLBW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1282 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
1283 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
1284 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1285 ; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
1286 ; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4
1287 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
1288 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1289 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
1290 ; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
1291 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1292 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
1293 ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
1294 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1295 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2
1296 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
1297 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2
1298 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1299 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
1300 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
1301 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
1302 ; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
1303 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
1304 ; AVX512VLBW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
1305 ; AVX512VLBW-NEXT: kmovq %rax, %k1
1306 ; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
1307 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
1308 ; AVX512VLBW-NEXT: retq
1310 ; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
1311 ; AVX512VLVBMI2: # %bb.0:
1312 ; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1313 ; AVX512VLVBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1314 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
1315 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
1316 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1317 ; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
1318 ; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
1319 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
1320 ; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1321 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
1322 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
1323 ; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1324 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
1325 ; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
1326 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1327 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
1328 ; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
1329 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
1330 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1331 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
1332 ; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
1333 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1
1334 ; AVX512VLVBMI2-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
1335 ; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
1336 ; AVX512VLVBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
1337 ; AVX512VLVBMI2-NEXT: kmovq %rax, %k1
1338 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
1339 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
1340 ; AVX512VLVBMI2-NEXT: retq
1341 %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
1346 ; Uniform Constant Shifts
1349 define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
1350 ; AVX512F-LABEL: splatconstant_funnnel_v8i64:
1352 ; AVX512F-NEXT: vpsrlq $50, %zmm1, %zmm1
1353 ; AVX512F-NEXT: vpsllq $14, %zmm0, %zmm0
1354 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
1355 ; AVX512F-NEXT: retq
1357 ; AVX512VL-LABEL: splatconstant_funnnel_v8i64:
1358 ; AVX512VL: # %bb.0:
1359 ; AVX512VL-NEXT: vpsrlq $50, %zmm1, %zmm1
1360 ; AVX512VL-NEXT: vpsllq $14, %zmm0, %zmm0
1361 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
1362 ; AVX512VL-NEXT: retq
1364 ; AVX512BW-LABEL: splatconstant_funnnel_v8i64:
1365 ; AVX512BW: # %bb.0:
1366 ; AVX512BW-NEXT: vpsrlq $50, %zmm1, %zmm1
1367 ; AVX512BW-NEXT: vpsllq $14, %zmm0, %zmm0
1368 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1369 ; AVX512BW-NEXT: retq
1371 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i64:
1372 ; AVX512VBMI2: # %bb.0:
1373 ; AVX512VBMI2-NEXT: vpshldq $14, %zmm1, %zmm0, %zmm0
1374 ; AVX512VBMI2-NEXT: retq
1376 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i64:
1377 ; AVX512VLBW: # %bb.0:
1378 ; AVX512VLBW-NEXT: vpsrlq $50, %zmm1, %zmm1
1379 ; AVX512VLBW-NEXT: vpsllq $14, %zmm0, %zmm0
1380 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1381 ; AVX512VLBW-NEXT: retq
1383 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i64:
1384 ; AVX512VLVBMI2: # %bb.0:
1385 ; AVX512VLVBMI2-NEXT: vpshldq $14, %zmm1, %zmm0, %zmm0
1386 ; AVX512VLVBMI2-NEXT: retq
1387 %res = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
1391 define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
1392 ; AVX512F-LABEL: splatconstant_funnnel_v16i32:
1394 ; AVX512F-NEXT: vpsrld $28, %zmm1, %zmm1
1395 ; AVX512F-NEXT: vpslld $4, %zmm0, %zmm0
1396 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
1397 ; AVX512F-NEXT: retq
1399 ; AVX512VL-LABEL: splatconstant_funnnel_v16i32:
1400 ; AVX512VL: # %bb.0:
1401 ; AVX512VL-NEXT: vpsrld $28, %zmm1, %zmm1
1402 ; AVX512VL-NEXT: vpslld $4, %zmm0, %zmm0
1403 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
1404 ; AVX512VL-NEXT: retq
1406 ; AVX512BW-LABEL: splatconstant_funnnel_v16i32:
1407 ; AVX512BW: # %bb.0:
1408 ; AVX512BW-NEXT: vpsrld $28, %zmm1, %zmm1
1409 ; AVX512BW-NEXT: vpslld $4, %zmm0, %zmm0
1410 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
1411 ; AVX512BW-NEXT: retq
1413 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i32:
1414 ; AVX512VBMI2: # %bb.0:
1415 ; AVX512VBMI2-NEXT: vpshldd $4, %zmm1, %zmm0, %zmm0
1416 ; AVX512VBMI2-NEXT: retq
1418 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i32:
1419 ; AVX512VLBW: # %bb.0:
1420 ; AVX512VLBW-NEXT: vpsrld $28, %zmm1, %zmm1
1421 ; AVX512VLBW-NEXT: vpslld $4, %zmm0, %zmm0
1422 ; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
1423 ; AVX512VLBW-NEXT: retq
1425 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i32:
1426 ; AVX512VLVBMI2: # %bb.0:
1427 ; AVX512VLVBMI2-NEXT: vpshldd $4, %zmm1, %zmm0, %zmm0
1428 ; AVX512VLVBMI2-NEXT: retq
1429 %res = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
1433 define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
1434 ; AVX512F-LABEL: splatconstant_funnnel_v32i16:
1436 ; AVX512F-NEXT: vpsrlw $9, %ymm2, %ymm2
1437 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
1438 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
1439 ; AVX512F-NEXT: vpsrlw $9, %ymm3, %ymm2
1440 ; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
1441 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
1442 ; AVX512F-NEXT: retq
1444 ; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
1445 ; AVX512VL: # %bb.0:
1446 ; AVX512VL-NEXT: vpsrlw $9, %ymm2, %ymm2
1447 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
1448 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
1449 ; AVX512VL-NEXT: vpsrlw $9, %ymm3, %ymm2
1450 ; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
1451 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
1452 ; AVX512VL-NEXT: retq
1454 ; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
1455 ; AVX512BW: # %bb.0:
1456 ; AVX512BW-NEXT: vpsrlw $9, %zmm1, %zmm1
1457 ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
1458 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1459 ; AVX512BW-NEXT: retq
1461 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16:
1462 ; AVX512VBMI2: # %bb.0:
1463 ; AVX512VBMI2-NEXT: vpshldw $7, %zmm1, %zmm0, %zmm0
1464 ; AVX512VBMI2-NEXT: retq
1466 ; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
1467 ; AVX512VLBW: # %bb.0:
1468 ; AVX512VLBW-NEXT: vpsrlw $9, %zmm1, %zmm1
1469 ; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm0
1470 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1471 ; AVX512VLBW-NEXT: retq
1473 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16:
1474 ; AVX512VLVBMI2: # %bb.0:
1475 ; AVX512VLVBMI2-NEXT: vpshldw $7, %zmm1, %zmm0, %zmm0
1476 ; AVX512VLVBMI2-NEXT: retq
1477 %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1481 define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
1482 ; AVX512F-LABEL: splatconstant_funnnel_v64i8:
1484 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
1485 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1486 ; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
1487 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
1488 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
1489 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
1490 ; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm2
1491 ; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
1492 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
1493 ; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
1494 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
1495 ; AVX512F-NEXT: retq
1497 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
1498 ; AVX512VL: # %bb.0:
1499 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
1500 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1501 ; AVX512VL-NEXT: vpandn %ymm2, %ymm4, %ymm2
1502 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
1503 ; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
1504 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
1505 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm2
1506 ; AVX512VL-NEXT: vpandn %ymm2, %ymm4, %ymm2
1507 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
1508 ; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
1509 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
1510 ; AVX512VL-NEXT: retq
1512 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
1513 ; AVX512BW: # %bb.0:
1514 ; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm1
1515 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
1516 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm0
1517 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
1518 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1519 ; AVX512BW-NEXT: retq
1521 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
1522 ; AVX512VBMI2: # %bb.0:
1523 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm1
1524 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
1525 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
1526 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
1527 ; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
1528 ; AVX512VBMI2-NEXT: retq
1530 ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
1531 ; AVX512VLBW: # %bb.0:
1532 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm1
1533 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
1534 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm0
1535 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
1536 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1537 ; AVX512VLBW-NEXT: retq
1539 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
1540 ; AVX512VLVBMI2: # %bb.0:
1541 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm1
1542 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
1543 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
1544 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
1545 ; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
1546 ; AVX512VLVBMI2-NEXT: retq
1547 %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)