1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; Funnel-shift-right intrinsic declarations, one per element width
; (i64, i32, i16, i8). Every vector type declared here is 512 bits wide.
9 declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
10 declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
11 declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
12 declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
; Variable funnel shift right on <8 x i64>: every lane of %amt supplies its own
; shift amount to @llvm.fshr.v8i64.
; In the check lines below, the two runs built with +avx512vbmi2 emit a single
; vpshrdvq. The other runs AND the amount with a constant-pool mask, OR
; together a variable right shift of zmm1 and a variable left shift of zmm0 by
; (64 - amount), and finally keep zmm1 in the lanes whose masked amount is zero.
; NOTE(review): presumably %x, %y and %amt arrive in zmm0, zmm1 and zmm2 - the
; register assignment is not stated in this file; confirm against the ABI.
18 define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
19 ; AVX512F-LABEL: var_funnnel_v8i64:
21 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
22 ; AVX512F-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
23 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
24 ; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm4
25 ; AVX512F-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
26 ; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
27 ; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
28 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
31 ; AVX512VL-LABEL: var_funnnel_v8i64:
33 ; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
34 ; AVX512VL-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
35 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
36 ; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm4
37 ; AVX512VL-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
38 ; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
39 ; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
40 ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
43 ; AVX512BW-LABEL: var_funnnel_v8i64:
45 ; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
46 ; AVX512BW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
47 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
48 ; AVX512BW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
49 ; AVX512BW-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
50 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
51 ; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
52 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
55 ; AVX512VBMI2-LABEL: var_funnnel_v8i64:
56 ; AVX512VBMI2: # %bb.0:
57 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
58 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
59 ; AVX512VBMI2-NEXT: retq
61 ; AVX512VLBW-LABEL: var_funnnel_v8i64:
62 ; AVX512VLBW: # %bb.0:
63 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
64 ; AVX512VLBW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
65 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
66 ; AVX512VLBW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
67 ; AVX512VLBW-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
68 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
69 ; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
70 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
71 ; AVX512VLBW-NEXT: retq
73 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i64:
74 ; AVX512VLVBMI2: # %bb.0:
75 ; AVX512VLVBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
76 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
77 ; AVX512VLVBMI2-NEXT: retq
78 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
; Variable funnel shift right on <16 x i32>: every lane of %amt supplies its
; own shift amount to @llvm.fshr.v16i32.
; In the check lines below, the two runs built with +avx512vbmi2 emit a single
; vpshrdvd. The other runs use the same shape as the i64 case with dword
; instructions: mask the amount, OR a variable right shift of zmm1 with a
; variable left shift of zmm0 by (32 - amount), then keep zmm1 where the masked
; amount is zero.
82 define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
83 ; AVX512F-LABEL: var_funnnel_v16i32:
85 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
86 ; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
87 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
88 ; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4
89 ; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
90 ; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
91 ; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
92 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
95 ; AVX512VL-LABEL: var_funnnel_v16i32:
97 ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
98 ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
99 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
100 ; AVX512VL-NEXT: vpsubd %zmm2, %zmm4, %zmm4
101 ; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
102 ; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
103 ; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
104 ; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
105 ; AVX512VL-NEXT: retq
107 ; AVX512BW-LABEL: var_funnnel_v16i32:
109 ; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
110 ; AVX512BW-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
111 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
112 ; AVX512BW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
113 ; AVX512BW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
114 ; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0
115 ; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
116 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
117 ; AVX512BW-NEXT: retq
119 ; AVX512VBMI2-LABEL: var_funnnel_v16i32:
120 ; AVX512VBMI2: # %bb.0:
121 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
122 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
123 ; AVX512VBMI2-NEXT: retq
125 ; AVX512VLBW-LABEL: var_funnnel_v16i32:
126 ; AVX512VLBW: # %bb.0:
127 ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
128 ; AVX512VLBW-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
129 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
130 ; AVX512VLBW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
131 ; AVX512VLBW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
132 ; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0
133 ; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
134 ; AVX512VLBW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
135 ; AVX512VLBW-NEXT: retq
137 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i32:
138 ; AVX512VLVBMI2: # %bb.0:
139 ; AVX512VLVBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
140 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
141 ; AVX512VLVBMI2-NEXT: retq
142 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt)
; Variable funnel shift right on <32 x i16>: every lane of %amt supplies its
; own shift amount to @llvm.fshr.v32i16.
; The check lines below show three distinct lowerings:
;  - the +avx512f-only and +avx512vl runs split each operand into two ymm
;    halves, zero-extend words to dwords (vpmovzxwd), shift with
;    vpsrlvd/vpsllvd, truncate back (vpmovdw), and blend in the second operand
;    where the masked amount compares equal to zero;
;  - the +avx512bw runs (with or without +avx512vl) stay at zmm width and use
;    the word shifts vpsrlvw/vpsllvw followed by a masked vmovdqu16;
;  - the two runs built with +avx512vbmi2 emit a single vpshrdvw.
146 define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
147 ; AVX512F-LABEL: var_funnnel_v32i16:
149 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
150 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
151 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
152 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
153 ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
154 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
155 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
156 ; AVX512F-NEXT: vpsrlvd %zmm7, %zmm8, %zmm7
157 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
158 ; AVX512F-NEXT: vpsubw %ymm5, %ymm8, %ymm9
159 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
160 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
161 ; AVX512F-NEXT: vpsllvd %zmm9, %zmm3, %zmm3
162 ; AVX512F-NEXT: vpord %zmm7, %zmm3, %zmm3
163 ; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
164 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
165 ; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
166 ; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
167 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
168 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
169 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
170 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4
171 ; AVX512F-NEXT: vpsubw %ymm2, %ymm8, %ymm5
172 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
173 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
174 ; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm0
175 ; AVX512F-NEXT: vpord %zmm4, %zmm0, %zmm0
176 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
177 ; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
178 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
179 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
182 ; AVX512VL-LABEL: var_funnnel_v32i16:
184 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
185 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
186 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
187 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
188 ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
189 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
190 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
191 ; AVX512VL-NEXT: vpsrlvd %zmm7, %zmm8, %zmm7
192 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
193 ; AVX512VL-NEXT: vpsubw %ymm5, %ymm8, %ymm9
194 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
195 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
196 ; AVX512VL-NEXT: vpsllvd %zmm9, %zmm3, %zmm3
197 ; AVX512VL-NEXT: vpord %zmm7, %zmm3, %zmm3
198 ; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
199 ; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7
200 ; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
201 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
202 ; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
203 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
204 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
205 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4
206 ; AVX512VL-NEXT: vpsubw %ymm2, %ymm8, %ymm5
207 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
208 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
209 ; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm0
210 ; AVX512VL-NEXT: vpord %zmm4, %zmm0, %zmm0
211 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
212 ; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
213 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
214 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
215 ; AVX512VL-NEXT: retq
217 ; AVX512BW-LABEL: var_funnnel_v32i16:
219 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
220 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
221 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
222 ; AVX512BW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
223 ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
224 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
225 ; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
226 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
227 ; AVX512BW-NEXT: retq
229 ; AVX512VBMI2-LABEL: var_funnnel_v32i16:
230 ; AVX512VBMI2: # %bb.0:
231 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
232 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
233 ; AVX512VBMI2-NEXT: retq
235 ; AVX512VLBW-LABEL: var_funnnel_v32i16:
236 ; AVX512VLBW: # %bb.0:
237 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
238 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
239 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
240 ; AVX512VLBW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
241 ; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
242 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
243 ; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
244 ; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
245 ; AVX512VLBW-NEXT: retq
247 ; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
248 ; AVX512VLVBMI2: # %bb.0:
249 ; AVX512VLVBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
250 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
251 ; AVX512VLVBMI2-NEXT: retq
252 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt)
; Variable funnel shift right on <64 x i8>: every lane of %amt supplies its own
; shift amount to @llvm.fshr.v64i8.
; None of the check sequences below is a single funnel-shift instruction.
;  - The +avx512f-only and +avx512vl runs split the operands into two ymm
;    halves and build each byte shift from conditional steps of 4, 2 and 1
;    bits, choosing each step with vpblendvb; the left-shift side uses
;    (8 - amount) as its control, and the final blend keeps the second operand
;    where the masked amount compares equal to zero.
;  - The remaining four runs stay at zmm width and choose each step through a
;    k-register mask produced by vpmovb2m, finishing with a masked vmovdqu8.
;    Their expected instruction sequences appear identical to one another.
256 define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
257 ; AVX512F-LABEL: var_funnnel_v64i8:
259 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm8
260 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm6
261 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
262 ; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm5
263 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
264 ; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7
265 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
266 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm9
267 ; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm10
268 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm3, %ymm7
269 ; AVX512F-NEXT: vpsrlw $2, %ymm7, %ymm11
270 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
271 ; AVX512F-NEXT: vpand %ymm6, %ymm11, %ymm11
272 ; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
273 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm11
274 ; AVX512F-NEXT: vpsrlw $1, %ymm11, %ymm12
275 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
276 ; AVX512F-NEXT: vpand %ymm7, %ymm12, %ymm12
277 ; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
278 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm12, %ymm11, %ymm10
279 ; AVX512F-NEXT: vpsllw $4, %ymm8, %ymm11
280 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
281 ; AVX512F-NEXT: vpand %ymm12, %ymm11, %ymm11
282 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
283 ; AVX512F-NEXT: vpsubb %ymm9, %ymm13, %ymm14
284 ; AVX512F-NEXT: vpsllw $5, %ymm14, %ymm14
285 ; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8
286 ; AVX512F-NEXT: vpsllw $2, %ymm8, %ymm11
287 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
288 ; AVX512F-NEXT: vpand %ymm15, %ymm11, %ymm11
289 ; AVX512F-NEXT: vpaddb %ymm14, %ymm14, %ymm14
290 ; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8
291 ; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm11
292 ; AVX512F-NEXT: vpaddb %ymm14, %ymm14, %ymm14
293 ; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8
294 ; AVX512F-NEXT: vpor %ymm10, %ymm8, %ymm8
295 ; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10
296 ; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm9, %ymm9
297 ; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3
298 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm8
299 ; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
300 ; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
301 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm5
302 ; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm4
303 ; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm8
304 ; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
305 ; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
306 ; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
307 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm6
308 ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
309 ; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
310 ; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
311 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
312 ; AVX512F-NEXT: vpand %ymm12, %ymm5, %ymm5
313 ; AVX512F-NEXT: vpsubb %ymm2, %ymm13, %ymm6
314 ; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
315 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
316 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm5
317 ; AVX512F-NEXT: vpand %ymm15, %ymm5, %ymm5
318 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
319 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
320 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm5
321 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
322 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
323 ; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
324 ; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm2, %ymm2
325 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
326 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
329 ; AVX512VL-LABEL: var_funnnel_v64i8:
331 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
332 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
333 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
334 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm6
335 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
336 ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
337 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
338 ; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5
339 ; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm9
340 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm6
341 ; AVX512VL-NEXT: vpsrlw $2, %ymm6, %ymm10
342 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
343 ; AVX512VL-NEXT: vpand %ymm11, %ymm10, %ymm10
344 ; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
345 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
346 ; AVX512VL-NEXT: vpsrlw $1, %ymm6, %ymm10
347 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
348 ; AVX512VL-NEXT: vpand %ymm12, %ymm10, %ymm10
349 ; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
350 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
351 ; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm9
352 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
353 ; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9
354 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
355 ; AVX512VL-NEXT: vpsubb %ymm5, %ymm13, %ymm14
356 ; AVX512VL-NEXT: vpsllw $5, %ymm14, %ymm14
357 ; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4
358 ; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm9
359 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
360 ; AVX512VL-NEXT: vpand %ymm15, %ymm9, %ymm9
361 ; AVX512VL-NEXT: vpaddb %ymm14, %ymm14, %ymm14
362 ; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4
363 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm9
364 ; AVX512VL-NEXT: vpaddb %ymm14, %ymm14, %ymm14
365 ; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4
366 ; AVX512VL-NEXT: vpor %ymm6, %ymm4, %ymm4
367 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
368 ; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5
369 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
370 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4
371 ; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
372 ; AVX512VL-NEXT: vpand %ymm8, %ymm2, %ymm2
373 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm5
374 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm4
375 ; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm7
376 ; AVX512VL-NEXT: vpand %ymm11, %ymm7, %ymm7
377 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
378 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
379 ; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm7
380 ; AVX512VL-NEXT: vpand %ymm12, %ymm7, %ymm7
381 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
382 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
383 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
384 ; AVX512VL-NEXT: vpand %ymm10, %ymm5, %ymm5
385 ; AVX512VL-NEXT: vpsubb %ymm2, %ymm13, %ymm7
386 ; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
387 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0
388 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm5
389 ; AVX512VL-NEXT: vpand %ymm15, %ymm5, %ymm5
390 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
391 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0
392 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5
393 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
394 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0
395 ; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
396 ; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2
397 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
398 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
399 ; AVX512VL-NEXT: retq
401 ; AVX512BW-LABEL: var_funnnel_v64i8:
403 ; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm3
404 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
405 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
406 ; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm4
407 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
408 ; AVX512BW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
409 ; AVX512BW-NEXT: vpsrlw $2, %zmm3, %zmm5
410 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
411 ; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
412 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
413 ; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
414 ; AVX512BW-NEXT: vpsrlw $1, %zmm3, %zmm5
415 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
416 ; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
417 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
418 ; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
419 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
420 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm4, %zmm4
421 ; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm4
422 ; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
423 ; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
424 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k2
425 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm4
426 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
427 ; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k2}
428 ; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm4
429 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
430 ; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
431 ; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm4
432 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
433 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
434 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
435 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
436 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
437 ; AVX512BW-NEXT: retq
439 ; AVX512VBMI2-LABEL: var_funnnel_v64i8:
440 ; AVX512VBMI2: # %bb.0:
441 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
442 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
443 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
444 ; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
445 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
446 ; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
447 ; AVX512VBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5
448 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
449 ; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
450 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
451 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
452 ; AVX512VBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5
453 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
454 ; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
455 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
456 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
457 ; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
458 ; AVX512VBMI2-NEXT: vpsubb %zmm2, %zmm4, %zmm4
459 ; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
460 ; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
461 ; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1
462 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k2
463 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
464 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
465 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k2}
466 ; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm4
467 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
468 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
469 ; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4
470 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
471 ; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
472 ; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
473 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
474 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
475 ; AVX512VBMI2-NEXT: retq
477 ; AVX512VLBW-LABEL: var_funnnel_v64i8:
478 ; AVX512VLBW: # %bb.0:
479 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm3
480 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
481 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
482 ; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm4
483 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
484 ; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
485 ; AVX512VLBW-NEXT: vpsrlw $2, %zmm3, %zmm5
486 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
487 ; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
488 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
489 ; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
490 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm3, %zmm5
491 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
492 ; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
493 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
494 ; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
495 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
496 ; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm4, %zmm4
497 ; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm4
498 ; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
499 ; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1
500 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k2
501 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm4
502 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
503 ; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k2}
504 ; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm4
505 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
506 ; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
507 ; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm4
508 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
509 ; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
510 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
511 ; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
512 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
513 ; AVX512VLBW-NEXT: retq
515 ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
516 ; AVX512VLVBMI2: # %bb.0:
517 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
518 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
519 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
520 ; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
521 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
522 ; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
523 ; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5
524 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
525 ; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
526 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
527 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
528 ; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5
529 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
530 ; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
531 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
532 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
533 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
534 ; AVX512VLVBMI2-NEXT: vpsubb %zmm2, %zmm4, %zmm4
535 ; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
536 ; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
537 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1
538 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k2
539 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
540 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
541 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k2}
542 ; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm4
543 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
544 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
545 ; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4
546 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
547 ; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
548 ; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
549 ; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
550 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
551 ; AVX512VLVBMI2-NEXT: retq
552 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
557 ; Uniform Variable Shifts
; Uniform (splat) funnel shift right on <8 x i64>: the shufflevector with an
; all-zero mask copies lane 0 of %amt into every lane, and that splat is the
; shift amount passed to @llvm.fshr.v8i64.
; In the check lines below, runs without +avx512vbmi2 broadcast and mask the
; amount, then shift with vpsrlq/vpsllq taking the count from an xmm register
; (the left count being 64 minus the amount), OR the halves, and keep zmm1 in
; lanes whose masked amount is zero. The two runs with +avx512vbmi2 broadcast
; the amount and emit a single vpshrdvq.
560 define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
561 ; AVX512F-LABEL: splatvar_funnnel_v8i64:
563 ; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2
564 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
565 ; AVX512F-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
566 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
567 ; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
568 ; AVX512F-NEXT: vpsllq %xmm4, %zmm0, %zmm0
569 ; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
570 ; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
571 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
574 ; AVX512VL-LABEL: splatvar_funnnel_v8i64:
576 ; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
577 ; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
578 ; AVX512VL-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
579 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
580 ; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
581 ; AVX512VL-NEXT: vpsllq %xmm4, %zmm0, %zmm0
582 ; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
583 ; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
584 ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
585 ; AVX512VL-NEXT: retq
587 ; AVX512BW-LABEL: splatvar_funnnel_v8i64:
589 ; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
590 ; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
591 ; AVX512BW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
592 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
593 ; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
594 ; AVX512BW-NEXT: vpsllq %xmm4, %zmm0, %zmm0
595 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
596 ; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
597 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
598 ; AVX512BW-NEXT: retq
600 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i64:
601 ; AVX512VBMI2: # %bb.0:
602 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %zmm2
603 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
604 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
605 ; AVX512VBMI2-NEXT: retq
607 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
608 ; AVX512VLBW: # %bb.0:
609 ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2
610 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
611 ; AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
612 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
613 ; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
614 ; AVX512VLBW-NEXT: vpsllq %xmm4, %zmm0, %zmm0
615 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
616 ; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
617 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
618 ; AVX512VLBW-NEXT: retq
620 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i64:
621 ; AVX512VLVBMI2: # %bb.0:
622 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %zmm2
623 ; AVX512VLVBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
624 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
625 ; AVX512VLVBMI2-NEXT: retq
626 %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
627 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %splat)
; Funnel shift right of <16 x i32> by a splatted run-time amount: lane 0 of
; %amt is broadcast by the zeroinitializer shufflevector and passed as the
; third operand of llvm.fshr.v16i32.
; Codegen expected by the checks below:
;  - both VBMI2 configurations: vpbroadcastd followed by a single vpshrdvd.
;  - every other configuration: AND the amount with a constant loaded from
;    memory (presumably 31, value not visible here -- TODO confirm),
;    zero-extend the count with vpmovzxdq before the vpsrld / vpslld uniform
;    shifts, OR (%y >> amt) with (%x << (32 - amt)), then vptestnmd plus a
;    masked vmovdqa32 copies %y into the lanes whose masked amount is zero.
631 define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
632 ; AVX512F-LABEL: splatvar_funnnel_v16i32:
634 ; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2
635 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
636 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
637 ; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm3
638 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
639 ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
640 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
641 ; AVX512F-NEXT: vpslld %xmm4, %zmm0, %zmm0
642 ; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
643 ; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
644 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
647 ; AVX512VL-LABEL: splatvar_funnnel_v16i32:
649 ; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2
650 ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
651 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
652 ; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm3
653 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
654 ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
655 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
656 ; AVX512VL-NEXT: vpslld %xmm4, %zmm0, %zmm0
657 ; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
658 ; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
659 ; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
660 ; AVX512VL-NEXT: retq
662 ; AVX512BW-LABEL: splatvar_funnnel_v16i32:
664 ; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2
665 ; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
666 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
667 ; AVX512BW-NEXT: vpsrld %xmm3, %zmm1, %zmm3
668 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
669 ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
670 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
671 ; AVX512BW-NEXT: vpslld %xmm4, %zmm0, %zmm0
672 ; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0
673 ; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
674 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
675 ; AVX512BW-NEXT: retq
677 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i32:
678 ; AVX512VBMI2: # %bb.0:
679 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %zmm2
680 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
681 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
682 ; AVX512VBMI2-NEXT: retq
684 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
685 ; AVX512VLBW: # %bb.0:
686 ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2
687 ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
688 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
689 ; AVX512VLBW-NEXT: vpsrld %xmm3, %zmm1, %zmm3
690 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
691 ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
692 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
693 ; AVX512VLBW-NEXT: vpslld %xmm4, %zmm0, %zmm0
694 ; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0
695 ; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
696 ; AVX512VLBW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
697 ; AVX512VLBW-NEXT: retq
699 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i32:
700 ; AVX512VLVBMI2: # %bb.0:
701 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %zmm2
702 ; AVX512VLVBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
703 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
704 ; AVX512VLVBMI2-NEXT: retq
705 %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
706 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %splat)
; Funnel shift right of <32 x i16> by a splatted run-time amount: lane 0 of
; %amt is broadcast by the zeroinitializer shufflevector and passed as the
; third operand of llvm.fshr.v32i16.
; Codegen expected by the checks below:
;  - both VBMI2 configurations: vpbroadcastw followed by a single vpshrdvw.
;  - the avx512f-only and avx512f+vl configurations: the 512-bit operands are
;    split into 256-bit halves (vextracti64x4 ... vinserti64x4); each half is
;    (%y >> amt) | (%x << (16 - amt)) using vpsrlw / vpsllw, and the
;    zero-amount case is handled with vpcmpeqw against zero plus vpblendvb.
;  - the two configurations with avx512bw (no vbmi2): the same expansion on
;    full zmm registers, with vptestnmw plus a masked vmovdqu16 selecting %y
;    where the masked amount is zero.
; The amount is first ANDed with a constant loaded from memory (presumably
; 15, value not visible here -- TODO confirm).
710 define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
711 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
713 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
714 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
715 ; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2
716 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
717 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
718 ; AVX512F-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
719 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
720 ; AVX512F-NEXT: vpsubw %xmm2, %xmm7, %xmm7
721 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
722 ; AVX512F-NEXT: vpsllw %xmm7, %ymm3, %ymm3
723 ; AVX512F-NEXT: vpor %ymm6, %ymm3, %ymm3
724 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
725 ; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
726 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
727 ; AVX512F-NEXT: vpsrlw %xmm5, %ymm1, %ymm4
728 ; AVX512F-NEXT: vpsllw %xmm7, %ymm0, %ymm0
729 ; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
730 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
731 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
734 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
736 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
737 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
738 ; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2
739 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
740 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
741 ; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
742 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
743 ; AVX512VL-NEXT: vpsubw %xmm2, %xmm7, %xmm7
744 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
745 ; AVX512VL-NEXT: vpsllw %xmm7, %ymm3, %ymm3
746 ; AVX512VL-NEXT: vpor %ymm6, %ymm3, %ymm3
747 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
748 ; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
749 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
750 ; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm4
751 ; AVX512VL-NEXT: vpsllw %xmm7, %ymm0, %ymm0
752 ; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
753 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
754 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
755 ; AVX512VL-NEXT: retq
757 ; AVX512BW-LABEL: splatvar_funnnel_v32i16:
759 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2
760 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
761 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
762 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3
763 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
764 ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
765 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
766 ; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
767 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
768 ; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
769 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
770 ; AVX512BW-NEXT: retq
772 ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16:
773 ; AVX512VBMI2: # %bb.0:
774 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %zmm2
775 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
776 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
777 ; AVX512VBMI2-NEXT: retq
779 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
780 ; AVX512VLBW: # %bb.0:
781 ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2
782 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
783 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
784 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3
785 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
786 ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
787 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
788 ; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
789 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
790 ; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
791 ; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
792 ; AVX512VLBW-NEXT: retq
794 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16:
795 ; AVX512VLVBMI2: # %bb.0:
796 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %zmm2
797 ; AVX512VLVBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
798 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
799 ; AVX512VLVBMI2-NEXT: retq
800 %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
801 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %splat)
; Funnel shift right of <64 x i8> by a splatted run-time amount: lane 0 of
; %amt is broadcast by the zeroinitializer shufflevector and passed as the
; third operand of llvm.fshr.v64i8.
; Per the checks below, no configuration (including the two with VBMI2) uses
; a single funnel-shift instruction for byte elements. All of them expand to:
;  - AND the amount with a constant loaded from memory (presumably 7, value
;    not visible here -- TODO confirm);
;  - shift %y right and %x left by (8 - amt) using 16-bit vpsrlw / vpsllw,
;    then AND each result with a broadcast byte mask that is built by
;    applying the same shift to an all-ones register (vpcmpeqd), which clears
;    the bits that crossed over from the neighbouring byte;
;  - OR the two halves and return %y where the masked amount is zero.
; The avx512f-only and avx512f+vl configurations work on 256-bit halves and
; select with vpcmpeqb + vpblendvb; the remaining four work on zmm registers
; and select with vptestnmb + a masked vmovdqu8.
805 define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
806 ; AVX512F-LABEL: splatvar_funnnel_v64i8:
808 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm9
809 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
810 ; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
811 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
812 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
813 ; AVX512F-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
814 ; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
815 ; AVX512F-NEXT: vpsrlw %xmm5, %xmm8, %xmm7
816 ; AVX512F-NEXT: vpsrlw $8, %xmm7, %xmm7
817 ; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
818 ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm10
819 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
820 ; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3
821 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
822 ; AVX512F-NEXT: vpsllw %xmm3, %ymm9, %ymm9
823 ; AVX512F-NEXT: vpsllw %xmm3, %xmm8, %xmm6
824 ; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
825 ; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm8
826 ; AVX512F-NEXT: vpor %ymm10, %ymm8, %ymm8
827 ; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9
828 ; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2
829 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
830 ; AVX512F-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
831 ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
832 ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm0
833 ; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
834 ; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
835 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
836 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
839 ; AVX512VL-LABEL: splatvar_funnnel_v64i8:
841 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm9
842 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
843 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
844 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
845 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
846 ; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
847 ; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
848 ; AVX512VL-NEXT: vpsrlw %xmm5, %xmm8, %xmm7
849 ; AVX512VL-NEXT: vpsrlw $8, %xmm7, %xmm7
850 ; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
851 ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm10
852 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
853 ; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3
854 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
855 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm9, %ymm9
856 ; AVX512VL-NEXT: vpsllw %xmm3, %xmm8, %xmm6
857 ; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
858 ; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8
859 ; AVX512VL-NEXT: vpor %ymm10, %ymm8, %ymm8
860 ; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
861 ; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2
862 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
863 ; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
864 ; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
865 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0
866 ; AVX512VL-NEXT: vpand %ymm6, %ymm0, %ymm0
867 ; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
868 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
869 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
870 ; AVX512VL-NEXT: retq
872 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
874 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
875 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
876 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
877 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
878 ; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
879 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
880 ; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3
881 ; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
882 ; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
883 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
884 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
885 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
886 ; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
887 ; AVX512BW-NEXT: vpsllw %xmm4, %xmm5, %xmm4
888 ; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
889 ; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0
890 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
891 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
892 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
893 ; AVX512BW-NEXT: retq
895 ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
896 ; AVX512VBMI2: # %bb.0:
897 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
898 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
899 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
900 ; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
901 ; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
902 ; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
903 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3
904 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
905 ; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
906 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
907 ; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
908 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
909 ; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
910 ; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
911 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
912 ; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
913 ; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
914 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
915 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
916 ; AVX512VBMI2-NEXT: retq
918 ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
919 ; AVX512VLBW: # %bb.0:
920 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
921 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
922 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
923 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
924 ; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
925 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
926 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3
927 ; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
928 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
929 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
930 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
931 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
932 ; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
933 ; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm5, %xmm4
934 ; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
935 ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0
936 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
937 ; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
938 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
939 ; AVX512VLBW-NEXT: retq
941 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
942 ; AVX512VLVBMI2: # %bb.0:
943 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
944 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
945 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
946 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
947 ; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
948 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
949 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3
950 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
951 ; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
952 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
953 ; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
954 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
955 ; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
956 ; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
957 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
958 ; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
959 ; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
960 ; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
961 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
962 ; AVX512VLVBMI2-NEXT: retq
963 %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
964 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %splat)
; Funnel shift right of <8 x i64> by a compile-time constant, non-uniform
; amount vector <4, 14, 50, 60, 4, 14, 50, 60> (see the call at the bottom).
; Codegen expected by the checks below:
;  - both VBMI2 configurations: one vpshrdvq whose shift amounts are a
;    memory operand.
;  - every other configuration: vpsrlvq on %y and vpsllvq on %x, each with
;    per-lane amounts taken from a memory operand, combined with vporq.
;    No zero-amount select is emitted; none of the constant amounts is zero.
972 define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
973 ; AVX512F-LABEL: constant_funnnel_v8i64:
975 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
976 ; AVX512F-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
977 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
980 ; AVX512VL-LABEL: constant_funnnel_v8i64:
982 ; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
983 ; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
984 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
985 ; AVX512VL-NEXT: retq
987 ; AVX512BW-LABEL: constant_funnnel_v8i64:
989 ; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
990 ; AVX512BW-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
991 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
992 ; AVX512BW-NEXT: retq
994 ; AVX512VBMI2-LABEL: constant_funnnel_v8i64:
995 ; AVX512VBMI2: # %bb.0:
996 ; AVX512VBMI2-NEXT: vpshrdvq {{.*}}(%rip), %zmm0, %zmm1
997 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
998 ; AVX512VBMI2-NEXT: retq
1000 ; AVX512VLBW-LABEL: constant_funnnel_v8i64:
1001 ; AVX512VLBW: # %bb.0:
1002 ; AVX512VLBW-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
1003 ; AVX512VLBW-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
1004 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1005 ; AVX512VLBW-NEXT: retq
1007 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i64:
1008 ; AVX512VLVBMI2: # %bb.0:
1009 ; AVX512VLVBMI2-NEXT: vpshrdvq {{.*}}(%rip), %zmm0, %zmm1
1010 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
1011 ; AVX512VLVBMI2-NEXT: retq
1012 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
; Funnel shift right of <16 x i32> by a compile-time constant, non-uniform
; amount vector: 4, 5, ..., 11 repeated twice (see the call at the bottom).
; Codegen expected by the checks below:
;  - both VBMI2 configurations: one vpshrdvd whose shift amounts are a
;    memory operand.
;  - every other configuration: vpsrlvd on %y and vpsllvd on %x, each with
;    per-lane amounts taken from a memory operand, combined with vpord.
;    No zero-amount select is emitted; none of the constant amounts is zero.
1016 define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
1017 ; AVX512F-LABEL: constant_funnnel_v16i32:
1019 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1020 ; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
1021 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
1022 ; AVX512F-NEXT: retq
1024 ; AVX512VL-LABEL: constant_funnnel_v16i32:
1025 ; AVX512VL: # %bb.0:
1026 ; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1027 ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
1028 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
1029 ; AVX512VL-NEXT: retq
1031 ; AVX512BW-LABEL: constant_funnnel_v16i32:
1032 ; AVX512BW: # %bb.0:
1033 ; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1034 ; AVX512BW-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
1035 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
1036 ; AVX512BW-NEXT: retq
1038 ; AVX512VBMI2-LABEL: constant_funnnel_v16i32:
1039 ; AVX512VBMI2: # %bb.0:
1040 ; AVX512VBMI2-NEXT: vpshrdvd {{.*}}(%rip), %zmm0, %zmm1
1041 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
1042 ; AVX512VBMI2-NEXT: retq
1044 ; AVX512VLBW-LABEL: constant_funnnel_v16i32:
1045 ; AVX512VLBW: # %bb.0:
1046 ; AVX512VLBW-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1047 ; AVX512VLBW-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
1048 ; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
1049 ; AVX512VLBW-NEXT: retq
1051 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i32:
1052 ; AVX512VLVBMI2: # %bb.0:
1053 ; AVX512VLVBMI2-NEXT: vpshrdvd {{.*}}(%rip), %zmm0, %zmm1
1054 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
1055 ; AVX512VLVBMI2-NEXT: retq
1056 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
; Funnel shift right of <32 x i16> by a compile-time constant, non-uniform
; amount vector: 0, 1, ..., 15 repeated twice (see the call at the bottom).
; Elements 0 and 16 therefore have a shift amount of zero and must come
; straight from %y.
; Codegen expected by the checks below:
;  - both VBMI2 configurations: one vpshrdvw whose shift amounts are a
;    memory operand.
;  - the avx512f-only and avx512f+vl configurations: work on 256-bit halves
;    and replace the shifts by multiplies with a power-of-two table
;    (vpmulhuw on %y, vpmullw on %x, then vpor). Table entry 0 is undefined
;    ("u"), so a vpblendw + vpblendd pair takes element 0 of each half from
;    %y instead.
;  - the two configurations with avx512bw (no vbmi2): vpsrlvw / vpsllvw /
;    vporq on zmm registers, then mask 0x10001 (bits 0 and 16) drives a
;    masked vmovdqu16 that copies %y into elements 0 and 16.
1060 define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
1061 ; AVX512F-LABEL: constant_funnnel_v32i16:
1063 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1064 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
1065 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
1066 ; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm5
1067 ; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
1068 ; AVX512F-NEXT: vpor %ymm5, %ymm2, %ymm2
1069 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
1070 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
1071 ; AVX512F-NEXT: vpmulhuw %ymm4, %ymm1, %ymm3
1072 ; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
1073 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
1074 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
1075 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1076 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1077 ; AVX512F-NEXT: retq
1079 ; AVX512VL-LABEL: constant_funnnel_v32i16:
1080 ; AVX512VL: # %bb.0:
1081 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1082 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
1083 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
1084 ; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm5
1085 ; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm2
1086 ; AVX512VL-NEXT: vpor %ymm5, %ymm2, %ymm2
1087 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
1088 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
1089 ; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm1, %ymm3
1090 ; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
1091 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
1092 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
1093 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1094 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1095 ; AVX512VL-NEXT: retq
1097 ; AVX512BW-LABEL: constant_funnnel_v32i16:
1098 ; AVX512BW: # %bb.0:
1099 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm2
1100 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
1101 ; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
1102 ; AVX512BW-NEXT: movl $65537, %eax # imm = 0x10001
1103 ; AVX512BW-NEXT: kmovd %eax, %k1
1104 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
1105 ; AVX512BW-NEXT: retq
1107 ; AVX512VBMI2-LABEL: constant_funnnel_v32i16:
1108 ; AVX512VBMI2: # %bb.0:
1109 ; AVX512VBMI2-NEXT: vpshrdvw {{.*}}(%rip), %zmm0, %zmm1
1110 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
1111 ; AVX512VBMI2-NEXT: retq
1113 ; AVX512VLBW-LABEL: constant_funnnel_v32i16:
1114 ; AVX512VLBW: # %bb.0:
1115 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm2
1116 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
1117 ; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
1118 ; AVX512VLBW-NEXT: movl $65537, %eax # imm = 0x10001
1119 ; AVX512VLBW-NEXT: kmovd %eax, %k1
1120 ; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
1121 ; AVX512VLBW-NEXT: retq
1123 ; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16:
1124 ; AVX512VLVBMI2: # %bb.0:
1125 ; AVX512VLVBMI2-NEXT: vpshrdvw {{.*}}(%rip), %zmm0, %zmm1
1126 ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
1127 ; AVX512VLVBMI2-NEXT: retq
1128 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
1132 define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
1133 ; AVX512F-LABEL: constant_funnnel_v64i8:
1135 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
1136 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1137 ; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4
1138 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1139 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
1140 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
1141 ; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
1142 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
1143 ; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
1144 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
1145 ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
1146 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8
1147 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
1148 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
1149 ; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm9
1150 ; AVX512F-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
1151 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
1152 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
1153 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
1154 ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
1155 ; AVX512F-NEXT: vpmullw %ymm11, %ymm10, %ymm10
1156 ; AVX512F-NEXT: vpsrlw $8, %ymm10, %ymm10
1157 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
1158 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
1159 ; AVX512F-NEXT: # ymm13 = mem[0,1,0,1]
1160 ; AVX512F-NEXT: vpmullw %ymm13, %ymm12, %ymm12
1161 ; AVX512F-NEXT: vpsrlw $8, %ymm12, %ymm12
1162 ; AVX512F-NEXT: vpackuswb %ymm10, %ymm12, %ymm10
1163 ; AVX512F-NEXT: vpor %ymm10, %ymm3, %ymm3
1164 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
1165 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
1166 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
1167 ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
1168 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0
1169 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
1170 ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
1171 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
1172 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
1173 ; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0
1174 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
1175 ; AVX512F-NEXT: vpmullw %ymm11, %ymm3, %ymm3
1176 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
1177 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
1178 ; AVX512F-NEXT: vpmullw %ymm13, %ymm4, %ymm4
1179 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
1180 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
1181 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
1182 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
1183 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1184 ; AVX512F-NEXT: retq
1186 ; AVX512VL-LABEL: constant_funnnel_v64i8:
1187 ; AVX512VL: # %bb.0:
1188 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
1189 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1190 ; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4
1191 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1192 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
1193 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
1194 ; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
1195 ; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
1196 ; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
1197 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
1198 ; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
1199 ; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8
1200 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
1201 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
1202 ; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm9
1203 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
1204 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1205 ; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
1206 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
1207 ; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
1208 ; AVX512VL-NEXT: vpmullw %ymm10, %ymm4, %ymm4
1209 ; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
1210 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1211 ; AVX512VL-NEXT: vpsrlw $8, %ymm11, %ymm11
1212 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
1213 ; AVX512VL-NEXT: # ymm12 = mem[0,1,0,1]
1214 ; AVX512VL-NEXT: vpmullw %ymm12, %ymm11, %ymm11
1215 ; AVX512VL-NEXT: vpsrlw $8, %ymm11, %ymm11
1216 ; AVX512VL-NEXT: vpackuswb %ymm4, %ymm11, %ymm4
1217 ; AVX512VL-NEXT: vpor %ymm4, %ymm3, %ymm3
1218 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
1219 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
1220 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
1221 ; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
1222 ; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0
1223 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3
1224 ; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3
1225 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
1226 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
1227 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0
1228 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1229 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
1230 ; AVX512VL-NEXT: vpmullw %ymm10, %ymm3, %ymm3
1231 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
1232 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1233 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
1234 ; AVX512VL-NEXT: vpmullw %ymm12, %ymm5, %ymm5
1235 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
1236 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
1237 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
1238 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
1239 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1240 ; AVX512VL-NEXT: retq
1242 ; AVX512BW-LABEL: constant_funnnel_v64i8:
1243 ; AVX512BW: # %bb.0:
1244 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
1245 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1246 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
1247 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
1248 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1249 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
1250 ; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm3
1251 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1252 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1253 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
1254 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
1255 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1256 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
1257 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
1258 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1259 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
1260 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
1261 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
1262 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1263 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
1264 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
1265 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
1266 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
1267 ; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
1268 ; AVX512BW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
1269 ; AVX512BW-NEXT: kmovq %rax, %k1
1270 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
1271 ; AVX512BW-NEXT: retq
1273 ; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
1274 ; AVX512VBMI2: # %bb.0:
1275 ; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
1276 ; AVX512VBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1277 ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
1278 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
1279 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1280 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
1281 ; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm3
1282 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1283 ; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1284 ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
1285 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
1286 ; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1287 ; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
1288 ; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
1289 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1290 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
1291 ; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
1292 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
1293 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1294 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
1295 ; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
1296 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
1297 ; AVX512VBMI2-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
1298 ; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
1299 ; AVX512VBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
1300 ; AVX512VBMI2-NEXT: kmovq %rax, %k1
1301 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
1302 ; AVX512VBMI2-NEXT: retq
1304 ; AVX512VLBW-LABEL: constant_funnnel_v64i8:
1305 ; AVX512VLBW: # %bb.0:
1306 ; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
1307 ; AVX512VLBW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1308 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
1309 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
1310 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1311 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
1312 ; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm3
1313 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1314 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1315 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
1316 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
1317 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1318 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
1319 ; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
1320 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1321 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2
1322 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
1323 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2
1324 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1325 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
1326 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
1327 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
1328 ; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
1329 ; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
1330 ; AVX512VLBW-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
1331 ; AVX512VLBW-NEXT: kmovq %rax, %k1
1332 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
1333 ; AVX512VLBW-NEXT: retq
1335 ; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
1336 ; AVX512VLVBMI2: # %bb.0:
1337 ; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
1338 ; AVX512VLVBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1339 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
1340 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
1341 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1342 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
1343 ; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm3
1344 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
1345 ; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1346 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
1347 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
1348 ; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
1349 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
1350 ; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
1351 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1352 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
1353 ; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm2, %zmm2
1354 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2
1355 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1356 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
1357 ; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
1358 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
1359 ; AVX512VLVBMI2-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
1360 ; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
1361 ; AVX512VLVBMI2-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
1362 ; AVX512VLVBMI2-NEXT: kmovq %rax, %k1
1363 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
1364 ; AVX512VLVBMI2-NEXT: retq
1365 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
1370 ; Uniform Constant Shifts
; Funnel shift right (llvm.fshr) of <8 x i64> where the shift amount is the
; splat constant 14 in every lane.
; Runs without VBMI2 expect a three-instruction lowering on 512-bit registers,
; namely vpsrlq $14 on zmm1, vpsllq $50 (64 - 14) on zmm0, then vporq to merge.
; The two VBMI2-enabled runs expect a single vpshrdq $14 instead.
1373 define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
1374 ; AVX512F-LABEL: splatconstant_funnnel_v8i64:
1376 ; AVX512F-NEXT: vpsrlq $14, %zmm1, %zmm1
1377 ; AVX512F-NEXT: vpsllq $50, %zmm0, %zmm0
1378 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
1379 ; AVX512F-NEXT: retq
1381 ; AVX512VL-LABEL: splatconstant_funnnel_v8i64:
1382 ; AVX512VL: # %bb.0:
1383 ; AVX512VL-NEXT: vpsrlq $14, %zmm1, %zmm1
1384 ; AVX512VL-NEXT: vpsllq $50, %zmm0, %zmm0
1385 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
1386 ; AVX512VL-NEXT: retq
1388 ; AVX512BW-LABEL: splatconstant_funnnel_v8i64:
1389 ; AVX512BW: # %bb.0:
1390 ; AVX512BW-NEXT: vpsrlq $14, %zmm1, %zmm1
1391 ; AVX512BW-NEXT: vpsllq $50, %zmm0, %zmm0
1392 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1393 ; AVX512BW-NEXT: retq
1395 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i64:
1396 ; AVX512VBMI2: # %bb.0:
1397 ; AVX512VBMI2-NEXT: vpshrdq $14, %zmm0, %zmm1, %zmm0
1398 ; AVX512VBMI2-NEXT: retq
1400 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i64:
1401 ; AVX512VLBW: # %bb.0:
1402 ; AVX512VLBW-NEXT: vpsrlq $14, %zmm1, %zmm1
1403 ; AVX512VLBW-NEXT: vpsllq $50, %zmm0, %zmm0
1404 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1405 ; AVX512VLBW-NEXT: retq
1407 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i64:
1408 ; AVX512VLVBMI2: # %bb.0:
1409 ; AVX512VLVBMI2-NEXT: vpshrdq $14, %zmm0, %zmm1, %zmm0
1410 ; AVX512VLVBMI2-NEXT: retq
1411 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
; Funnel shift right (llvm.fshr) of <16 x i32> where the shift amount is the
; splat constant 4 in every lane.
; Runs without VBMI2 expect vpsrld $4 on zmm1, vpslld $28 (32 - 4) on zmm0,
; then vpord to merge the two partial results.
; The two VBMI2-enabled runs expect a single vpshrdd $4 instead.
1415 define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
1416 ; AVX512F-LABEL: splatconstant_funnnel_v16i32:
1418 ; AVX512F-NEXT: vpsrld $4, %zmm1, %zmm1
1419 ; AVX512F-NEXT: vpslld $28, %zmm0, %zmm0
1420 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
1421 ; AVX512F-NEXT: retq
1423 ; AVX512VL-LABEL: splatconstant_funnnel_v16i32:
1424 ; AVX512VL: # %bb.0:
1425 ; AVX512VL-NEXT: vpsrld $4, %zmm1, %zmm1
1426 ; AVX512VL-NEXT: vpslld $28, %zmm0, %zmm0
1427 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
1428 ; AVX512VL-NEXT: retq
1430 ; AVX512BW-LABEL: splatconstant_funnnel_v16i32:
1431 ; AVX512BW: # %bb.0:
1432 ; AVX512BW-NEXT: vpsrld $4, %zmm1, %zmm1
1433 ; AVX512BW-NEXT: vpslld $28, %zmm0, %zmm0
1434 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
1435 ; AVX512BW-NEXT: retq
1437 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i32:
1438 ; AVX512VBMI2: # %bb.0:
1439 ; AVX512VBMI2-NEXT: vpshrdd $4, %zmm0, %zmm1, %zmm0
1440 ; AVX512VBMI2-NEXT: retq
1442 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i32:
1443 ; AVX512VLBW: # %bb.0:
1444 ; AVX512VLBW-NEXT: vpsrld $4, %zmm1, %zmm1
1445 ; AVX512VLBW-NEXT: vpslld $28, %zmm0, %zmm0
1446 ; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
1447 ; AVX512VLBW-NEXT: retq
1449 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i32:
1450 ; AVX512VLVBMI2: # %bb.0:
1451 ; AVX512VLVBMI2-NEXT: vpshrdd $4, %zmm0, %zmm1, %zmm0
1452 ; AVX512VLVBMI2-NEXT: retq
1453 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
; Funnel shift right (llvm.fshr) of <32 x i16> where the shift amount is the
; splat constant 7 in every lane. Three distinct lowerings are expected.
;  * The avx512f and avx512f+vl runs split both inputs into 256-bit halves
;    (vextracti64x4), apply vpsrlw $7 / vpsllw $9 (16 - 7) / vpor to each half
;    in ymm registers, and rejoin the halves with vinserti64x4.
;  * The bw and vl+bw runs do the same shift/shift/or directly on zmm
;    registers (vpsrlw $7, vpsllw $9, vporq).
;  * The two VBMI2-enabled runs expect a single vpshrdw $7.
1457 define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
1458 ; AVX512F-LABEL: splatconstant_funnnel_v32i16:
1460 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1461 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
1462 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3
1463 ; AVX512F-NEXT: vpsllw $9, %ymm2, %ymm2
1464 ; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
1465 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1
1466 ; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0
1467 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1468 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1469 ; AVX512F-NEXT: retq
1471 ; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
1472 ; AVX512VL: # %bb.0:
1473 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1474 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
1475 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm3
1476 ; AVX512VL-NEXT: vpsllw $9, %ymm2, %ymm2
1477 ; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
1478 ; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm1
1479 ; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0
1480 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1481 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1482 ; AVX512VL-NEXT: retq
1484 ; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
1485 ; AVX512BW: # %bb.0:
1486 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1
1487 ; AVX512BW-NEXT: vpsllw $9, %zmm0, %zmm0
1488 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1489 ; AVX512BW-NEXT: retq
1491 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16:
1492 ; AVX512VBMI2: # %bb.0:
1493 ; AVX512VBMI2-NEXT: vpshrdw $7, %zmm0, %zmm1, %zmm0
1494 ; AVX512VBMI2-NEXT: retq
1496 ; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
1497 ; AVX512VLBW: # %bb.0:
1498 ; AVX512VLBW-NEXT: vpsrlw $7, %zmm1, %zmm1
1499 ; AVX512VLBW-NEXT: vpsllw $9, %zmm0, %zmm0
1500 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1501 ; AVX512VLBW-NEXT: retq
1503 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16:
1504 ; AVX512VLVBMI2: # %bb.0:
1505 ; AVX512VLVBMI2-NEXT: vpshrdw $7, %zmm0, %zmm1, %zmm0
1506 ; AVX512VLVBMI2-NEXT: retq
1507 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1511 define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
1512 ; AVX512F-LABEL: splatconstant_funnnel_v64i8:
1514 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1515 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
1516 ; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm3
1517 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1518 ; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3
1519 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm2
1520 ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
1521 ; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
1522 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
1523 ; AVX512F-NEXT: vpandn %ymm1, %ymm4, %ymm1
1524 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
1525 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
1526 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1527 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1528 ; AVX512F-NEXT: retq
1530 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
1531 ; AVX512VL: # %bb.0:
1532 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1533 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
1534 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3
1535 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1536 ; AVX512VL-NEXT: vpandn %ymm3, %ymm4, %ymm3
1537 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2
1538 ; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
1539 ; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
1540 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
1541 ; AVX512VL-NEXT: vpandn %ymm1, %ymm4, %ymm1
1542 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
1543 ; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
1544 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1545 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
1546 ; AVX512VL-NEXT: retq
1548 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
1549 ; AVX512BW: # %bb.0:
1550 ; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm1
1551 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
1552 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm0
1553 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
1554 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1555 ; AVX512BW-NEXT: retq
1557 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
1558 ; AVX512VBMI2: # %bb.0:
1559 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm1
1560 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
1561 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
1562 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
1563 ; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
1564 ; AVX512VBMI2-NEXT: retq
1566 ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
1567 ; AVX512VLBW: # %bb.0:
1568 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm1
1569 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
1570 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm0
1571 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
1572 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1573 ; AVX512VLBW-NEXT: retq
1575 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
1576 ; AVX512VLVBMI2: # %bb.0:
1577 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm1
1578 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
1579 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
1580 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
1581 ; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
1582 ; AVX512VLVBMI2-NEXT: retq
1583 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)