1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
; Declarations of the funnel-shift-right intrinsics exercised below. Every
; call site in this file passes the same value for both data operands
; (fshr(x, x, amt)), which makes each call a variable rotate-right.
11 declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
12 declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
13 declare <16 x i16> @llvm.fshr.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
14 declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
; Per-element rotate-right of <4 x i64> by a variable amount.
; fshr(x, x, amt) == rotate-right, so:
; - AVX512* lower to a single vprorvq (zmm with kill markers when VL is absent).
; - XOP lowers to vprotq (a rotate-left, hence the vpsubq negation of the
;   amount first), one per 128-bit half.
; - AVX2 expands to masked (vpand [63,...]) vpsrlvq/vpsllvq plus vpor.
; - AVX1 has no variable 64-bit shifts, so each half does two vpsrlq/vpsllq
;   with per-lane amounts recombined via vpshufd/vpblendw, then vorps.
20 define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
21 ; AVX1-LABEL: var_funnnel_v4i64:
23 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
24 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm3
25 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
26 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm5
27 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
28 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm4
29 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
30 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm5
31 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
32 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3
33 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
34 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
35 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
36 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
37 ; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
38 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63]
39 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
40 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm7
41 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
42 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm2
43 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7]
44 ; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
45 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
46 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm4
47 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
48 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
49 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
50 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
51 ; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0
54 ; AVX2-LABEL: var_funnnel_v4i64:
56 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
57 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
58 ; AVX2-NEXT: vpsrlvq %ymm3, %ymm0, %ymm3
59 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
60 ; AVX2-NEXT: vpsubq %ymm1, %ymm4, %ymm1
61 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
62 ; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
63 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
66 ; AVX512F-LABEL: var_funnnel_v4i64:
68 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
69 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
70 ; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0
71 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
74 ; AVX512VL-LABEL: var_funnnel_v4i64:
76 ; AVX512VL-NEXT: vprorvq %ymm1, %ymm0, %ymm0
79 ; AVX512BW-LABEL: var_funnnel_v4i64:
81 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
82 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
83 ; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0
84 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
87 ; AVX512VLBW-LABEL: var_funnnel_v4i64:
88 ; AVX512VLBW: # %bb.0:
89 ; AVX512VLBW-NEXT: vprorvq %ymm1, %ymm0, %ymm0
90 ; AVX512VLBW-NEXT: retq
92 ; XOPAVX1-LABEL: var_funnnel_v4i64:
94 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
95 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
96 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
97 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
98 ; XOPAVX1-NEXT: vprotq %xmm2, %xmm4, %xmm2
99 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
100 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
101 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
104 ; XOPAVX2-LABEL: var_funnnel_v4i64:
106 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
107 ; XOPAVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
108 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
109 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
110 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
111 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
112 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
114 %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %amt)
; Per-element rotate-right of <8 x i32> by a variable amount.
; - AVX512* lower to a single vprorvd.
; - XOP uses vprotd (rotate-left, so the amount is first negated via vpsubd).
; - AVX2 masks the negated amount with [31,...] and combines vpsllvd with
;   vpsrlvd by (32 - amt), then vpor.
; - AVX1 has no variable 32-bit shifts: each half builds per-lane power-of-two
;   multipliers via the float-exponent trick (vpslld $23, vpaddd 1065353216
;   i.e. 0x3f800000/1.0f, vcvttps2dq) and performs the rotate as a 32x32->64
;   vpmuludq, merging the odd/even lane products with vpshufd/vpblendw/vpor.
118 define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
119 ; AVX1-LABEL: var_funnnel_v8i32:
121 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
122 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
123 ; AVX1-NEXT: vpsubd %xmm2, %xmm8, %xmm2
124 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
125 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
126 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
127 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
128 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
129 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
130 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
131 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
132 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
133 ; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3
134 ; AVX1-NEXT: vpmuludq %xmm2, %xmm7, %xmm2
135 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
136 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7]
137 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
138 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
139 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
140 ; AVX1-NEXT: vpsubd %xmm1, %xmm8, %xmm1
141 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
142 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
143 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
144 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
145 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
146 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
147 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
148 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
149 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
150 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
151 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
152 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
153 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
154 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
157 ; AVX2-LABEL: var_funnnel_v8i32:
159 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
160 ; AVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm1
161 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
162 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
163 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2
164 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
165 ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
166 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
167 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
170 ; AVX512F-LABEL: var_funnnel_v8i32:
172 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
173 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
174 ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0
175 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
178 ; AVX512VL-LABEL: var_funnnel_v8i32:
180 ; AVX512VL-NEXT: vprorvd %ymm1, %ymm0, %ymm0
181 ; AVX512VL-NEXT: retq
183 ; AVX512BW-LABEL: var_funnnel_v8i32:
185 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
186 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
187 ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0
188 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
189 ; AVX512BW-NEXT: retq
191 ; AVX512VLBW-LABEL: var_funnnel_v8i32:
192 ; AVX512VLBW: # %bb.0:
193 ; AVX512VLBW-NEXT: vprorvd %ymm1, %ymm0, %ymm0
194 ; AVX512VLBW-NEXT: retq
196 ; XOPAVX1-LABEL: var_funnnel_v8i32:
198 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
199 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
200 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
201 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
202 ; XOPAVX1-NEXT: vprotd %xmm2, %xmm4, %xmm2
203 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
204 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
205 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
208 ; XOPAVX2-LABEL: var_funnnel_v8i32:
210 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
211 ; XOPAVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm1
212 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
213 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
214 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
215 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
216 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
218 %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %amt)
; Per-element rotate-right of <16 x i16> by a variable amount. There is no
; 16-bit rotate instruction outside XOP, so:
; - XOP uses vprotw on each half (amount negated first via vpsubw).
; - AVX512BW/VLBW use native 16-bit variable shifts vpsllvw/vpsrlvw and vpor.
; - AVX512F/VL lack vpsllvw, so they zero-extend to <16 x i32> (vpmovzxwd),
;   shift with vpsllvd/vpsrlvd, vpord, and truncate back with vpmovdw.
; - AVX2 interleaves words with zero (vpunpck{l,h}wd) to get 32-bit lanes for
;   vpsllvd/vpsrlvd, then repacks with vpsrld $16 / vpackusdw.
; - AVX1 builds per-lane multipliers with the vpslld $23 float-exponent trick
;   and uses vpmullw/vpmulhuw pairs to produce low/high rotate halves.
222 define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
223 ; AVX1-LABEL: var_funnnel_v16i16:
225 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
226 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
227 ; AVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
228 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
229 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
230 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
231 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
232 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
233 ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
234 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
235 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
236 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
237 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
238 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
239 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
240 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
241 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm7
242 ; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2
243 ; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
244 ; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1
245 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
246 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
247 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
248 ; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
249 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
250 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
251 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
252 ; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1
253 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
254 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
255 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
256 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
257 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
258 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
261 ; AVX2-LABEL: var_funnnel_v16i16:
263 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
264 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
265 ; AVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm1
266 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
267 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
268 ; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm4
269 ; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
270 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
271 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
272 ; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5
273 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
274 ; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
275 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
276 ; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1
277 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
278 ; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3
279 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
280 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
281 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
282 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
283 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
284 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
287 ; AVX512F-LABEL: var_funnnel_v16i16:
289 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
290 ; AVX512F-NEXT: vpsubw %ymm1, %ymm2, %ymm1
291 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
292 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
293 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
294 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
295 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
296 ; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1
297 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
298 ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
299 ; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0
300 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
303 ; AVX512VL-LABEL: var_funnnel_v16i16:
305 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
306 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm2, %ymm1
307 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
308 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
309 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
310 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
311 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
312 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm3, %ymm1
313 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
314 ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
315 ; AVX512VL-NEXT: vpord %zmm0, %zmm2, %zmm0
316 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
317 ; AVX512VL-NEXT: retq
319 ; AVX512BW-LABEL: var_funnnel_v16i16:
321 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
322 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
323 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm2, %ymm1
324 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
325 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
326 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
327 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
328 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
329 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
330 ; AVX512BW-NEXT: retq
332 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
333 ; AVX512VLBW: # %bb.0:
334 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
335 ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm2, %ymm1
336 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
337 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
338 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
339 ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
340 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
341 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
342 ; AVX512VLBW-NEXT: retq
344 ; XOPAVX1-LABEL: var_funnnel_v16i16:
346 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
347 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
348 ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
349 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
350 ; XOPAVX1-NEXT: vprotw %xmm2, %xmm4, %xmm2
351 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1
352 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
353 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
356 ; XOPAVX2-LABEL: var_funnnel_v16i16:
358 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
359 ; XOPAVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm1
360 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
361 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
362 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
363 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
364 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
366 %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %amt)
; Per-element rotate-right of <32 x i8> by a variable amount. x86 has no
; per-byte variable shifts, so:
; - XOP uses vprotb per half (amount negated first via vpsubb).
; - AVX512BW/VLBW zero-extend bytes to words (vpmovzxbw), do vpsrlvw by
;   (amt & 7) and vpsllvw by (-amt & 7), vporq, and truncate with vpmovwb.
; - AVX1/AVX2/AVX512F/VL use a vpblendvb ladder: the amount is placed in the
;   byte sign bits via vpsllw $5, then conditional rotate-by-4 (shift pair
;   masked with 240/15), rotate-by-2 (masked with 252), and rotate-by-1
;   (vpsrlw $7 masked with 1, vpaddb for the left shift) steps are blended in,
;   doubling the selector with vpaddb between steps.
; - AVX512VL additionally folds the and/andn/or mask combining of each shift
;   pair into a single vpternlogq $216.
370 define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
371 ; AVX1-LABEL: var_funnnel_v32i8:
373 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
374 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
375 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
376 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
377 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
378 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
379 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
380 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
381 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
382 ; AVX1-NEXT: vpsubb %xmm5, %xmm8, %xmm5
383 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
384 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
385 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
386 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
387 ; AVX1-NEXT: vpandn %xmm3, %xmm7, %xmm3
388 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm6
389 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
390 ; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3
391 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
392 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
393 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
394 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
395 ; AVX1-NEXT: vpand %xmm9, %xmm3, %xmm3
396 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm6
397 ; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3
398 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
399 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
400 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
401 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
402 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
403 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
404 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
405 ; AVX1-NEXT: vpsubb %xmm1, %xmm8, %xmm1
406 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
407 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
408 ; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3
409 ; AVX1-NEXT: vpandn %xmm3, %xmm7, %xmm3
410 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
411 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
412 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
413 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
414 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
415 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
416 ; AVX1-NEXT: vpand %xmm9, %xmm3, %xmm3
417 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
418 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
419 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
420 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
421 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
424 ; AVX2-LABEL: var_funnnel_v32i8:
426 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
427 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
428 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
429 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
430 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
431 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
432 ; AVX2-NEXT: vpsubb %ymm1, %ymm3, %ymm1
433 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
434 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
435 ; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
436 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
437 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
438 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
439 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
440 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
441 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
442 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
443 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
444 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
445 ; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
446 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
447 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
450 ; AVX512F-LABEL: var_funnnel_v32i8:
452 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
453 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
454 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
455 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
456 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
457 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
458 ; AVX512F-NEXT: vpsubb %ymm1, %ymm3, %ymm1
459 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
460 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
461 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm2
462 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
463 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
464 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
465 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
466 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
467 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
468 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
469 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
470 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
471 ; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
472 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
473 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
476 ; AVX512VL-LABEL: var_funnnel_v32i8:
478 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
479 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
480 ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
481 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
482 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm2, %ymm1
483 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
484 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
485 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
486 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
487 ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
488 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
489 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
490 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
491 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
492 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
493 ; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
494 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
495 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
496 ; AVX512VL-NEXT: retq
498 ; AVX512BW-LABEL: var_funnnel_v32i8:
500 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
501 ; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm3
502 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
503 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
504 ; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
505 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
506 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
507 ; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
508 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
509 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
510 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
511 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
512 ; AVX512BW-NEXT: retq
514 ; AVX512VLBW-LABEL: var_funnnel_v32i8:
515 ; AVX512VLBW: # %bb.0:
516 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
517 ; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm3
518 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
519 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
520 ; AVX512VLBW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
521 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
522 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
523 ; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
524 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
525 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
526 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
527 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
528 ; AVX512VLBW-NEXT: retq
530 ; XOPAVX1-LABEL: var_funnnel_v32i8:
532 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
533 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
534 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
535 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
536 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm4, %xmm2
537 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
538 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
539 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
542 ; XOPAVX2-LABEL: var_funnnel_v32i8:
544 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
545 ; XOPAVX2-NEXT: vpsubb %ymm1, %ymm2, %ymm1
546 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
547 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
548 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
549 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
550 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
552 %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %amt)
557 ; Uniform Variable Shifts
; Rotate-right of <4 x i64> where all lanes use the same (splatted) amount.
; The splat lets most targets use the scalar-amount shift forms:
; - AVX512* still emit a single vprorvq after broadcasting the amount.
; - XOPAVX1 splats with vpshufd, negates once, and reuses one vprotq amount
;   for both halves; XOPAVX2 broadcasts then negates the whole vector.
; - AVX2 uses vpsrlq/vpsllq with xmm shift counts masked to [63,63].
; - AVX1 does the same per 128-bit half, combining with vinsertf128/vorps.
560 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
561 ; AVX1-LABEL: splatvar_funnnel_v4i64:
563 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
564 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
565 ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
566 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
567 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
568 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
569 ; AVX1-NEXT: vpsllq %xmm2, %xmm4, %xmm5
570 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
571 ; AVX1-NEXT: vpsllq %xmm6, %xmm4, %xmm7
572 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
573 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm2
574 ; AVX1-NEXT: vpsllq %xmm6, %xmm0, %xmm6
575 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
576 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
577 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
578 ; AVX1-NEXT: vpsrlq %xmm1, %xmm4, %xmm3
579 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
580 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
581 ; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
584 ; AVX2-LABEL: splatvar_funnnel_v4i64:
586 ; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
587 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
588 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
589 ; AVX2-NEXT: vpsrlq %xmm3, %ymm0, %ymm3
590 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
591 ; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
592 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
593 ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
594 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
597 ; AVX512F-LABEL: splatvar_funnnel_v4i64:
599 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
600 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
601 ; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0
602 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
605 ; AVX512VL-LABEL: splatvar_funnnel_v4i64:
607 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
608 ; AVX512VL-NEXT: vprorvq %ymm1, %ymm0, %ymm0
609 ; AVX512VL-NEXT: retq
611 ; AVX512BW-LABEL: splatvar_funnnel_v4i64:
613 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
614 ; AVX512BW-NEXT: vpbroadcastq %xmm1, %ymm1
615 ; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0
616 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
617 ; AVX512BW-NEXT: retq
619 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
620 ; AVX512VLBW: # %bb.0:
621 ; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %ymm1
622 ; AVX512VLBW-NEXT: vprorvq %ymm1, %ymm0, %ymm0
623 ; AVX512VLBW-NEXT: retq
625 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
627 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
628 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
629 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
630 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
631 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
632 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
633 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
636 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
638 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %ymm1
639 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
640 ; XOPAVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
641 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
642 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
643 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
644 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
645 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
647 %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
648 %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %splat)
652 define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
653 ; AVX1-LABEL: splatvar_funnnel_v8i32:
655 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
656 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
657 ; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
658 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
659 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
660 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
661 ; AVX1-NEXT: vpslld %xmm2, %xmm3, %xmm4
662 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
663 ; AVX1-NEXT: vpsubd %xmm1, %xmm5, %xmm1
664 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
665 ; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm3
666 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
667 ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2
668 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
669 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
670 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
673 ; AVX2-LABEL: splatvar_funnnel_v8i32:
675 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
676 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
677 ; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1
678 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
679 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
680 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
681 ; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm2
682 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
683 ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
684 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
685 ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
686 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
689 ; AVX512F-LABEL: splatvar_funnnel_v8i32:
691 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
692 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
693 ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0
694 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
697 ; AVX512VL-LABEL: splatvar_funnnel_v8i32:
699 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %ymm1
700 ; AVX512VL-NEXT: vprorvd %ymm1, %ymm0, %ymm0
701 ; AVX512VL-NEXT: retq
703 ; AVX512BW-LABEL: splatvar_funnnel_v8i32:
705 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
706 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1
707 ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0
708 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
709 ; AVX512BW-NEXT: retq
711 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
712 ; AVX512VLBW: # %bb.0:
713 ; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %ymm1
714 ; AVX512VLBW-NEXT: vprorvd %ymm1, %ymm0, %ymm0
715 ; AVX512VLBW-NEXT: retq
717 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
719 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
720 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
721 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
722 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
723 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
724 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
725 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
728 ; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
730 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1
731 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
732 ; XOPAVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm1
733 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
734 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
735 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
736 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
737 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
739 %splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
740 %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %splat)
744 define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
745 ; AVX1-LABEL: splatvar_funnnel_v16i16:
747 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
748 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
749 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
750 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
751 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
752 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
753 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
754 ; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm4
755 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
756 ; AVX1-NEXT: vpsubw %xmm1, %xmm5, %xmm1
757 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
758 ; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
759 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
760 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm2
761 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
762 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
763 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
766 ; AVX2-LABEL: splatvar_funnnel_v16i16:
768 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
769 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
770 ; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
771 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
772 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
773 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2
774 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
775 ; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
776 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
777 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
778 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
781 ; AVX512-LABEL: splatvar_funnnel_v16i16:
783 ; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1
784 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
785 ; AVX512-NEXT: vpsubw %xmm1, %xmm2, %xmm1
786 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
787 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
788 ; AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm2
789 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
790 ; AVX512-NEXT: vpsubw %xmm1, %xmm3, %xmm1
791 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
792 ; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
793 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
796 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
798 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
799 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
800 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
801 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
802 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
803 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
804 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
805 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
808 ; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
810 ; XOPAVX2-NEXT: vpbroadcastw %xmm1, %ymm1
811 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
812 ; XOPAVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm1
813 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
814 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
815 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
816 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
817 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
819 %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
820 %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %splat)
824 define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
825 ; AVX1-LABEL: splatvar_funnnel_v32i8:
827 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
828 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
829 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
830 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
831 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
832 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
833 ; AVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm5
834 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
835 ; AVX1-NEXT: vpsllw %xmm3, %xmm6, %xmm7
836 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
837 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
838 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
839 ; AVX1-NEXT: vpsubb %xmm1, %xmm7, %xmm1
840 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
841 ; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
842 ; AVX1-NEXT: vpsrlw %xmm1, %xmm6, %xmm6
843 ; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
844 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
845 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
846 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
847 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
848 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
849 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
850 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
851 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
854 ; AVX2-LABEL: splatvar_funnnel_v32i8:
856 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
857 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
858 ; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
859 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
860 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
861 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
862 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
863 ; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
864 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
865 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
866 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
867 ; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
868 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
869 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
870 ; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
871 ; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
872 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
873 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
874 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
877 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
879 ; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
880 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
881 ; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1
882 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
883 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
884 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
885 ; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
886 ; AVX512F-NEXT: vpsllw %xmm2, %xmm4, %xmm2
887 ; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
888 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
889 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
890 ; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
891 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
892 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
893 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
894 ; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1
895 ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
896 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
897 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
900 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
902 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
903 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
904 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1
905 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
906 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
907 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
908 ; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
909 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm4, %xmm2
910 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
911 ; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
912 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
913 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
914 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
915 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
916 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
917 ; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
918 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
919 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
920 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
921 ; AVX512VL-NEXT: retq
923 ; AVX512BW-LABEL: splatvar_funnnel_v32i8:
925 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
926 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
927 ; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm3
928 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
929 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
930 ; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
931 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
932 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
933 ; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
934 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
935 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
936 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
937 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
938 ; AVX512BW-NEXT: retq
940 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
941 ; AVX512VLBW: # %bb.0:
942 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %ymm1
943 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
944 ; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm3
945 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
946 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
947 ; AVX512VLBW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
948 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
949 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
950 ; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
951 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
952 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
953 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
954 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
955 ; AVX512VLBW-NEXT: retq
957 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
959 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
960 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
961 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
962 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
963 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2
964 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
965 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
968 ; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
970 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1
971 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
972 ; XOPAVX2-NEXT: vpsubb %ymm1, %ymm2, %ymm1
973 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
974 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
975 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
976 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
977 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
979 %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
980 %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %splat)
988 define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind {
989 ; AVX1-LABEL: constant_funnnel_v4i64:
991 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
992 ; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm2
993 ; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm3
994 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
995 ; AVX1-NEXT: vpsrlq $14, %xmm0, %xmm3
996 ; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm4
997 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
998 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
999 ; AVX1-NEXT: vpsllq $4, %xmm1, %xmm3
1000 ; AVX1-NEXT: vpsllq $14, %xmm1, %xmm1
1001 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1002 ; AVX1-NEXT: vpsllq $50, %xmm0, %xmm3
1003 ; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
1004 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
1005 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1006 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
1009 ; AVX2-LABEL: constant_funnnel_v4i64:
1011 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm1
1012 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
1013 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1016 ; AVX512F-LABEL: constant_funnnel_v4i64:
1018 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1019 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1020 ; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0
1021 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1022 ; AVX512F-NEXT: retq
1024 ; AVX512VL-LABEL: constant_funnnel_v4i64:
1025 ; AVX512VL: # %bb.0:
1026 ; AVX512VL-NEXT: vprorvq {{.*}}(%rip), %ymm0, %ymm0
1027 ; AVX512VL-NEXT: retq
1029 ; AVX512BW-LABEL: constant_funnnel_v4i64:
1030 ; AVX512BW: # %bb.0:
1031 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1032 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1033 ; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0
1034 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1035 ; AVX512BW-NEXT: retq
1037 ; AVX512VLBW-LABEL: constant_funnnel_v4i64:
1038 ; AVX512VLBW: # %bb.0:
1039 ; AVX512VLBW-NEXT: vprorvq {{.*}}(%rip), %ymm0, %ymm0
1040 ; AVX512VLBW-NEXT: retq
1042 ; XOPAVX1-LABEL: constant_funnnel_v4i64:
1044 ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
1045 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1046 ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
1047 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1048 ; XOPAVX1-NEXT: retq
1050 ; XOPAVX2-LABEL: constant_funnnel_v4i64:
1052 ; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
1053 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1054 ; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
1055 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1056 ; XOPAVX2-NEXT: retq
1057 %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
1061 define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
1062 ; AVX1-LABEL: constant_funnnel_v8i32:
1064 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16777216,8388608,4194304,2097152]
1065 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1066 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1067 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
1068 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
1069 ; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
1070 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
1071 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
1072 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
1073 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1074 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1075 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
1076 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
1077 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1078 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
1079 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
1080 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1081 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1082 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
1083 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
1084 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1085 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1088 ; AVX2-LABEL: constant_funnnel_v8i32:
1090 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm1
1091 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
1092 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1095 ; AVX512F-LABEL: constant_funnnel_v8i32:
1097 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1098 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1099 ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0
1100 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1101 ; AVX512F-NEXT: retq
1103 ; AVX512VL-LABEL: constant_funnnel_v8i32:
1104 ; AVX512VL: # %bb.0:
1105 ; AVX512VL-NEXT: vprorvd {{.*}}(%rip), %ymm0, %ymm0
1106 ; AVX512VL-NEXT: retq
1108 ; AVX512BW-LABEL: constant_funnnel_v8i32:
1109 ; AVX512BW: # %bb.0:
1110 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1111 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1112 ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0
1113 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1114 ; AVX512BW-NEXT: retq
1116 ; AVX512VLBW-LABEL: constant_funnnel_v8i32:
1117 ; AVX512VLBW: # %bb.0:
1118 ; AVX512VLBW-NEXT: vprorvd {{.*}}(%rip), %ymm0, %ymm0
1119 ; AVX512VLBW-NEXT: retq
1121 ; XOPAVX1-LABEL: constant_funnnel_v8i32:
1123 ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
1124 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1125 ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
1126 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1127 ; XOPAVX1-NEXT: retq
1129 ; XOPAVX2-LABEL: constant_funnnel_v8i32:
1131 ; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
1132 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1133 ; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
1134 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1135 ; XOPAVX2-NEXT: retq
1136 %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
1140 define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind {
1141 ; AVX1-LABEL: constant_funnnel_v16i16:
1143 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1144 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,128,64,32,16,8,4,2]
1145 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
1146 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
1147 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1148 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,32768,16384,8192,4096,2048,1024,512]
1149 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3
1150 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1151 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1152 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1155 ; AVX2-LABEL: constant_funnnel_v16i16:
1157 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
1158 ; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1159 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1160 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
1163 ; AVX512F-LABEL: constant_funnnel_v16i16:
1165 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
1166 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1167 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1168 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
1169 ; AVX512F-NEXT: retq
1171 ; AVX512VL-LABEL: constant_funnnel_v16i16:
1172 ; AVX512VL: # %bb.0:
1173 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
1174 ; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1175 ; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1176 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
1177 ; AVX512VL-NEXT: retq
1179 ; AVX512BW-LABEL: constant_funnnel_v16i16:
1180 ; AVX512BW: # %bb.0:
1181 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1182 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1183 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
1184 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
1185 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1186 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1187 ; AVX512BW-NEXT: retq
1189 ; AVX512VLBW-LABEL: constant_funnnel_v16i16:
1190 ; AVX512VLBW: # %bb.0:
1191 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
1192 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
1193 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1194 ; AVX512VLBW-NEXT: retq
1196 ; XOPAVX1-LABEL: constant_funnnel_v16i16:
1198 ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
1199 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1200 ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
1201 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1202 ; XOPAVX1-NEXT: retq
1204 ; XOPAVX2-LABEL: constant_funnnel_v16i16:
1206 ; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
1207 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1208 ; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
1209 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1210 ; XOPAVX2-NEXT: retq
1211 %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
1215 define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
1216 ; AVX1-LABEL: constant_funnnel_v32i8:
1218 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1219 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
1220 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1221 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [256,2,4,8,16,32,64,128]
1222 ; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3
1223 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1224 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1225 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [256,128,64,32,16,8,4,2]
1226 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm7
1227 ; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
1228 ; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
1229 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1230 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,2,4,8,16,32,64,128]
1231 ; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1
1232 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1233 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1234 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2]
1235 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5
1236 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
1237 ; AVX1-NEXT: vpackuswb %xmm1, %xmm5, %xmm1
1238 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1239 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1240 ; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3
1241 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1242 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1243 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm6
1244 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
1245 ; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
1246 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1247 ; AVX1-NEXT: vpmullw %xmm7, %xmm0, %xmm0
1248 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1249 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
1250 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
1251 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
1252 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1253 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1256 ; AVX2-LABEL: constant_funnnel_v32i8:
1258 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
1259 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1260 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
1261 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
1262 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1263 ; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3
1264 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1265 ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1266 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1267 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1268 ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1269 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1270 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1271 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1272 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
1273 ; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
1274 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1275 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1276 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1277 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1278 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
1281 ; AVX512F-LABEL: constant_funnnel_v32i8:
1283 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
1284 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1285 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
1286 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1287 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1288 ; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3
1289 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1290 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1291 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1292 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1293 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1294 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1295 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
1296 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1297 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
1298 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
1299 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1300 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1301 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
1302 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1303 ; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0
1304 ; AVX512F-NEXT: retq
1306 ; AVX512VL-LABEL: constant_funnnel_v32i8:
1307 ; AVX512VL: # %bb.0:
1308 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1309 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1310 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
1311 ; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
1312 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1313 ; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3
1314 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1315 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1316 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1317 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1318 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1319 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1320 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1321 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1322 ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
1323 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1324 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1325 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1326 ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1327 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1328 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1329 ; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0
1330 ; AVX512VL-NEXT: retq
1332 ; AVX512BW-LABEL: constant_funnnel_v32i8:
1333 ; AVX512BW: # %bb.0:
1334 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1335 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
1336 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
1337 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1338 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1339 ; AVX512BW-NEXT: retq
1341 ; AVX512VLBW-LABEL: constant_funnnel_v32i8:
1342 ; AVX512VLBW: # %bb.0:
1343 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1344 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
1345 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
1346 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1347 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
1348 ; AVX512VLBW-NEXT: retq
1350 ; XOPAVX1-LABEL: constant_funnnel_v32i8:
1352 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1353 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,248,249,250,251,252,253,254,255]
1354 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1
1355 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0
1356 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1357 ; XOPAVX1-NEXT: retq
1359 ; XOPAVX2-LABEL: constant_funnnel_v32i8:
1361 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1362 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,248,249,250,251,252,253,254,255]
1363 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1
1364 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0
1365 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1366 ; XOPAVX2-NEXT: retq
1367 %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
1372 ; Uniform Constant Shifts
1375 define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x) nounwind {
1376 ; AVX1-LABEL: splatconstant_funnnel_v4i64:
1378 ; AVX1-NEXT: vpsrlq $14, %xmm0, %xmm1
1379 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1380 ; AVX1-NEXT: vpsrlq $14, %xmm2, %xmm3
1381 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1382 ; AVX1-NEXT: vpsllq $50, %xmm0, %xmm0
1383 ; AVX1-NEXT: vpsllq $50, %xmm2, %xmm2
1384 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1385 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1388 ; AVX2-LABEL: splatconstant_funnnel_v4i64:
1390 ; AVX2-NEXT: vpsrlq $14, %ymm0, %ymm1
1391 ; AVX2-NEXT: vpsllq $50, %ymm0, %ymm0
1392 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1395 ; AVX512F-LABEL: splatconstant_funnnel_v4i64:
1397 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1398 ; AVX512F-NEXT: vprorq $14, %zmm0, %zmm0
1399 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1400 ; AVX512F-NEXT: retq
1402 ; AVX512VL-LABEL: splatconstant_funnnel_v4i64:
1403 ; AVX512VL: # %bb.0:
1404 ; AVX512VL-NEXT: vprorq $14, %ymm0, %ymm0
1405 ; AVX512VL-NEXT: retq
1407 ; AVX512BW-LABEL: splatconstant_funnnel_v4i64:
1408 ; AVX512BW: # %bb.0:
1409 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1410 ; AVX512BW-NEXT: vprorq $14, %zmm0, %zmm0
1411 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1412 ; AVX512BW-NEXT: retq
1414 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64:
1415 ; AVX512VLBW: # %bb.0:
1416 ; AVX512VLBW-NEXT: vprorq $14, %ymm0, %ymm0
1417 ; AVX512VLBW-NEXT: retq
1419 ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
1421 ; XOPAVX1-NEXT: vprotq $50, %xmm0, %xmm1
1422 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1423 ; XOPAVX1-NEXT: vprotq $50, %xmm0, %xmm0
1424 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1425 ; XOPAVX1-NEXT: retq
1427 ; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
1429 ; XOPAVX2-NEXT: vprotq $50, %xmm0, %xmm1
1430 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1431 ; XOPAVX2-NEXT: vprotq $50, %xmm0, %xmm0
1432 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1433 ; XOPAVX2-NEXT: retq
1434 %res = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
1438 define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x) nounwind {
1439 ; AVX1-LABEL: splatconstant_funnnel_v8i32:
1441 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1442 ; AVX1-NEXT: vpsrld $4, %xmm1, %xmm2
1443 ; AVX1-NEXT: vpslld $28, %xmm1, %xmm1
1444 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1445 ; AVX1-NEXT: vpsrld $4, %xmm0, %xmm2
1446 ; AVX1-NEXT: vpslld $28, %xmm0, %xmm0
1447 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1448 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1451 ; AVX2-LABEL: splatconstant_funnnel_v8i32:
1453 ; AVX2-NEXT: vpsrld $4, %ymm0, %ymm1
1454 ; AVX2-NEXT: vpslld $28, %ymm0, %ymm0
1455 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1458 ; AVX512F-LABEL: splatconstant_funnnel_v8i32:
1460 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1461 ; AVX512F-NEXT: vprord $4, %zmm0, %zmm0
1462 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1463 ; AVX512F-NEXT: retq
1465 ; AVX512VL-LABEL: splatconstant_funnnel_v8i32:
1466 ; AVX512VL: # %bb.0:
1467 ; AVX512VL-NEXT: vprord $4, %ymm0, %ymm0
1468 ; AVX512VL-NEXT: retq
1470 ; AVX512BW-LABEL: splatconstant_funnnel_v8i32:
1471 ; AVX512BW: # %bb.0:
1472 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1473 ; AVX512BW-NEXT: vprord $4, %zmm0, %zmm0
1474 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1475 ; AVX512BW-NEXT: retq
1477 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32:
1478 ; AVX512VLBW: # %bb.0:
1479 ; AVX512VLBW-NEXT: vprord $4, %ymm0, %ymm0
1480 ; AVX512VLBW-NEXT: retq
1482 ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
1484 ; XOPAVX1-NEXT: vprotd $28, %xmm0, %xmm1
1485 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1486 ; XOPAVX1-NEXT: vprotd $28, %xmm0, %xmm0
1487 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1488 ; XOPAVX1-NEXT: retq
1490 ; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
1492 ; XOPAVX2-NEXT: vprotd $28, %xmm0, %xmm1
1493 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1494 ; XOPAVX2-NEXT: vprotd $28, %xmm0, %xmm0
1495 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1496 ; XOPAVX2-NEXT: retq
1497 %res = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
1501 define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x) nounwind {
1502 ; AVX1-LABEL: splatconstant_funnnel_v16i16:
1504 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1505 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm2
1506 ; AVX1-NEXT: vpsllw $9, %xmm1, %xmm1
1507 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1508 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm2
1509 ; AVX1-NEXT: vpsllw $9, %xmm0, %xmm0
1510 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1511 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1514 ; AVX2-LABEL: splatconstant_funnnel_v16i16:
1516 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm1
1517 ; AVX2-NEXT: vpsllw $9, %ymm0, %ymm0
1518 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1521 ; AVX512-LABEL: splatconstant_funnnel_v16i16:
1523 ; AVX512-NEXT: vpsrlw $7, %ymm0, %ymm1
1524 ; AVX512-NEXT: vpsllw $9, %ymm0, %ymm0
1525 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
1528 ; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
1530 ; XOPAVX1-NEXT: vprotw $9, %xmm0, %xmm1
1531 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1532 ; XOPAVX1-NEXT: vprotw $9, %xmm0, %xmm0
1533 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1534 ; XOPAVX1-NEXT: retq
1536 ; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
1538 ; XOPAVX2-NEXT: vprotw $9, %xmm0, %xmm1
1539 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1540 ; XOPAVX2-NEXT: vprotw $9, %xmm0, %xmm0
1541 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1542 ; XOPAVX2-NEXT: retq
1543 %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1547 define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
1548 ; AVX1-LABEL: splatconstant_funnnel_v32i8:
1550 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1551 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
1552 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1553 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1554 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
1555 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1556 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1557 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
1558 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1559 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
1560 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1561 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1562 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1565 ; AVX2-LABEL: splatconstant_funnnel_v32i8:
1567 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
1568 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1569 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
1570 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1571 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1574 ; AVX512F-LABEL: splatconstant_funnnel_v32i8:
1576 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
1577 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1578 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
1579 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1580 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1581 ; AVX512F-NEXT: retq
1583 ; AVX512VL-LABEL: splatconstant_funnnel_v32i8:
1584 ; AVX512VL: # %bb.0:
1585 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1586 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
1587 ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
1588 ; AVX512VL-NEXT: retq
1590 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
1591 ; AVX512BW: # %bb.0:
1592 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm1
1593 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1594 ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm0
1595 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1596 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1597 ; AVX512BW-NEXT: retq
1599 ; AVX512VLBW-LABEL: splatconstant_funnnel_v32i8:
1600 ; AVX512VLBW: # %bb.0:
1601 ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
1602 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
1603 ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
1604 ; AVX512VLBW-NEXT: retq
1606 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
1608 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
1609 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1610 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
1611 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1612 ; XOPAVX1-NEXT: retq
1614 ; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
1616 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
1617 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1618 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
1619 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1620 ; XOPAVX2-NEXT: retq
1621 %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)