1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; Funnel-shift-right intrinsics for the four 512-bit integer vector types used
; by the tests visible below. Those tests pass %x as both data operands, which
; turns the funnel shift into a rotate right by the third operand.
7 declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
8 declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
9 declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
10 declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
; Rotate right of every i64 lane of %x by the matching lane of %amt, expressed
; as fshr(%x, %x, %amt). The checks use the common AVX512 prefix, so all four
; RUN configurations are expected to emit a single vprorvq.
16 define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
17 ; AVX512-LABEL: var_funnnel_v8i64:
19 ; AVX512-NEXT: vprorvq %zmm1, %zmm0, %zmm0
21 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %amt)
; Rotate right of every i32 lane of %x by the matching lane of %amt, expressed
; as fshr(%x, %x, %amt). The checks use the common AVX512 prefix, so all four
; RUN configurations are expected to emit a single vprorvd.
25 define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
26 ; AVX512-LABEL: var_funnnel_v16i32:
28 ; AVX512-NEXT: vprorvd %zmm1, %zmm0, %zmm0
30 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %amt)
; Rotate right of every i16 lane of %x by the matching lane of %amt, expressed
; as fshr(%x, %x, %amt).
;
; Expected lowering per check prefix:
;  * AVX512F and AVX512VL (no BW): both inputs are split into 256-bit halves.
;    For each half the amount is negated and masked with 15, words are
;    zero-extended to dwords, vpsllvd by that amount and vpsrlvd by
;    (16 - amount) are combined with vpord, and vpmovdw truncates the result
;    back to words before the halves are re-joined with vinserti64x4.
;  * AVX512BW and AVX512VLBW: the full zmm register is shifted right with
;    vpsrlvw by (amt & 15) and left with vpsllvw by ((0 - amt) & 15), and the
;    two results are ORed.
34 define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
35 ; AVX512F-LABEL: var_funnnel_v32i16:
37 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
38 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
39 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
40 ; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm3
41 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
42 ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
43 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
44 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
45 ; AVX512F-NEXT: vpsllvd %zmm6, %zmm2, %zmm6
46 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
47 ; AVX512F-NEXT: vpsubw %ymm3, %ymm7, %ymm3
48 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
49 ; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
50 ; AVX512F-NEXT: vpord %zmm2, %zmm6, %zmm2
51 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
52 ; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1
53 ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
54 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
55 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
56 ; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
57 ; AVX512F-NEXT: vpsubw %ymm1, %ymm7, %ymm1
58 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
59 ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
60 ; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
61 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
62 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
65 ; AVX512VL-LABEL: var_funnnel_v32i16:
67 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
68 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
69 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
70 ; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm3
71 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
72 ; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
73 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
74 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
75 ; AVX512VL-NEXT: vpsllvd %zmm6, %zmm2, %zmm6
76 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
77 ; AVX512VL-NEXT: vpsubw %ymm3, %ymm7, %ymm3
78 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
79 ; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
80 ; AVX512VL-NEXT: vpord %zmm2, %zmm6, %zmm2
81 ; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
82 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1
83 ; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
84 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
85 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
86 ; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
87 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm7, %ymm1
88 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
89 ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
90 ; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
91 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
92 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
95 ; AVX512BW-LABEL: var_funnnel_v32i16:
97 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
98 ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
99 ; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
100 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
101 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm4, %zmm1
102 ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
103 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
104 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
105 ; AVX512BW-NEXT: retq
107 ; AVX512VLBW-LABEL: var_funnnel_v32i16:
108 ; AVX512VLBW: # %bb.0:
109 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
110 ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm3
111 ; AVX512VLBW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
112 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
113 ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm4, %zmm1
114 ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
115 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
116 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
117 ; AVX512VLBW-NEXT: retq
118 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
; Rotate right of every i8 lane of %x by the matching lane of %amt, expressed
; as fshr(%x, %x, %amt).
;
; Expected lowering per check prefix:
;  * AVX512F and AVX512VL (no BW): both inputs are split into 256-bit halves.
;    For each half the amount is negated, masked with 7 and shifted left by 5;
;    the value is then conditionally rotated left by 4, 2 and 1 bits, each
;    stage built from word shifts plus byte masks (or vpaddb for the 1-bit
;    left shift) and selected with vpblendvb, with vpaddb doubling the
;    selector between stages.
;  * AVX512BW and AVX512VLBW: the full zmm register is processed. A
;    right-shift chain (4, 2, 1) driven by (amt & 7) and a left-shift chain
;    (4, 2, 1) driven by ((0 - amt) & 7) are selected through mask registers
;    produced by vpmovb2m, and the two chains are ORed.
122 define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
123 ; AVX512F-LABEL: var_funnnel_v64i8:
125 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
126 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
127 ; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
128 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
129 ; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
130 ; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
131 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
132 ; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
133 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
134 ; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
135 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
136 ; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
137 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
138 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
139 ; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
140 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
141 ; AVX512F-NEXT: vpandn %ymm4, %ymm8, %ymm4
142 ; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm9
143 ; AVX512F-NEXT: vpand %ymm8, %ymm9, %ymm9
144 ; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4
145 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
146 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
147 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
148 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
149 ; AVX512F-NEXT: vpand %ymm9, %ymm4, %ymm4
150 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm10
151 ; AVX512F-NEXT: vpor %ymm4, %ymm10, %ymm4
152 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
153 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
154 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
155 ; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
156 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
157 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
158 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
159 ; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
160 ; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1
161 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
162 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
163 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
164 ; AVX512F-NEXT: vpandn %ymm3, %ymm8, %ymm3
165 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
166 ; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4
167 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
168 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
169 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
170 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
171 ; AVX512F-NEXT: vpand %ymm9, %ymm3, %ymm3
172 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
173 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
174 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
175 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
176 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
179 ; AVX512VL-LABEL: var_funnnel_v64i8:
181 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
182 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
183 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
184 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
185 ; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4
186 ; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6
187 ; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6
188 ; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
189 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
190 ; AVX512VL-NEXT: vpsubb %ymm2, %ymm6, %ymm2
191 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
192 ; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
193 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
194 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
195 ; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4
196 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
197 ; AVX512VL-NEXT: vpandn %ymm4, %ymm8, %ymm4
198 ; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm9
199 ; AVX512VL-NEXT: vpand %ymm8, %ymm9, %ymm9
200 ; AVX512VL-NEXT: vpor %ymm4, %ymm9, %ymm4
201 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
202 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
203 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4
204 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
205 ; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4
206 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm10
207 ; AVX512VL-NEXT: vpor %ymm4, %ymm10, %ymm4
208 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
209 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
210 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
211 ; AVX512VL-NEXT: vpandn %ymm3, %ymm5, %ymm3
212 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
213 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
214 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
215 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm6, %ymm1
216 ; AVX512VL-NEXT: vpand %ymm7, %ymm1, %ymm1
217 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
218 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
219 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
220 ; AVX512VL-NEXT: vpandn %ymm3, %ymm8, %ymm3
221 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
222 ; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4
223 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
224 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
225 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
226 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
227 ; AVX512VL-NEXT: vpand %ymm9, %ymm3, %ymm3
228 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
229 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
230 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
231 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
232 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
233 ; AVX512VL-NEXT: retq
235 ; AVX512BW-LABEL: var_funnnel_v64i8:
237 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
238 ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
239 ; AVX512BW-NEXT: vpsllw $5, %zmm3, %zmm3
240 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
241 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
242 ; AVX512BW-NEXT: vpmovb2m %zmm3, %k2
243 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
244 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
245 ; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
246 ; AVX512BW-NEXT: vpsrlw $2, %zmm3, %zmm5
247 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
248 ; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
249 ; AVX512BW-NEXT: vpsrlw $1, %zmm3, %zmm5
250 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
251 ; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
252 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
253 ; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
254 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
255 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm4, %zmm1
256 ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
257 ; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
258 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
259 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
260 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
261 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
262 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
263 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
264 ; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm1
265 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
266 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
267 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm1
268 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
269 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
270 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
271 ; AVX512BW-NEXT: retq
273 ; AVX512VLBW-LABEL: var_funnnel_v64i8:
274 ; AVX512VLBW: # %bb.0:
275 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
276 ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm3
277 ; AVX512VLBW-NEXT: vpsllw $5, %zmm3, %zmm3
278 ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
279 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
280 ; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k2
281 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3
282 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
283 ; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
284 ; AVX512VLBW-NEXT: vpsrlw $2, %zmm3, %zmm5
285 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
286 ; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
287 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm3, %zmm5
288 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
289 ; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
290 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
291 ; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
292 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
293 ; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm4, %zmm1
294 ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
295 ; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
296 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
297 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
298 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
299 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
300 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
301 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
302 ; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm1
303 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
304 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
305 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm1
306 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
307 ; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
308 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
309 ; AVX512VLBW-NEXT: retq
310 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
315 ; Uniform Variable Shifts
; Rotate right of every i64 lane of %x by one shared amount: element 0 of %amt
; is splatted to all lanes with a zeroinitializer shuffle mask and passed to
; fshr(%x, %x, %splat). All four RUN configurations share the AVX512 prefix and
; expect a vpbroadcastq of the amount followed by vprorvq.
318 define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
319 ; AVX512-LABEL: splatvar_funnnel_v8i64:
321 ; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
322 ; AVX512-NEXT: vprorvq %zmm1, %zmm0, %zmm0
324 %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
325 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %splat)
; Rotate right of every i32 lane of %x by one shared amount: element 0 of %amt
; is splatted to all lanes with a zeroinitializer shuffle mask and passed to
; fshr(%x, %x, %splat). All four RUN configurations share the AVX512 prefix and
; expect a vpbroadcastd of the amount followed by vprorvd.
329 define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
330 ; AVX512-LABEL: splatvar_funnnel_v16i32:
332 ; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
333 ; AVX512-NEXT: vprorvd %zmm1, %zmm0, %zmm0
335 %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
336 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %splat)
; Rotate right of every i16 lane of %x by one shared amount: element 0 of %amt
; is splatted to all lanes and passed to fshr(%x, %x, %splat).
;
; Expected lowering per check prefix:
;  * AVX512F and AVX512VL (no BW): the amount word is broadcast, negated and
;    masked with a constant loaded from memory (its value is not spelled out
;    in these checks). vpmovzxwq widens it into the xmm shift-count operand;
;    each 256-bit half is shifted left by that count and right by
;    (16 - count) with vpsllw/vpsrlw, and the two results are ORed.
;  * AVX512BW and AVX512VLBW: the full zmm register is shifted right by
;    (amt & 15) and left by ((0 - amt) & 15) with vpsrlw/vpsllw taking an xmm
;    count, and the two results are ORed.
340 define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
341 ; AVX512F-LABEL: splatvar_funnnel_v32i16:
343 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
344 ; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
345 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
346 ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
347 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
348 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
349 ; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
350 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
351 ; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
352 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
353 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
354 ; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
355 ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
356 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
357 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
358 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
361 ; AVX512VL-LABEL: splatvar_funnnel_v32i16:
363 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
364 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
365 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
366 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
367 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
368 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
369 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
370 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
371 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
372 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
373 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
374 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
375 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
376 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
377 ; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
378 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
379 ; AVX512VL-NEXT: retq
381 ; AVX512BW-LABEL: splatvar_funnnel_v32i16:
383 ; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1
384 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
385 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
386 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
387 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
388 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
389 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm4, %xmm1
390 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
391 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
392 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
393 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
394 ; AVX512BW-NEXT: retq
396 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
397 ; AVX512VLBW: # %bb.0:
398 ; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1
399 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
400 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
401 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
402 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
403 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
404 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm4, %xmm1
405 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
406 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
407 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
408 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
409 ; AVX512VLBW-NEXT: retq
410 %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
411 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
; Rotate right of every i8 lane of %x by one shared amount: element 0 of %amt
; is splatted to all lanes and passed to fshr(%x, %x, %splat).
;
; The expected code uses word-granularity shifts (vpsllw/vpsrlw with an xmm
; count) for every prefix. Each shifted result is ANDed with a byte mask that
; is built by applying the same shift to an all-ones register (vpcmpeqd) and
; broadcasting one byte of it -- presumably to drop bits that moved across a
; byte boundary inside each word.
;
; Expected lowering per check prefix:
;  * AVX512F and AVX512VL (no BW): the amount byte is broadcast, negated and
;    masked with a constant loaded from memory; each 256-bit half is shifted
;    left by that count and right by (8 - count), masked, and ORed.
;  * AVX512BW and AVX512VLBW: the full zmm register is shifted right by
;    (amt & 7) and left by ((0 - amt) & 7), masked, and ORed.
415 define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
416 ; AVX512F-LABEL: splatvar_funnnel_v64i8:
418 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
419 ; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
420 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
421 ; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
422 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
423 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
424 ; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
425 ; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
426 ; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
427 ; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
428 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
429 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
430 ; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
431 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
432 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
433 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
434 ; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
435 ; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
436 ; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
437 ; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
438 ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
439 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
440 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
441 ; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
442 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
443 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
446 ; AVX512VL-LABEL: splatvar_funnnel_v64i8:
448 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
449 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
450 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
451 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
452 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
453 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
454 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
455 ; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
456 ; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
457 ; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
458 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
459 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
460 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
461 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
462 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
463 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
464 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
465 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
466 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
467 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
468 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
469 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
470 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
471 ; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
472 ; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
473 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
474 ; AVX512VL-NEXT: retq
476 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
478 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
479 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
480 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
481 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
482 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
483 ; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
484 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
485 ; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3
486 ; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
487 ; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
488 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
489 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
490 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
491 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
492 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
493 ; AVX512BW-NEXT: vpsllw %xmm1, %xmm5, %xmm1
494 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
495 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
496 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
497 ; AVX512BW-NEXT: retq
499 ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
500 ; AVX512VLBW: # %bb.0:
501 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
502 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
503 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
504 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
505 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
506 ; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
507 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
508 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3
509 ; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
510 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
511 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
512 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
513 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
514 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
515 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
516 ; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm5, %xmm1
517 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
518 ; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
519 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
520 ; AVX512VLBW-NEXT: retq
521 %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
522 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
; Rotate right of every i64 lane of %x by a per-lane constant amount
; (4, 14, 50, 60 repeated twice), expressed as fshr(%x, %x, <constants>).
; All four RUN configurations share the AVX512 prefix and expect a single
; vprorvq whose amount operand is a RIP-relative memory load.
530 define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x) nounwind {
531 ; AVX512-LABEL: constant_funnnel_v8i64:
533 ; AVX512-NEXT: vprorvq {{.*}}(%rip), %zmm0, %zmm0
535 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
; Rotate right of every i32 lane of %x by a per-lane constant amount
; (4 through 11, repeated twice), expressed as fshr(%x, %x, <constants>).
; All four RUN configurations share the AVX512 prefix and expect a single
; vprorvd whose amount operand is a RIP-relative memory load.
539 define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x) nounwind {
540 ; AVX512-LABEL: constant_funnnel_v16i32:
542 ; AVX512-NEXT: vprorvd {{.*}}(%rip), %zmm0, %zmm0
544 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
; Rotate right of every i16 lane of %x by a per-lane constant amount
; (0 through 15, repeated twice), expressed as fshr(%x, %x, <constants>).
;
; Expected lowering per check prefix:
;  * AVX512F and AVX512VL (no BW): the input is split into 256-bit halves and
;    the shifts are done as multiplies by power-of-two tables -- vpmulhuw for
;    one direction and vpmullw for the other -- then ORed. Element 0 of the
;    vpmulhuw table is undefined (shown as u), so that element is taken from
;    the unshifted input through the vpblendw/vpblendd pair.
;  * AVX512BW and AVX512VLBW: vpsrlvw and vpsllvw with RIP-relative constant
;    operands on the full zmm register, followed by an OR.
548 define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
549 ; AVX512F-LABEL: constant_funnnel_v32i16:
551 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
552 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
553 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
554 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
555 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
556 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
557 ; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
558 ; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
559 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
560 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
561 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
562 ; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
563 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
564 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
567 ; AVX512VL-LABEL: constant_funnnel_v32i16:
569 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
570 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
571 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
572 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
573 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
574 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
575 ; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1
576 ; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
577 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
578 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
579 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
580 ; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
581 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
582 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
583 ; AVX512VL-NEXT: retq
585 ; AVX512BW-LABEL: constant_funnnel_v32i16:
587 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
588 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
589 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
590 ; AVX512BW-NEXT: retq
592 ; AVX512VLBW-LABEL: constant_funnnel_v32i16:
593 ; AVX512VLBW: # %bb.0:
594 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
595 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
596 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
597 ; AVX512VLBW-NEXT: retq
598 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
; Per-element constant rotate of <64 x i8>: llvm.fshr.v64i8 is called with %x
; as both data operands, which LangRef defines as a rotate right. The amount
; vector is the 16-byte pattern 0,1,...,7,8,7,...,1 repeated four times; the
; amount 8 equals the element width (LangRef takes the amount modulo the width).
; The AVX512F and AVX512VL checks work on 256-bit halves (vextracti64x4 /
; vinserti64x4), while the AVX512BW and AVX512VLBW checks stay in zmm registers
; and select per byte through k-mask registers (vpmovb2m).
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
602 define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
603 ; AVX512F-LABEL: constant_funnnel_v64i8:
605 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
606 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
607 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
608 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
609 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
610 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
611 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
612 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
613 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
614 ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
615 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
616 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
617 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
618 ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
619 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
620 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
621 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
622 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
623 ; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
624 ; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
625 ; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
626 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
627 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
628 ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
629 ; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
630 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
631 ; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
632 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
633 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
634 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
635 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
636 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
637 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
638 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
639 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
640 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
641 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
642 ; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3
643 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
644 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
645 ; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
646 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
647 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
648 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
649 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
652 ; AVX512VL-LABEL: constant_funnnel_v64i8:
654 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
655 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
656 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
657 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
658 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
659 ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
660 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
661 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
662 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
663 ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
664 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
665 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
666 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
667 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
668 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
669 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
670 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
671 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
672 ; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1]
673 ; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5
674 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
675 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
676 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
677 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
678 ; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
679 ; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
680 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
681 ; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
682 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
683 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
684 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
685 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
686 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
687 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
688 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
689 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
690 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
691 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
692 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
693 ; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3
694 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
695 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
696 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
697 ; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
698 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
699 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
700 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
701 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
702 ; AVX512VL-NEXT: retq
704 ; AVX512BW-LABEL: constant_funnnel_v64i8:
706 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
707 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
708 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
709 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
710 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
711 ; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
712 ; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3
713 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
714 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
715 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
716 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
717 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
718 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
719 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
720 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
721 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
722 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
723 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
724 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
725 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
726 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
727 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
728 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
729 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
730 ; AVX512BW-NEXT: retq
732 ; AVX512VLBW-LABEL: constant_funnnel_v64i8:
733 ; AVX512VLBW: # %bb.0:
734 ; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
735 ; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
736 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
737 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
738 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
739 ; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
740 ; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3
741 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
742 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
743 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
744 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
745 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
746 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
747 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
748 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
749 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
750 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
751 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
752 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
753 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
754 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
755 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
756 ; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
757 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
758 ; AVX512VLBW-NEXT: retq
759 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
764 ; Uniform Constant Shifts
; Splat-constant rotate of <8 x i64>: llvm.fshr.v8i64 is called with %x as both
; data operands (a rotate right per LangRef) and the amount 14 in every lane.
; All four RUN configurations share the AVX512 prefix here and expect a single
; immediate rotate, vprorq $14.
767 define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x) nounwind {
768 ; AVX512-LABEL: splatconstant_funnnel_v8i64:
770 ; AVX512-NEXT: vprorq $14, %zmm0, %zmm0
772 %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
; Splat-constant rotate of <16 x i32>: llvm.fshr.v16i32 is called with %x as
; both data operands (a rotate right per LangRef) and the amount 4 in every
; lane. All four RUN configurations share the AVX512 prefix here and expect a
; single immediate rotate, vprord $4.
776 define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind {
777 ; AVX512-LABEL: splatconstant_funnnel_v16i32:
779 ; AVX512-NEXT: vprord $4, %zmm0, %zmm0
781 %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
; Splat-constant rotate of <32 x i16>: llvm.fshr.v32i16 is called with %x as
; both data operands (a rotate right per LangRef) and the amount 7 in every
; lane. Every configuration is checked for the shift pair vpsrlw $7 / vpsllw $9
; (7 + 9 = 16, the element width) combined with an OR. The AVX512F and AVX512VL
; checks do this on two 256-bit halves (vextracti64x4 / vinserti64x4); the
; AVX512BW and AVX512VLBW checks do it once on the full zmm register.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
785 define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
786 ; AVX512F-LABEL: splatconstant_funnnel_v32i16:
788 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
789 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
790 ; AVX512F-NEXT: vpsllw $9, %ymm1, %ymm1
791 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
792 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
793 ; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0
794 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
795 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
798 ; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
800 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
801 ; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
802 ; AVX512VL-NEXT: vpsllw $9, %ymm1, %ymm1
803 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
804 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
805 ; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0
806 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
807 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
808 ; AVX512VL-NEXT: retq
810 ; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
812 ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm1
813 ; AVX512BW-NEXT: vpsllw $9, %zmm0, %zmm0
814 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
815 ; AVX512BW-NEXT: retq
817 ; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
818 ; AVX512VLBW: # %bb.0:
819 ; AVX512VLBW-NEXT: vpsrlw $7, %zmm0, %zmm1
820 ; AVX512VLBW-NEXT: vpsllw $9, %zmm0, %zmm0
821 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
822 ; AVX512VLBW-NEXT: retq
823 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
827 define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
828 ; AVX512F-LABEL: splatconstant_funnnel_v64i8:
830 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
831 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
832 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
833 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
834 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
835 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
836 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
837 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
838 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
839 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
840 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
841 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
842 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
845 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
847 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
848 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
849 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
850 ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
851 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
852 ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
853 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
854 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
855 ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
856 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
857 ; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
858 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
859 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
860 ; AVX512VL-NEXT: retq
862 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
864 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
865 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
866 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm0
867 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
868 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
869 ; AVX512BW-NEXT: retq
871 ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
872 ; AVX512VLBW: # %bb.0:
873 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
874 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
875 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm0
876 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
877 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
878 ; AVX512VLBW-NEXT: retq
879 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)