; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW

declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <32 x i16> @llvm.fshr.v32i16(<32 x i16>, <32 x i16>, <32 x i16>)
declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)

;
; Variable Shifts
;

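; When both value operands of fshr are the same register the funnel shift is a
; rotate-right, so for i32/i64 elements AVX512F can select the native variable
; rotates vprorvd/vprorvq directly.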
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprorvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %amt)
  ret <8 x i64> %res
}

define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: var_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %amt)
  ret <16 x i32> %res
}

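; No variable 16-bit shifts exist without AVX512BW, so the AVX512F/AVX512VL
; lowering splits into 256-bit halves, zero-extends to i32, performs the
; vpsllvd/vpsrlvd pair, and truncates back with vpmovdw. With AVX512BW the
; rotate is a vpsrlvw/vpsllvw pair on the full zmm register.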
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpsllvd %zmm6, %zmm2, %zmm6
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %ymm3, %ymm7, %ymm3
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
; AVX512F-NEXT: vpord %zmm2, %zmm6, %zmm2
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
; AVX512F-NEXT: vpsubw %ymm1, %ymm7, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpsllvd %zmm6, %zmm2, %zmm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %ymm3, %ymm7, %ymm3
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
; AVX512VL-NEXT: vpord %zmm2, %zmm6, %zmm2
; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
; AVX512VL-NEXT: vpsubw %ymm1, %ymm7, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpsubw %zmm1, %zmm4, %zmm1
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm4, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
  ret <32 x i16> %res
}

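; Bytes have no native shifts at all: the variable i8 rotate is built from
; fixed 4/2/1-bit partial rotates, selected per byte by moving the shift amount
; into the sign bits with vpsllw $5 and blending with vpblendvb (or vpmovb2m
; masks under AVX512BW).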
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpandn %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm9
; AVX512F-NEXT: vpand %ymm8, %ymm9, %ymm9
; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm9, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm10
; AVX512F-NEXT: vpor %ymm4, %ymm10, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512F-NEXT: vpandn %ymm3, %ymm8, %ymm3
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4
; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm9, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpsubb %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm5
; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm8
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm8
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm8, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm10
; AVX512VL-NEXT: vpor %ymm5, %ymm10, %ymm5
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5
; AVX512VL-NEXT: vpsubb %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT: vpand %ymm7, %ymm1, %ymm1
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm9, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm8, %ymm3, %ymm3
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT: vpsllw $5, %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vpmovb2m %zmm3, %k2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
; AVX512BW-NEXT: vpsrlw $2, %zmm3, %zmm5
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512BW-NEXT: vpsrlw $1, %zmm3, %zmm5
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpsubb %zmm1, %zmm4, %zmm1
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512VLBW-NEXT: vpsllw $5, %zmm3, %zmm3
; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k2
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
; AVX512VLBW-NEXT: vpsrlw $2, %zmm3, %zmm5
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpsrlw $1, %zmm3, %zmm5
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm4, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
  ret <64 x i8> %res
}

;
; Uniform Variable Shifts
;

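; For a splatted variable amount the i64/i32 cases broadcast the scalar and
; still use vprorvq/vprorvd; the narrower element types can instead use the
; uniform vpsllw/vpsrlw forms that take a single shift count in an xmm register.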
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq %xmm1, %zmm1
; AVX512-NEXT: vprorvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <8 x i64> %amt, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> %splat)
  ret <8 x i64> %res
}

define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind {
; AVX512-LABEL: splatvar_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd %xmm1, %zmm1
; AVX512-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %splat = shufflevector <16 x i32> %amt, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> %splat)
  ret <16 x i32> %res
}

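; The splatted i16 amount is negated and masked to 4 bits, then zero-extended
; with vpmovzxwq so vpsllw/vpsrlw can consume it as a scalar count, turning the
; rotate into a shift-left by (16-n), a shift-right by n, and an OR.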
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpsubw %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm4, %xmm1
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
  %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
  ret <32 x i16> %res
}

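; Uniform byte rotates reuse the word-sized vpsllw/vpsrlw with a scalar count
; and then mask off the bits that crossed byte boundaries; each mask is made by
; shifting an all-ones vector (from vpcmpeqd) by the same count and
; broadcasting the resulting byte.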
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllw %xmm1, %xmm5, %xmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm5, %xmm1
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
  ret <64 x i8> %res
}

;
; Constant Shifts
;

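; Fully constant amounts fold into the rotate itself: the i64/i32 cases keep
; the variable vprorvq/vprorvd but take the amounts straight from the constant
; pool as a memory operand.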
define <8 x i64> @constant_funnnel_v8i64(<8 x i64> %x) nounwind {
; AVX512-LABEL: constant_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprorvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>)
  ret <8 x i64> %res
}

define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x) nounwind {
; AVX512-LABEL: constant_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprorvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
  ret <16 x i32> %res
}

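; Constant i16 rotates become multiplies: vpmulhuw by 2^(16-n) is the logical
; right shift and vpmullw is the left shift, with vpblendw/vpblendd patching
; the lanes whose amount is 0 (a multiply by 2^16 is not representable).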
define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
  ret <32 x i16> %res
}

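; Constant byte rotates combine two expansions: the shift-left half reuses the
; vpblendvb/vpmovb2m bit ladder, while the shift-right half unpacks bytes to
; words, shifts each lane by multiply or vpsllvw, and repacks via vpsrlw $8 and
; vpackuswb before the final OR.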
define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
  ret <64 x i8> %res
}

;
; Uniform Constant Shifts
;

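; A single constant amount is the cheapest case: i64/i32 use the immediate
; rotates vprorq/vprord, while i16/i8 become a fixed shift pair merged with
; vpor, or a single vpternlogq where available.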
define <8 x i64> @splatconstant_funnnel_v8i64(<8 x i64> %x) nounwind {
; AVX512-LABEL: splatconstant_funnnel_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vprorq $14, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %x, <8 x i64> <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>)
  ret <8 x i64> %res
}

define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind {
; AVX512-LABEL: splatconstant_funnnel_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vprord $4, %zmm0, %zmm0
; AVX512-NEXT: retq
  %res = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %x, <16 x i32> %x, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
  ret <16 x i32> %res
}

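; A uniform i16 rotate right by 7 is vpsrlw $7 plus vpsllw $9 (i.e. 16-7) and
; an OR; without AVX512BW the 512-bit vector is first split into ymm halves.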
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $9, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $9, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm1
; AVX512BW-NEXT: vpsllw $9, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $7, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsllw $9, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <32 x i16> %res
}

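; Rotating bytes by 4 swaps nibbles: shift left and right by 4, mask against
; 0xF0, and merge. AVX512VL and AVX512BW fold the masked combine into one
; vpternlogq.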
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <64 x i8> %res
}