1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+gfni | FileCheck %s --check-prefixes=GFNISSE
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX1OR2,GFNIAVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX1OR2,GFNIAVX2
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX512,GFNIAVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX512,GFNIAVX512BW
9 ; 128 Bit Vector Funnel Shifts
; Funnel shift left on <16 x i8> operands %a and %b where each lane takes its
; own shift amount from the matching lane of %amt (see the @llvm.fshl.v16i8
; call at the end of the body). The assertions are autogenerated; regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
12 define <16 x i8> @var_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
13 ; GFNISSE-LABEL: var_fshl_v16i8:
15 ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
16 ; GFNISSE-NEXT: pxor %xmm3, %xmm3
17 ; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
18 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
19 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
20 ; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
21 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
22 ; GFNISSE-NEXT: pslld $23, %xmm2
23 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
24 ; GFNISSE-NEXT: paddd %xmm6, %xmm2
25 ; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm2
26 ; GFNISSE-NEXT: pslld $23, %xmm3
27 ; GFNISSE-NEXT: paddd %xmm6, %xmm3
28 ; GFNISSE-NEXT: cvttps2dq %xmm3, %xmm3
29 ; GFNISSE-NEXT: packusdw %xmm2, %xmm3
30 ; GFNISSE-NEXT: movdqa %xmm1, %xmm7
31 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
32 ; GFNISSE-NEXT: pmullw %xmm3, %xmm7
33 ; GFNISSE-NEXT: psrlw $8, %xmm7
34 ; GFNISSE-NEXT: pslld $23, %xmm4
35 ; GFNISSE-NEXT: paddd %xmm6, %xmm4
36 ; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm2
37 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
38 ; GFNISSE-NEXT: pslld $23, %xmm5
39 ; GFNISSE-NEXT: paddd %xmm6, %xmm5
40 ; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm3
41 ; GFNISSE-NEXT: packusdw %xmm3, %xmm2
42 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
43 ; GFNISSE-NEXT: pmullw %xmm1, %xmm2
44 ; GFNISSE-NEXT: psrlw $8, %xmm2
45 ; GFNISSE-NEXT: packuswb %xmm7, %xmm2
46 ; GFNISSE-NEXT: movdqa %xmm2, %xmm0
49 ; GFNIAVX1-LABEL: var_fshl_v16i8:
51 ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
52 ; GFNIAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
53 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
54 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7]
55 ; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4
56 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
57 ; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
58 ; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4
59 ; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
60 ; GFNIAVX1-NEXT: vpslld $23, %xmm3, %xmm3
61 ; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
62 ; GFNIAVX1-NEXT: vcvttps2dq %xmm3, %xmm3
63 ; GFNIAVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
64 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
65 ; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
66 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
67 ; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
68 ; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4
69 ; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
70 ; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4
71 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
72 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
73 ; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2
74 ; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
75 ; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2
76 ; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
77 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
78 ; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
79 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
80 ; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
83 ; GFNIAVX2-LABEL: var_fshl_v16i8:
85 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
86 ; GFNIAVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
87 ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
88 ; GFNIAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
89 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
90 ; GFNIAVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
91 ; GFNIAVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm3
92 ; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
93 ; GFNIAVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
94 ; GFNIAVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
95 ; GFNIAVX2-NEXT: vpsrlw $8, %xmm3, %xmm3
96 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
97 ; GFNIAVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
98 ; GFNIAVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
99 ; GFNIAVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
100 ; GFNIAVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
101 ; GFNIAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
102 ; GFNIAVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
103 ; GFNIAVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
104 ; GFNIAVX2-NEXT: vzeroupper
105 ; GFNIAVX2-NEXT: retq
107 ; GFNIAVX512VL-LABEL: var_fshl_v16i8:
108 ; GFNIAVX512VL: # %bb.0:
109 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
110 ; GFNIAVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
111 ; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
112 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
113 ; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
114 ; GFNIAVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
115 ; GFNIAVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
116 ; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
117 ; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
118 ; GFNIAVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
119 ; GFNIAVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
120 ; GFNIAVX512VL-NEXT: vpmovdb %zmm0, %xmm0
121 ; GFNIAVX512VL-NEXT: vzeroupper
122 ; GFNIAVX512VL-NEXT: retq
124 ; GFNIAVX512BW-LABEL: var_fshl_v16i8:
125 ; GFNIAVX512BW: # %bb.0:
126 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
127 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
128 ; GFNIAVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
129 ; GFNIAVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
130 ; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
131 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
132 ; GFNIAVX512BW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
133 ; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
134 ; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0
135 ; GFNIAVX512BW-NEXT: vzeroupper
136 ; GFNIAVX512BW-NEXT: retq
; Operation under test: funnel shift left with a per-lane variable amount.
137 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt)
; Funnel shift right on <16 x i8> operands %a and %b where each lane takes its
; own shift amount from the matching lane of %amt (see the @llvm.fshr.v16i8
; call at the end of the body). The assertions are autogenerated; regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
141 define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
142 ; GFNISSE-LABEL: var_fshr_v16i8:
144 ; GFNISSE-NEXT: movdqa %xmm2, %xmm3
145 ; GFNISSE-NEXT: movdqa %xmm0, %xmm2
146 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
147 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0
148 ; GFNISSE-NEXT: pand %xmm5, %xmm0
149 ; GFNISSE-NEXT: psllw $5, %xmm0
150 ; GFNISSE-NEXT: movdqa %xmm0, %xmm4
151 ; GFNISSE-NEXT: paddb %xmm0, %xmm4
152 ; GFNISSE-NEXT: movdqa %xmm1, %xmm6
153 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
154 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
155 ; GFNISSE-NEXT: movdqa %xmm1, %xmm6
156 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
157 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0
158 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
159 ; GFNISSE-NEXT: movdqa %xmm1, %xmm6
160 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
161 ; GFNISSE-NEXT: paddb %xmm4, %xmm4
162 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0
163 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
164 ; GFNISSE-NEXT: pandn %xmm5, %xmm3
165 ; GFNISSE-NEXT: psllw $5, %xmm3
166 ; GFNISSE-NEXT: movdqa %xmm3, %xmm4
167 ; GFNISSE-NEXT: paddb %xmm3, %xmm4
168 ; GFNISSE-NEXT: paddb %xmm2, %xmm2
169 ; GFNISSE-NEXT: movdqa %xmm2, %xmm5
170 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
171 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0
172 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2
173 ; GFNISSE-NEXT: movdqa %xmm2, %xmm3
174 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
175 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0
176 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
177 ; GFNISSE-NEXT: movdqa %xmm2, %xmm3
178 ; GFNISSE-NEXT: paddb %xmm2, %xmm3
179 ; GFNISSE-NEXT: paddb %xmm4, %xmm4
180 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0
181 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
182 ; GFNISSE-NEXT: por %xmm1, %xmm2
183 ; GFNISSE-NEXT: movdqa %xmm2, %xmm0
186 ; GFNIAVX1-LABEL: var_fshr_v16i8:
188 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
189 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
190 ; GFNIAVX1-NEXT: vpsllw $5, %xmm4, %xmm4
191 ; GFNIAVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
192 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm6
193 ; GFNIAVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
194 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
195 ; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
196 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
197 ; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
198 ; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
199 ; GFNIAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
200 ; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
201 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
202 ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
203 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
204 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
205 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
206 ; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
207 ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
208 ; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
209 ; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
210 ; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
211 ; GFNIAVX1-NEXT: retq
213 ; GFNIAVX2-LABEL: var_fshr_v16i8:
215 ; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
216 ; GFNIAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
217 ; GFNIAVX2-NEXT: vpsllw $5, %xmm4, %xmm4
218 ; GFNIAVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5
219 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm6
220 ; GFNIAVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
221 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
222 ; GFNIAVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
223 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
224 ; GFNIAVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
225 ; GFNIAVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
226 ; GFNIAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
227 ; GFNIAVX2-NEXT: vpsllw $5, %xmm2, %xmm2
228 ; GFNIAVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3
229 ; GFNIAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
230 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
231 ; GFNIAVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
232 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
233 ; GFNIAVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
234 ; GFNIAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
235 ; GFNIAVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
236 ; GFNIAVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
237 ; GFNIAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
238 ; GFNIAVX2-NEXT: retq
240 ; GFNIAVX512VL-LABEL: var_fshr_v16i8:
241 ; GFNIAVX512VL: # %bb.0:
242 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
243 ; GFNIAVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
244 ; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
245 ; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
246 ; GFNIAVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
247 ; GFNIAVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
248 ; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
249 ; GFNIAVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
250 ; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
251 ; GFNIAVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
252 ; GFNIAVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
253 ; GFNIAVX512VL-NEXT: vpmovdb %zmm0, %xmm0
254 ; GFNIAVX512VL-NEXT: vzeroupper
255 ; GFNIAVX512VL-NEXT: retq
257 ; GFNIAVX512BW-LABEL: var_fshr_v16i8:
258 ; GFNIAVX512BW: # %bb.0:
259 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
260 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
261 ; GFNIAVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
262 ; GFNIAVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
263 ; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
264 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
265 ; GFNIAVX512BW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
266 ; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0
267 ; GFNIAVX512BW-NEXT: vzeroupper
268 ; GFNIAVX512BW-NEXT: retq
; Operation under test: funnel shift right with a per-lane variable amount.
269 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt)
; Funnel shift left on <16 x i8> operands where all lanes share one variable
; amount: lane 0 of %amt is broadcast to all 16 lanes by the shufflevector with
; a zeroinitializer mask before the @llvm.fshl.v16i8 call.
; All four AVX RUN configurations share the common GFNIAVX assertions here.
273 define <16 x i8> @splatvar_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
274 ; GFNISSE-LABEL: splatvar_fshl_v16i8:
276 ; GFNISSE-NEXT: movdqa %xmm1, %xmm3
277 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
278 ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
279 ; GFNISSE-NEXT: psllw %xmm2, %xmm3
280 ; GFNISSE-NEXT: psrlw $8, %xmm3
281 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
282 ; GFNISSE-NEXT: psllw %xmm2, %xmm1
283 ; GFNISSE-NEXT: psrlw $8, %xmm1
284 ; GFNISSE-NEXT: packuswb %xmm3, %xmm1
285 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0
288 ; GFNIAVX-LABEL: splatvar_fshl_v16i8:
290 ; GFNIAVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
291 ; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
292 ; GFNIAVX-NEXT: vpsllw %xmm2, %xmm3, %xmm3
293 ; GFNIAVX-NEXT: vpsrlw $8, %xmm3, %xmm3
294 ; GFNIAVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
295 ; GFNIAVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0
296 ; GFNIAVX-NEXT: vpsrlw $8, %xmm0, %xmm0
297 ; GFNIAVX-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; Broadcast lane 0 of %amt to every lane, then funnel shift left by it.
299 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
300 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %splat)
; Funnel shift right on <16 x i8> operands where all lanes share one variable
; amount: lane 0 of %amt is broadcast to all 16 lanes by the shufflevector with
; a zeroinitializer mask before the @llvm.fshr.v16i8 call.
304 define <16 x i8> @splatvar_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
305 ; GFNISSE-LABEL: splatvar_fshr_v16i8:
307 ; GFNISSE-NEXT: movdqa %xmm1, %xmm4
308 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
309 ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
310 ; GFNISSE-NEXT: psrlw %xmm2, %xmm4
311 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
312 ; GFNISSE-NEXT: pand %xmm3, %xmm4
313 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
314 ; GFNISSE-NEXT: psrlw %xmm2, %xmm1
315 ; GFNISSE-NEXT: pand %xmm1, %xmm3
316 ; GFNISSE-NEXT: packuswb %xmm4, %xmm3
317 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0
320 ; GFNIAVX1-LABEL: splatvar_fshr_v16i8:
322 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
323 ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
324 ; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
325 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
326 ; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
327 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
328 ; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
329 ; GFNIAVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
330 ; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
331 ; GFNIAVX1-NEXT: retq
333 ; GFNIAVX2-LABEL: splatvar_fshr_v16i8:
335 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
336 ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
337 ; GFNIAVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
338 ; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
339 ; GFNIAVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
340 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
341 ; GFNIAVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
342 ; GFNIAVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
343 ; GFNIAVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
344 ; GFNIAVX2-NEXT: retq
346 ; GFNIAVX512VL-LABEL: splatvar_fshr_v16i8:
347 ; GFNIAVX512VL: # %bb.0:
348 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
349 ; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
350 ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
351 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
352 ; GFNIAVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3
353 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
354 ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
355 ; GFNIAVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm0
356 ; GFNIAVX512VL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
357 ; GFNIAVX512VL-NEXT: retq
359 ; GFNIAVX512BW-LABEL: splatvar_fshr_v16i8:
360 ; GFNIAVX512BW: # %bb.0:
361 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
362 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
363 ; GFNIAVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
364 ; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
365 ; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
366 ; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0
367 ; GFNIAVX512BW-NEXT: vzeroupper
368 ; GFNIAVX512BW-NEXT: retq
; Broadcast lane 0 of %amt to every lane, then funnel shift right by it.
369 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
370 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %splat)
; Funnel shift left on <16 x i8> operands by a non-uniform constant vector of
; amounts: 0,1,2,3,4,5,6,7 in the low eight lanes and 8,7,6,5,4,3,2,1 in the
; high eight lanes (note the amount 8, equal to the i8 element width).
374 define <16 x i8> @constant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
375 ; GFNISSE-LABEL: constant_fshl_v16i8:
377 ; GFNISSE-NEXT: movdqa %xmm1, %xmm2
378 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
379 ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,128,64,32,16,8,4,2]
380 ; GFNISSE-NEXT: psrlw $8, %xmm2
381 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
382 ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,2,4,8,16,32,64,128]
383 ; GFNISSE-NEXT: psrlw $8, %xmm1
384 ; GFNISSE-NEXT: packuswb %xmm2, %xmm1
385 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0
388 ; GFNIAVX1OR2-LABEL: constant_fshl_v16i8:
389 ; GFNIAVX1OR2: # %bb.0:
390 ; GFNIAVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
391 ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2]
392 ; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm2
393 ; GFNIAVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
394 ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
395 ; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0
396 ; GFNIAVX1OR2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
397 ; GFNIAVX1OR2-NEXT: retq
399 ; GFNIAVX512VL-LABEL: constant_fshl_v16i8:
400 ; GFNIAVX512VL: # %bb.0:
401 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
402 ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2]
403 ; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2
404 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
405 ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
406 ; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
407 ; GFNIAVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
408 ; GFNIAVX512VL-NEXT: retq
410 ; GFNIAVX512BW-LABEL: constant_fshl_v16i8:
411 ; GFNIAVX512BW: # %bb.0:
412 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
413 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
414 ; GFNIAVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
415 ; GFNIAVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
416 ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
417 ; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
418 ; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0
419 ; GFNIAVX512BW-NEXT: vzeroupper
420 ; GFNIAVX512BW-NEXT: retq
; Operation under test: funnel shift left by per-lane constant amounts.
421 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
; Judging by its name, this function is meant to cover a funnel shift RIGHT by
; non-uniform constant amounts.
; FIXME: the body calls @llvm.fshl.v16i8 (see the call at the end), not
; @llvm.fshr.v16i8, using the same amount vector as constant_fshl_v16i8. As a
; result the assertions below are identical to those of constant_fshl_v16i8
; and the constant funnel-shift-right lowering is not exercised at all.
; To fix: change the call to @llvm.fshr.v16i8 and regenerate the assertions
; with utils/update_llc_test_checks.py (do not edit them by hand).
425 define <16 x i8> @constant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
426 ; GFNISSE-LABEL: constant_fshr_v16i8:
428 ; GFNISSE-NEXT: movdqa %xmm1, %xmm2
429 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
430 ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,128,64,32,16,8,4,2]
431 ; GFNISSE-NEXT: psrlw $8, %xmm2
432 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
433 ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,2,4,8,16,32,64,128]
434 ; GFNISSE-NEXT: psrlw $8, %xmm1
435 ; GFNISSE-NEXT: packuswb %xmm2, %xmm1
436 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0
439 ; GFNIAVX1OR2-LABEL: constant_fshr_v16i8:
440 ; GFNIAVX1OR2: # %bb.0:
441 ; GFNIAVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
442 ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2]
443 ; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm2
444 ; GFNIAVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
445 ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
446 ; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0
447 ; GFNIAVX1OR2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
448 ; GFNIAVX1OR2-NEXT: retq
450 ; GFNIAVX512VL-LABEL: constant_fshr_v16i8:
451 ; GFNIAVX512VL: # %bb.0:
452 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
453 ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2]
454 ; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2
455 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
456 ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
457 ; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
458 ; GFNIAVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
459 ; GFNIAVX512VL-NEXT: retq
461 ; GFNIAVX512BW-LABEL: constant_fshr_v16i8:
462 ; GFNIAVX512BW: # %bb.0:
463 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
464 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
465 ; GFNIAVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
466 ; GFNIAVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
467 ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
468 ; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
469 ; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0
470 ; GFNIAVX512BW-NEXT: vzeroupper
471 ; GFNIAVX512BW-NEXT: retq
; FIXME: this calls the fshl intrinsic although the function is named fshr.
472 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
; Funnel-shift-left of <16 x i8> where every lane uses the same constant
; shift amount (3). Expected lowering per check prefix:
;   * GFNISSE and GFNIAVX1OR2 - one gf2p8affineqb per input register (the
;     affine matrices are loaded from the constant pool, so their values are
;     not visible here), combined with por/vpor.
;   * GFNIAVX512 - plain word shifts (vpsllw $3 on xmm0, vpsrlw $5 on xmm1)
;     merged by a single vpternlogd against a broadcast constant-pool mask;
;     no GFNI instruction is expected for this prefix.
; xmm0/xmm1 presumably carry %a/%b (first two vector arguments) - confirm
; against the target calling convention.
476 define <16 x i8> @splatconstant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
477 ; GFNISSE-LABEL: splatconstant_fshl_v16i8:
479 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
480 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
481 ; GFNISSE-NEXT: por %xmm1, %xmm0
484 ; GFNIAVX1OR2-LABEL: splatconstant_fshl_v16i8:
485 ; GFNIAVX1OR2: # %bb.0:
486 ; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
487 ; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
488 ; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
489 ; GFNIAVX1OR2-NEXT: retq
491 ; GFNIAVX512-LABEL: splatconstant_fshl_v16i8:
492 ; GFNIAVX512: # %bb.0:
493 ; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm2
494 ; GFNIAVX512-NEXT: vpsrlw $5, %xmm1, %xmm0
495 ; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
496 ; GFNIAVX512-NEXT: retq
; All 16 lanes of the shift-amount operand are the constant 3.
497 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
500 declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
; Funnel-shift-right of <16 x i8> where every lane uses the same constant
; shift amount (7). Expected lowering per check prefix:
;   * GFNISSE and GFNIAVX1OR2 - a single gf2p8affineqb on xmm1 (matrix comes
;     from the constant pool), a byte self-add of xmm0 (paddb/vpaddb, i.e. a
;     left shift by one), then por/vpor of the two parts.
;   * GFNIAVX512 - vpaddw (self-add) on xmm0 and vpsrlw $7 on xmm1, merged by
;     a single vpternlogd against a broadcast constant-pool mask; no GFNI
;     instruction is expected for this prefix.
; xmm0/xmm1 presumably carry %a/%b (first two vector arguments) - confirm
; against the target calling convention.
502 define <16 x i8> @splatconstant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
503 ; GFNISSE-LABEL: splatconstant_fshr_v16i8:
505 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
506 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
507 ; GFNISSE-NEXT: por %xmm1, %xmm0
510 ; GFNIAVX1OR2-LABEL: splatconstant_fshr_v16i8:
511 ; GFNIAVX1OR2: # %bb.0:
512 ; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
513 ; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
514 ; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
515 ; GFNIAVX1OR2-NEXT: retq
517 ; GFNIAVX512-LABEL: splatconstant_fshr_v16i8:
518 ; GFNIAVX512: # %bb.0:
519 ; GFNIAVX512-NEXT: vpaddw %xmm0, %xmm0, %xmm2
520 ; GFNIAVX512-NEXT: vpsrlw $7, %xmm1, %xmm0
521 ; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
522 ; GFNIAVX512-NEXT: retq
; All 16 lanes of the shift-amount operand are the constant 7.
523 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
526 declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
529 ; 256 Bit Vector Funnel Shifts
; Funnel-shift-left of <32 x i8> with a fully variable per-lane shift amount
; (%amt). Expected lowering per check prefix:
;   * GFNISSE - the 256-bit operands are processed as 128-bit xmm halves. The
;     amount is masked with a splat of 7, a per-lane multiplier is built with
;     the pslld $23 / paddd 1065353216 / cvttps2dq sequence (1065353216 is
;     0x3F800000, the bit pattern of 1.0f), the two inputs are byte-interleaved
;     with punpck{l,h}bw, multiplied with pmullw, and the high byte of each
;     word is recovered with psrlw $8 + packuswb. No gf2p8affineqb is expected.
;   * GFNIAVX1 - the same multiply-based scheme on 128-bit halves obtained with
;     vextractf128 and rejoined with vinsertf128.
;   * GFNIAVX2 - bytes are interleaved and widened to dwords, shifted with
;     vpsllvd, then narrowed again (vpsrld $16, vpackusdw, vpsrlw $8,
;     vpackuswb).
;   * GFNIAVX512VL - each input goes through a ladder of vpblendvb selects
;     between the value and a vgf2p8affineqb-transformed copy (or a vpaddb
;     self-add for the last step on ymm0), steered by the amount moved up with
;     vpsllw $5 and doubled with vpaddb between steps; one side uses
;     vpandn(amount, 7), the other vpand(amount, 7); the halves are OR'd.
;   * GFNIAVX512BW - both inputs are zero-extended to 16-bit lanes in zmm,
;     combined as (zmm0 << 8) | zmm1, shifted by the masked and zero-extended
;     amount with vpsllvw, shifted right by 8 and truncated with vpmovwb. The
;     mask comes from the constant pool (value not visible; presumably 7).
532 define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
533 ; GFNISSE-LABEL: var_fshl_v32i8:
535 ; GFNISSE-NEXT: movdqa %xmm0, %xmm6
536 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
537 ; GFNISSE-NEXT: pand %xmm8, %xmm4
538 ; GFNISSE-NEXT: pxor %xmm7, %xmm7
539 ; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
540 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
541 ; GFNISSE-NEXT: movdqa %xmm4, %xmm10
542 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15]
543 ; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
544 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7]
545 ; GFNISSE-NEXT: pslld $23, %xmm10
546 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
547 ; GFNISSE-NEXT: paddd %xmm4, %xmm10
548 ; GFNISSE-NEXT: cvttps2dq %xmm10, %xmm10
549 ; GFNISSE-NEXT: pslld $23, %xmm11
550 ; GFNISSE-NEXT: paddd %xmm4, %xmm11
551 ; GFNISSE-NEXT: cvttps2dq %xmm11, %xmm11
552 ; GFNISSE-NEXT: packusdw %xmm10, %xmm11
553 ; GFNISSE-NEXT: movdqa %xmm2, %xmm10
554 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15]
555 ; GFNISSE-NEXT: pmullw %xmm11, %xmm10
556 ; GFNISSE-NEXT: psrlw $8, %xmm10
557 ; GFNISSE-NEXT: pslld $23, %xmm0
558 ; GFNISSE-NEXT: paddd %xmm4, %xmm0
559 ; GFNISSE-NEXT: cvttps2dq %xmm0, %xmm0
560 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
561 ; GFNISSE-NEXT: pslld $23, %xmm9
562 ; GFNISSE-NEXT: paddd %xmm4, %xmm9
563 ; GFNISSE-NEXT: cvttps2dq %xmm9, %xmm9
564 ; GFNISSE-NEXT: packusdw %xmm9, %xmm0
565 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
566 ; GFNISSE-NEXT: pmullw %xmm2, %xmm0
567 ; GFNISSE-NEXT: psrlw $8, %xmm0
568 ; GFNISSE-NEXT: packuswb %xmm10, %xmm0
569 ; GFNISSE-NEXT: pand %xmm8, %xmm5
570 ; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
571 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
572 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
573 ; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
574 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
575 ; GFNISSE-NEXT: pslld $23, %xmm5
576 ; GFNISSE-NEXT: paddd %xmm4, %xmm5
577 ; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm5
578 ; GFNISSE-NEXT: pslld $23, %xmm7
579 ; GFNISSE-NEXT: paddd %xmm4, %xmm7
580 ; GFNISSE-NEXT: cvttps2dq %xmm7, %xmm7
581 ; GFNISSE-NEXT: packusdw %xmm5, %xmm7
582 ; GFNISSE-NEXT: movdqa %xmm3, %xmm5
583 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
584 ; GFNISSE-NEXT: pmullw %xmm7, %xmm5
585 ; GFNISSE-NEXT: psrlw $8, %xmm5
586 ; GFNISSE-NEXT: pslld $23, %xmm2
587 ; GFNISSE-NEXT: paddd %xmm4, %xmm2
588 ; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm2
589 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
590 ; GFNISSE-NEXT: pslld $23, %xmm6
591 ; GFNISSE-NEXT: paddd %xmm4, %xmm6
592 ; GFNISSE-NEXT: cvttps2dq %xmm6, %xmm4
593 ; GFNISSE-NEXT: packusdw %xmm4, %xmm2
594 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
595 ; GFNISSE-NEXT: pmullw %xmm3, %xmm2
596 ; GFNISSE-NEXT: psrlw $8, %xmm2
597 ; GFNISSE-NEXT: packuswb %xmm5, %xmm2
598 ; GFNISSE-NEXT: movdqa %xmm2, %xmm1
601 ; GFNIAVX1-LABEL: var_fshl_v32i8:
603 ; GFNIAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
604 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
605 ; GFNIAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
606 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
607 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7]
608 ; GFNIAVX1-NEXT: vpslld $23, %xmm3, %xmm7
609 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
610 ; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7
611 ; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
612 ; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
613 ; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
614 ; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
615 ; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
616 ; GFNIAVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
617 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
618 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
619 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
620 ; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6
621 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
622 ; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
623 ; GFNIAVX1-NEXT: vpslld $23, %xmm9, %xmm9
624 ; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm9
625 ; GFNIAVX1-NEXT: vcvttps2dq %xmm9, %xmm9
626 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
627 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
628 ; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4
629 ; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
630 ; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4
631 ; GFNIAVX1-NEXT: vpackusdw %xmm4, %xmm9, %xmm4
632 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
633 ; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4
634 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
635 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
636 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
637 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4,4,5,5,6,6,7,7]
638 ; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
639 ; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
640 ; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
641 ; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
642 ; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5
643 ; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5
644 ; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5
645 ; GFNIAVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
646 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
647 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
648 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
649 ; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
650 ; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
651 ; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
652 ; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
653 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
654 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
655 ; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2
656 ; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
657 ; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2
658 ; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
659 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
660 ; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
661 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
662 ; GFNIAVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0
663 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
664 ; GFNIAVX1-NEXT: retq
666 ; GFNIAVX2-LABEL: var_fshl_v32i8:
668 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
669 ; GFNIAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
670 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
671 ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
672 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
673 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15]
674 ; GFNIAVX2-NEXT: vpsllvd %ymm7, %ymm5, %ymm5
675 ; GFNIAVX2-NEXT: vpsrld $16, %ymm5, %ymm5
676 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
677 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
678 ; GFNIAVX2-NEXT: vpsllvd %ymm6, %ymm3, %ymm3
679 ; GFNIAVX2-NEXT: vpsrld $16, %ymm3, %ymm3
680 ; GFNIAVX2-NEXT: vpackusdw %ymm5, %ymm3, %ymm3
681 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
682 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
683 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
684 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
685 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
686 ; GFNIAVX2-NEXT: vpsllvd %ymm5, %ymm1, %ymm1
687 ; GFNIAVX2-NEXT: vpsrld $16, %ymm1, %ymm1
688 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
689 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
690 ; GFNIAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
691 ; GFNIAVX2-NEXT: vpsrld $16, %ymm0, %ymm0
692 ; GFNIAVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
693 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
694 ; GFNIAVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
695 ; GFNIAVX2-NEXT: retq
697 ; GFNIAVX512VL-LABEL: var_fshl_v32i8:
698 ; GFNIAVX512VL: # %bb.0:
699 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
700 ; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
701 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
702 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
703 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm1
704 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm6
705 ; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
706 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm6
707 ; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
708 ; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
709 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm5
710 ; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
711 ; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
712 ; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
713 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
714 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
715 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm4
716 ; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
717 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
718 ; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
719 ; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
720 ; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
721 ; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
722 ; GFNIAVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
723 ; GFNIAVX512VL-NEXT: retq
725 ; GFNIAVX512BW-LABEL: var_fshl_v32i8:
726 ; GFNIAVX512BW: # %bb.0:
727 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
728 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
729 ; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
730 ; GFNIAVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
731 ; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
732 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
733 ; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
734 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
735 ; GFNIAVX512BW-NEXT: vpmovwb %zmm0, %ymm0
736 ; GFNIAVX512BW-NEXT: retq
; The shift amount is the unmodified function argument, so each lane may differ.
737 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt)
; Funnel-shift-right of <32 x i8> with a fully variable per-lane shift amount
; (%amt). Expected lowering per check prefix:
;   * GFNISSE, GFNIAVX1, GFNIAVX2, GFNIAVX512VL - all four use the same shape:
;     each input goes through a three-step ladder of pblendvb/vpblendvb selects
;     between the current value and a gf2p8affineqb-transformed copy (or a
;     byte self-add on the last step of one side). The select masks come from
;     the amount masked with a splat of 7 and moved up with psllw $5, doubled
;     with paddb between steps. One side uses amount & 7; the other side is
;     first doubled (paddb x, x) and uses the complemented amount (pandn with
;     7, or vpxor with 7 on GFNIAVX1). The two sides are OR'd at the end.
;     GFNISSE and GFNIAVX1 work on 128-bit halves; GFNIAVX2 and GFNIAVX512VL
;     on full ymm registers with the matrices taken from the constant pool.
;   * GFNIAVX512BW - both inputs are zero-extended to 16-bit lanes in zmm,
;     combined as (zmm0 << 8) | zmm1, shifted right by the masked and
;     zero-extended amount with vpsrlvw and truncated with vpmovwb. The mask
;     comes from the constant pool (value not visible; presumably 7). No GFNI
;     instruction is expected for this prefix.
741 define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
742 ; GFNISSE-LABEL: var_fshr_v32i8:
744 ; GFNISSE-NEXT: movdqa %xmm4, %xmm6
745 ; GFNISSE-NEXT: movdqa %xmm0, %xmm4
746 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
747 ; GFNISSE-NEXT: movdqa %xmm2, %xmm9
748 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm9
749 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
750 ; GFNISSE-NEXT: movdqa %xmm6, %xmm0
751 ; GFNISSE-NEXT: pand %xmm7, %xmm0
752 ; GFNISSE-NEXT: psllw $5, %xmm0
753 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm9, %xmm2
754 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
755 ; GFNISSE-NEXT: movdqa %xmm2, %xmm10
756 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm10
757 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
758 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm2
759 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
760 ; GFNISSE-NEXT: movdqa %xmm2, %xmm11
761 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm11
762 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
763 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm2
764 ; GFNISSE-NEXT: paddb %xmm4, %xmm4
765 ; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm11 = [16909320,16909320]
766 ; GFNISSE-NEXT: movdqa %xmm4, %xmm12
767 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm12
768 ; GFNISSE-NEXT: pandn %xmm7, %xmm6
769 ; GFNISSE-NEXT: psllw $5, %xmm6
770 ; GFNISSE-NEXT: movdqa %xmm6, %xmm0
771 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm4
772 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
773 ; GFNISSE-NEXT: movdqa %xmm4, %xmm13
774 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm13
775 ; GFNISSE-NEXT: paddb %xmm6, %xmm6
776 ; GFNISSE-NEXT: movdqa %xmm6, %xmm0
777 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm4
778 ; GFNISSE-NEXT: movdqa %xmm4, %xmm13
779 ; GFNISSE-NEXT: paddb %xmm4, %xmm13
780 ; GFNISSE-NEXT: paddb %xmm6, %xmm6
781 ; GFNISSE-NEXT: movdqa %xmm6, %xmm0
782 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm4
783 ; GFNISSE-NEXT: por %xmm2, %xmm4
784 ; GFNISSE-NEXT: movdqa %xmm3, %xmm2
785 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm2
786 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0
787 ; GFNISSE-NEXT: pand %xmm7, %xmm0
788 ; GFNISSE-NEXT: psllw $5, %xmm0
789 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
790 ; GFNISSE-NEXT: movdqa %xmm3, %xmm2
791 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm2
792 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
793 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
794 ; GFNISSE-NEXT: movdqa %xmm3, %xmm2
795 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm2
796 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
797 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
798 ; GFNISSE-NEXT: paddb %xmm1, %xmm1
799 ; GFNISSE-NEXT: movdqa %xmm1, %xmm2
800 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm2
801 ; GFNISSE-NEXT: pandn %xmm7, %xmm5
802 ; GFNISSE-NEXT: psllw $5, %xmm5
803 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0
804 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
805 ; GFNISSE-NEXT: movdqa %xmm1, %xmm2
806 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm2
807 ; GFNISSE-NEXT: paddb %xmm5, %xmm5
808 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0
809 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
810 ; GFNISSE-NEXT: movdqa %xmm1, %xmm2
811 ; GFNISSE-NEXT: paddb %xmm1, %xmm2
812 ; GFNISSE-NEXT: paddb %xmm5, %xmm5
813 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0
814 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
815 ; GFNISSE-NEXT: por %xmm3, %xmm1
816 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0
819 ; GFNIAVX1-LABEL: var_fshr_v32i8:
821 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
822 ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
823 ; GFNIAVX1-NEXT: # xmm5 = mem[0,0]
824 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm4, %xmm6
825 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
826 ; GFNIAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
827 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
828 ; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm8
829 ; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm4, %xmm4
830 ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
831 ; GFNIAVX1-NEXT: # xmm6 = mem[0,0]
832 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm4, %xmm9
833 ; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
834 ; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm4, %xmm4
835 ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
836 ; GFNIAVX1-NEXT: # xmm9 = mem[0,0]
837 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm4, %xmm10
838 ; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
839 ; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm10, %xmm4, %xmm4
840 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
841 ; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
842 ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm10 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
843 ; GFNIAVX1-NEXT: # xmm10 = mem[0,0]
844 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm8, %xmm11
845 ; GFNIAVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7
846 ; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm7
847 ; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm11, %xmm8, %xmm8
848 ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm11 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
849 ; GFNIAVX1-NEXT: # xmm11 = mem[0,0]
850 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm11, %xmm8, %xmm12
851 ; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
852 ; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm12, %xmm8, %xmm8
853 ; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm12
854 ; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
855 ; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm12, %xmm8, %xmm7
856 ; GFNIAVX1-NEXT: vpor %xmm4, %xmm7, %xmm4
857 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm1, %xmm5
858 ; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm7
859 ; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm1, %xmm1
860 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm1, %xmm5
861 ; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm6
862 ; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
863 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm1, %xmm5
864 ; GFNIAVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
865 ; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
866 ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
867 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm0, %xmm5
868 ; GFNIAVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
869 ; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
870 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm0, %xmm0
871 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm11, %xmm0, %xmm3
872 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
873 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
874 ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3
875 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
876 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
877 ; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
878 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
879 ; GFNIAVX1-NEXT: retq
881 ; GFNIAVX2-LABEL: var_fshr_v32i8:
883 ; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
884 ; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
885 ; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4
886 ; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm5
887 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm6
888 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
889 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4
890 ; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
891 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4
892 ; GFNIAVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm5
893 ; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
894 ; GFNIAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
895 ; GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2
896 ; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3
897 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
898 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4
899 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
900 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
901 ; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
902 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
903 ; GFNIAVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
904 ; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
905 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
906 ; GFNIAVX2-NEXT: retq
908 ; GFNIAVX512VL-LABEL: var_fshr_v32i8:
909 ; GFNIAVX512VL: # %bb.0:
910 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
911 ; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
912 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
913 ; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5
914 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm6
915 ; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
916 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm4
917 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
918 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm4
919 ; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
920 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
921 ; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2
922 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
923 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
924 ; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
925 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm4
926 ; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
927 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
928 ; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
929 ; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
930 ; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
931 ; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
932 ; GFNIAVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
933 ; GFNIAVX512VL-NEXT: retq
935 ; GFNIAVX512BW-LABEL: var_fshr_v32i8:
936 ; GFNIAVX512BW: # %bb.0:
937 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
938 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
939 ; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
940 ; GFNIAVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
941 ; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
942 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
943 ; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
944 ; GFNIAVX512BW-NEXT: vpmovwb %zmm0, %ymm0
945 ; GFNIAVX512BW-NEXT: retq
; The shift amount is the unmodified function argument, so each lane may differ.
946 %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt)
; Funnel shift left of <32 x i8> where every lane uses the same variable
; amount: element 0 of %amt is broadcast to all 32 lanes (all-zero shuffle
; mask) before being passed to the intrinsic. The second shuffle operand is
; never selected by that mask, so it is spelled as poison.
950 define <32 x i8> @splatvar_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
951 ; GFNISSE-LABEL: splatvar_fshl_v32i8:
953 ; GFNISSE-NEXT: movdqa %xmm2, %xmm5
954 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
955 ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
956 ; GFNISSE-NEXT: psllw %xmm4, %xmm5
957 ; GFNISSE-NEXT: psrlw $8, %xmm5
958 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
959 ; GFNISSE-NEXT: psllw %xmm4, %xmm2
960 ; GFNISSE-NEXT: psrlw $8, %xmm2
961 ; GFNISSE-NEXT: packuswb %xmm5, %xmm2
962 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0
963 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
964 ; GFNISSE-NEXT: psllw %xmm4, %xmm0
965 ; GFNISSE-NEXT: psrlw $8, %xmm0
966 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
967 ; GFNISSE-NEXT: psllw %xmm4, %xmm3
968 ; GFNISSE-NEXT: psrlw $8, %xmm3
969 ; GFNISSE-NEXT: packuswb %xmm0, %xmm3
970 ; GFNISSE-NEXT: movdqa %xmm2, %xmm0
971 ; GFNISSE-NEXT: movdqa %xmm3, %xmm1
974 ; GFNIAVX1-LABEL: splatvar_fshl_v32i8:
976 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
977 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
978 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
979 ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
980 ; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm5, %xmm5
981 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
982 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
983 ; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
984 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
985 ; GFNIAVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
986 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
987 ; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm4, %xmm4
988 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
989 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
990 ; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
991 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
992 ; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
993 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
994 ; GFNIAVX1-NEXT: retq
996 ; GFNIAVX2-LABEL: splatvar_fshl_v32i8:
998 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
999 ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1000 ; GFNIAVX2-NEXT: vpsllw %xmm2, %ymm3, %ymm3
1001 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
1002 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1003 ; GFNIAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1004 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1005 ; GFNIAVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1006 ; GFNIAVX2-NEXT: retq
1008 ; GFNIAVX512-LABEL: splatvar_fshl_v32i8:
1009 ; GFNIAVX512: # %bb.0:
1010 ; GFNIAVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1011 ; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1012 ; GFNIAVX512-NEXT: vpsllw %xmm2, %ymm3, %ymm3
1013 ; GFNIAVX512-NEXT: vpsrlw $8, %ymm3, %ymm3
1014 ; GFNIAVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1015 ; GFNIAVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1016 ; GFNIAVX512-NEXT: vpsrlw $8, %ymm0, %ymm0
1017 ; GFNIAVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1018 ; GFNIAVX512-NEXT: retq
1019 %splat = shufflevector <32 x i8> %amt, <32 x i8> poison, <32 x i32> zeroinitializer
1020 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %splat)
; Funnel shift right of <32 x i8> where every lane uses the same variable
; amount: element 0 of %amt is broadcast to all 32 lanes (all-zero shuffle
; mask) before being passed to the intrinsic. The second shuffle operand is
; never selected by that mask, so it is spelled as poison.
1024 define <32 x i8> @splatvar_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
1025 ; GFNISSE-LABEL: splatvar_fshr_v32i8:
1027 ; GFNISSE-NEXT: movdqa %xmm2, %xmm6
1028 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
1029 ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
1030 ; GFNISSE-NEXT: psrlw %xmm4, %xmm6
1031 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
1032 ; GFNISSE-NEXT: pand %xmm5, %xmm6
1033 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1034 ; GFNISSE-NEXT: psrlw %xmm4, %xmm2
1035 ; GFNISSE-NEXT: pand %xmm5, %xmm2
1036 ; GFNISSE-NEXT: packuswb %xmm6, %xmm2
1037 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0
1038 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1039 ; GFNISSE-NEXT: psrlw %xmm4, %xmm0
1040 ; GFNISSE-NEXT: pand %xmm5, %xmm0
1041 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1042 ; GFNISSE-NEXT: psrlw %xmm4, %xmm3
1043 ; GFNISSE-NEXT: pand %xmm3, %xmm5
1044 ; GFNISSE-NEXT: packuswb %xmm0, %xmm5
1045 ; GFNISSE-NEXT: movdqa %xmm2, %xmm0
1046 ; GFNISSE-NEXT: movdqa %xmm5, %xmm1
1047 ; GFNISSE-NEXT: retq
1049 ; GFNIAVX1-LABEL: splatvar_fshr_v32i8:
1050 ; GFNIAVX1: # %bb.0:
1051 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1052 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1053 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
1054 ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1055 ; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
1056 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
1057 ; GFNIAVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
1058 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1059 ; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1060 ; GFNIAVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
1061 ; GFNIAVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
1062 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1063 ; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm4, %xmm4
1064 ; GFNIAVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
1065 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1066 ; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1067 ; GFNIAVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
1068 ; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
1069 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1070 ; GFNIAVX1-NEXT: retq
1072 ; GFNIAVX2-LABEL: splatvar_fshr_v32i8:
1073 ; GFNIAVX2: # %bb.0:
1074 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1075 ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1076 ; GFNIAVX2-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
1077 ; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1078 ; GFNIAVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1079 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1080 ; GFNIAVX2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
1081 ; GFNIAVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
1082 ; GFNIAVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1083 ; GFNIAVX2-NEXT: retq
1085 ; GFNIAVX512VL-LABEL: splatvar_fshr_v32i8:
1086 ; GFNIAVX512VL: # %bb.0:
1087 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1088 ; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1089 ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
1090 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1091 ; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
1092 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1093 ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
1094 ; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
1095 ; GFNIAVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1096 ; GFNIAVX512VL-NEXT: retq
1098 ; GFNIAVX512BW-LABEL: splatvar_fshr_v32i8:
1099 ; GFNIAVX512BW: # %bb.0:
1100 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1101 ; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1102 ; GFNIAVX512BW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
1103 ; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1104 ; GFNIAVX512BW-NEXT: vpand %ymm4, %ymm3, %ymm3
1105 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1106 ; GFNIAVX512BW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
1107 ; GFNIAVX512BW-NEXT: vpand %ymm4, %ymm0, %ymm0
1108 ; GFNIAVX512BW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1109 ; GFNIAVX512BW-NEXT: retq
1110 %splat = shufflevector <32 x i8> %amt, <32 x i8> poison, <32 x i32> zeroinitializer
1111 %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %splat)
; Funnel shift left of <32 x i8> by a non-uniform constant amount vector:
; 0,1,...,7 followed by 8,7,...,1, with the same 16-entry pattern repeated for
; the upper 16 lanes. The pattern includes the amount 8, which equals the
; element width in bits.
1115 define <32 x i8> @constant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
1116 ; GFNISSE-LABEL: constant_fshl_v32i8:
1118 ; GFNISSE-NEXT: movdqa %xmm2, %xmm5
1119 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
1120 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2]
1121 ; GFNISSE-NEXT: pmullw %xmm6, %xmm5
1122 ; GFNISSE-NEXT: psrlw $8, %xmm5
1123 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1124 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
1125 ; GFNISSE-NEXT: pmullw %xmm4, %xmm2
1126 ; GFNISSE-NEXT: psrlw $8, %xmm2
1127 ; GFNISSE-NEXT: packuswb %xmm5, %xmm2
1128 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0
1129 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1130 ; GFNISSE-NEXT: pmullw %xmm6, %xmm0
1131 ; GFNISSE-NEXT: psrlw $8, %xmm0
1132 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1133 ; GFNISSE-NEXT: pmullw %xmm3, %xmm4
1134 ; GFNISSE-NEXT: psrlw $8, %xmm4
1135 ; GFNISSE-NEXT: packuswb %xmm0, %xmm4
1136 ; GFNISSE-NEXT: movdqa %xmm2, %xmm0
1137 ; GFNISSE-NEXT: movdqa %xmm4, %xmm1
1138 ; GFNISSE-NEXT: retq
1140 ; GFNIAVX1-LABEL: constant_fshl_v32i8:
1141 ; GFNIAVX1: # %bb.0:
1142 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1143 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1144 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1145 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2]
1146 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
1147 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
1148 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1149 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
1150 ; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
1151 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1152 ; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
1153 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1154 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
1155 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
1156 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1157 ; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
1158 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1159 ; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
1160 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1161 ; GFNIAVX1-NEXT: retq
1163 ; GFNIAVX2-LABEL: constant_fshl_v32i8:
1164 ; GFNIAVX2: # %bb.0:
1165 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1166 ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
1167 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
1168 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1169 ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1170 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1171 ; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1172 ; GFNIAVX2-NEXT: retq
1174 ; GFNIAVX512VL-LABEL: constant_fshl_v32i8:
1175 ; GFNIAVX512VL: # %bb.0:
1176 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1177 ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
1178 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1179 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1180 ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1181 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1182 ; GFNIAVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1183 ; GFNIAVX512VL-NEXT: retq
1185 ; GFNIAVX512BW-LABEL: constant_fshl_v32i8:
1186 ; GFNIAVX512BW: # %bb.0:
1187 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
1188 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1189 ; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
1190 ; GFNIAVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1191 ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1192 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
1193 ; GFNIAVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1194 ; GFNIAVX512BW-NEXT: retq
1195 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
; Non-uniform constant funnel shift test for <32 x i8>, named for fshr.
; NOTE(review): the call at the end of this function targets
; @llvm.fshl.v32i8 with the same amount vector as constant_fshl_v32i8, and the
; autogenerated assertions below match that test's instruction for
; instruction, so no constant fshr lowering is exercised here. This looks like
; a copy-paste slip - confirm, switch the call to @llvm.fshr.v32i8, and
; regenerate the assertions with utils/update_llc_test_checks.py.
1199 define <32 x i8> @constant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
1200 ; GFNISSE-LABEL: constant_fshr_v32i8:
1202 ; GFNISSE-NEXT: movdqa %xmm2, %xmm5
1203 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
1204 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2]
1205 ; GFNISSE-NEXT: pmullw %xmm6, %xmm5
1206 ; GFNISSE-NEXT: psrlw $8, %xmm5
1207 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1208 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
1209 ; GFNISSE-NEXT: pmullw %xmm4, %xmm2
1210 ; GFNISSE-NEXT: psrlw $8, %xmm2
1211 ; GFNISSE-NEXT: packuswb %xmm5, %xmm2
1212 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0
1213 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1214 ; GFNISSE-NEXT: pmullw %xmm6, %xmm0
1215 ; GFNISSE-NEXT: psrlw $8, %xmm0
1216 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1217 ; GFNISSE-NEXT: pmullw %xmm3, %xmm4
1218 ; GFNISSE-NEXT: psrlw $8, %xmm4
1219 ; GFNISSE-NEXT: packuswb %xmm0, %xmm4
1220 ; GFNISSE-NEXT: movdqa %xmm2, %xmm0
1221 ; GFNISSE-NEXT: movdqa %xmm4, %xmm1
1222 ; GFNISSE-NEXT: retq
1224 ; GFNIAVX1-LABEL: constant_fshr_v32i8:
1225 ; GFNIAVX1: # %bb.0:
1226 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1227 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1228 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1229 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2]
1230 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
1231 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
1232 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1233 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
1234 ; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
1235 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1236 ; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
1237 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1238 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
1239 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
1240 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1241 ; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
1242 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1243 ; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
1244 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1245 ; GFNIAVX1-NEXT: retq
1247 ; GFNIAVX2-LABEL: constant_fshr_v32i8:
1248 ; GFNIAVX2: # %bb.0:
1249 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1250 ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
1251 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
1252 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1253 ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1254 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1255 ; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1256 ; GFNIAVX2-NEXT: retq
1258 ; GFNIAVX512VL-LABEL: constant_fshr_v32i8:
1259 ; GFNIAVX512VL: # %bb.0:
1260 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1261 ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
1262 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1263 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1264 ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1265 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1266 ; GFNIAVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1267 ; GFNIAVX512VL-NEXT: retq
1269 ; GFNIAVX512BW-LABEL: constant_fshr_v32i8:
1270 ; GFNIAVX512BW: # %bb.0:
1271 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
1272 ; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1273 ; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
1274 ; GFNIAVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1275 ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1276 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
1277 ; GFNIAVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1278 ; GFNIAVX512BW-NEXT: retq
1279 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
; Funnel shift left of <32 x i8> by the uniform constant 4 in every lane.
; The SSE4.2, AVX1 and AVX2 runs expect one gf2p8affineqb per input combined
; with an OR; the AVX512 runs expect vpsllw $4 / vpsrlw $4 merged by a
; vpternlogd against a broadcast constant.
1283 define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
1284 ; GFNISSE-LABEL: splatconstant_fshl_v32i8:
1286 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
1287 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2
1288 ; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm5 = [16909320,16909320]
1289 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
1290 ; GFNISSE-NEXT: por %xmm2, %xmm0
1291 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3
1292 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
1293 ; GFNISSE-NEXT: por %xmm3, %xmm1
1294 ; GFNISSE-NEXT: retq
1296 ; GFNIAVX1-LABEL: splatconstant_fshl_v32i8:
1297 ; GFNIAVX1: # %bb.0:
1298 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1299 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1300 ; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1301 ; GFNIAVX1-NEXT: retq
1303 ; GFNIAVX2-LABEL: splatconstant_fshl_v32i8:
1304 ; GFNIAVX2: # %bb.0:
1305 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1306 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1307 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1308 ; GFNIAVX2-NEXT: retq
1310 ; GFNIAVX512-LABEL: splatconstant_fshl_v32i8:
1311 ; GFNIAVX512: # %bb.0:
1312 ; GFNIAVX512-NEXT: vpsllw $4, %ymm0, %ymm2
1313 ; GFNIAVX512-NEXT: vpsrlw $4, %ymm1, %ymm0
1314 ; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
1315 ; GFNIAVX512-NEXT: retq
1316 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
1319 declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
; Funnel shift right of <32 x i8> by the uniform constant 6 in every lane.
; The SSE4.2, AVX1 and AVX2 runs expect one gf2p8affineqb per input combined
; with an OR; the AVX512 runs expect vpsllw $2 / vpsrlw $6 merged by a
; vpternlogd against a broadcast constant.
1321 define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
1322 ; GFNISSE-LABEL: splatconstant_fshr_v32i8:
1324 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
1325 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2
1326 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
1327 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
1328 ; GFNISSE-NEXT: por %xmm2, %xmm0
1329 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3
1330 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
1331 ; GFNISSE-NEXT: por %xmm3, %xmm1
1332 ; GFNISSE-NEXT: retq
1334 ; GFNIAVX1-LABEL: splatconstant_fshr_v32i8:
1335 ; GFNIAVX1: # %bb.0:
1336 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1337 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1338 ; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1339 ; GFNIAVX1-NEXT: retq
1341 ; GFNIAVX2-LABEL: splatconstant_fshr_v32i8:
1342 ; GFNIAVX2: # %bb.0:
1343 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1344 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1345 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1346 ; GFNIAVX2-NEXT: retq
1348 ; GFNIAVX512-LABEL: splatconstant_fshr_v32i8:
1349 ; GFNIAVX512: # %bb.0:
1350 ; GFNIAVX512-NEXT: vpsllw $2, %ymm0, %ymm2
1351 ; GFNIAVX512-NEXT: vpsrlw $6, %ymm1, %ymm0
1352 ; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
1353 ; GFNIAVX512-NEXT: retq
1354 %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>)
1357 declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
1360 ; 512 Bit Vector Funnel Shifts
1363 define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
1364 ; GFNISSE-LABEL: var_fshl_v64i8:
1366 ; GFNISSE-NEXT: movdqa %xmm1, %xmm8
1367 ; GFNISSE-NEXT: movdqa %xmm0, %xmm1
1368 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1369 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
1370 ; GFNISSE-NEXT: pand %xmm9, %xmm0
1371 ; GFNISSE-NEXT: pxor %xmm10, %xmm10
1372 ; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1373 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1374 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
1375 ; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1376 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1377 ; GFNISSE-NEXT: pslld $23, %xmm0
1378 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [1065353216,1065353216,1065353216,1065353216]
1379 ; GFNISSE-NEXT: paddd %xmm11, %xmm0
1380 ; GFNISSE-NEXT: cvttps2dq %xmm0, %xmm0
1381 ; GFNISSE-NEXT: pslld $23, %xmm14
1382 ; GFNISSE-NEXT: paddd %xmm11, %xmm14
1383 ; GFNISSE-NEXT: cvttps2dq %xmm14, %xmm14
1384 ; GFNISSE-NEXT: packusdw %xmm0, %xmm14
1385 ; GFNISSE-NEXT: movdqa %xmm4, %xmm15
1386 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
1387 ; GFNISSE-NEXT: pmullw %xmm14, %xmm15
1388 ; GFNISSE-NEXT: psrlw $8, %xmm15
1389 ; GFNISSE-NEXT: pslld $23, %xmm12
1390 ; GFNISSE-NEXT: paddd %xmm11, %xmm12
1391 ; GFNISSE-NEXT: cvttps2dq %xmm12, %xmm0
1392 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4,4,5,5,6,6,7,7]
1393 ; GFNISSE-NEXT: pslld $23, %xmm13
1394 ; GFNISSE-NEXT: paddd %xmm11, %xmm13
1395 ; GFNISSE-NEXT: cvttps2dq %xmm13, %xmm12
1396 ; GFNISSE-NEXT: packusdw %xmm12, %xmm0
1397 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
1398 ; GFNISSE-NEXT: pmullw %xmm4, %xmm0
1399 ; GFNISSE-NEXT: psrlw $8, %xmm0
1400 ; GFNISSE-NEXT: packuswb %xmm15, %xmm0
1401 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
1402 ; GFNISSE-NEXT: pand %xmm9, %xmm1
1403 ; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1404 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1405 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
1406 ; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1407 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1408 ; GFNISSE-NEXT: pslld $23, %xmm1
1409 ; GFNISSE-NEXT: paddd %xmm11, %xmm1
1410 ; GFNISSE-NEXT: cvttps2dq %xmm1, %xmm1
1411 ; GFNISSE-NEXT: pslld $23, %xmm13
1412 ; GFNISSE-NEXT: paddd %xmm11, %xmm13
1413 ; GFNISSE-NEXT: cvttps2dq %xmm13, %xmm13
1414 ; GFNISSE-NEXT: packusdw %xmm1, %xmm13
1415 ; GFNISSE-NEXT: movdqa %xmm5, %xmm14
1416 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15]
1417 ; GFNISSE-NEXT: pmullw %xmm13, %xmm14
1418 ; GFNISSE-NEXT: psrlw $8, %xmm14
1419 ; GFNISSE-NEXT: pslld $23, %xmm4
1420 ; GFNISSE-NEXT: paddd %xmm11, %xmm4
1421 ; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm1
1422 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7]
1423 ; GFNISSE-NEXT: pslld $23, %xmm12
1424 ; GFNISSE-NEXT: paddd %xmm11, %xmm12
1425 ; GFNISSE-NEXT: cvttps2dq %xmm12, %xmm4
1426 ; GFNISSE-NEXT: packusdw %xmm4, %xmm1
1427 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
1428 ; GFNISSE-NEXT: pmullw %xmm5, %xmm1
1429 ; GFNISSE-NEXT: psrlw $8, %xmm1
1430 ; GFNISSE-NEXT: packuswb %xmm14, %xmm1
1431 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4
1432 ; GFNISSE-NEXT: pand %xmm9, %xmm4
1433 ; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
1434 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
1435 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
1436 ; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
1437 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
1438 ; GFNISSE-NEXT: pslld $23, %xmm4
1439 ; GFNISSE-NEXT: paddd %xmm11, %xmm4
1440 ; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm4
1441 ; GFNISSE-NEXT: pslld $23, %xmm12
1442 ; GFNISSE-NEXT: paddd %xmm11, %xmm12
1443 ; GFNISSE-NEXT: cvttps2dq %xmm12, %xmm12
1444 ; GFNISSE-NEXT: packusdw %xmm4, %xmm12
1445 ; GFNISSE-NEXT: movdqa %xmm6, %xmm13
1446 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15]
1447 ; GFNISSE-NEXT: pmullw %xmm12, %xmm13
1448 ; GFNISSE-NEXT: psrlw $8, %xmm13
1449 ; GFNISSE-NEXT: pslld $23, %xmm5
1450 ; GFNISSE-NEXT: paddd %xmm11, %xmm5
1451 ; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm4
1452 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
1453 ; GFNISSE-NEXT: pslld $23, %xmm8
1454 ; GFNISSE-NEXT: paddd %xmm11, %xmm8
1455 ; GFNISSE-NEXT: cvttps2dq %xmm8, %xmm5
1456 ; GFNISSE-NEXT: packusdw %xmm5, %xmm4
1457 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
1458 ; GFNISSE-NEXT: pmullw %xmm6, %xmm4
1459 ; GFNISSE-NEXT: psrlw $8, %xmm4
1460 ; GFNISSE-NEXT: packuswb %xmm13, %xmm4
1461 ; GFNISSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm9
1462 ; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
1463 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
1464 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
1465 ; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
1466 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
1467 ; GFNISSE-NEXT: pslld $23, %xmm9
1468 ; GFNISSE-NEXT: paddd %xmm11, %xmm9
1469 ; GFNISSE-NEXT: cvttps2dq %xmm9, %xmm8
1470 ; GFNISSE-NEXT: pslld $23, %xmm5
1471 ; GFNISSE-NEXT: paddd %xmm11, %xmm5
1472 ; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm5
1473 ; GFNISSE-NEXT: packusdw %xmm8, %xmm5
1474 ; GFNISSE-NEXT: movdqa %xmm7, %xmm8
1475 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm3[8],xmm8[9],xmm3[9],xmm8[10],xmm3[10],xmm8[11],xmm3[11],xmm8[12],xmm3[12],xmm8[13],xmm3[13],xmm8[14],xmm3[14],xmm8[15],xmm3[15]
1476 ; GFNISSE-NEXT: pmullw %xmm5, %xmm8
1477 ; GFNISSE-NEXT: psrlw $8, %xmm8
1478 ; GFNISSE-NEXT: pslld $23, %xmm2
1479 ; GFNISSE-NEXT: paddd %xmm11, %xmm2
1480 ; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm5
1481 ; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
1482 ; GFNISSE-NEXT: pslld $23, %xmm6
1483 ; GFNISSE-NEXT: paddd %xmm11, %xmm6
1484 ; GFNISSE-NEXT: cvttps2dq %xmm6, %xmm2
1485 ; GFNISSE-NEXT: packusdw %xmm2, %xmm5
1486 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
1487 ; GFNISSE-NEXT: pmullw %xmm7, %xmm5
1488 ; GFNISSE-NEXT: psrlw $8, %xmm5
1489 ; GFNISSE-NEXT: packuswb %xmm8, %xmm5
1490 ; GFNISSE-NEXT: movdqa %xmm4, %xmm2
1491 ; GFNISSE-NEXT: movdqa %xmm5, %xmm3
1492 ; GFNISSE-NEXT: retq
1494 ; GFNIAVX1-LABEL: var_fshl_v64i8:
1495 ; GFNIAVX1: # %bb.0:
1496 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1497 ; GFNIAVX1-NEXT: vandps %ymm7, %ymm4, %ymm8
1498 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm8, %xmm9
1499 ; GFNIAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
1500 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
1501 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4,4,5,5,6,6,7,7]
1502 ; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm11
1503 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
1504 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11
1505 ; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11
1506 ; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
1507 ; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10
1508 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10
1509 ; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10
1510 ; GFNIAVX1-NEXT: vpackusdw %xmm11, %xmm10, %xmm10
1511 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm11
1512 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm12
1513 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
1514 ; GFNIAVX1-NEXT: vpmullw %xmm10, %xmm13, %xmm10
1515 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm10, %xmm10
1516 ; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
1517 ; GFNIAVX1-NEXT: vpslld $23, %xmm13, %xmm13
1518 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm13, %xmm13
1519 ; GFNIAVX1-NEXT: vcvttps2dq %xmm13, %xmm13
1520 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
1521 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
1522 ; GFNIAVX1-NEXT: vpslld $23, %xmm9, %xmm9
1523 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm9
1524 ; GFNIAVX1-NEXT: vcvttps2dq %xmm9, %xmm9
1525 ; GFNIAVX1-NEXT: vpackusdw %xmm9, %xmm13, %xmm9
1526 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
1527 ; GFNIAVX1-NEXT: vpmullw %xmm9, %xmm11, %xmm9
1528 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm9, %xmm9
1529 ; GFNIAVX1-NEXT: vpackuswb %xmm10, %xmm9, %xmm9
1530 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
1531 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4,4,5,5,6,6,7,7]
1532 ; GFNIAVX1-NEXT: vpslld $23, %xmm11, %xmm11
1533 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11
1534 ; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11
1535 ; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
1536 ; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10
1537 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10
1538 ; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10
1539 ; GFNIAVX1-NEXT: vpackusdw %xmm11, %xmm10, %xmm10
1540 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
1541 ; GFNIAVX1-NEXT: vpmullw %xmm10, %xmm11, %xmm10
1542 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm10, %xmm10
1543 ; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
1544 ; GFNIAVX1-NEXT: vpslld $23, %xmm11, %xmm11
1545 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11
1546 ; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11
1547 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
1548 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
1549 ; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8
1550 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm8
1551 ; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8
1552 ; GFNIAVX1-NEXT: vpackusdw %xmm8, %xmm11, %xmm8
1553 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1554 ; GFNIAVX1-NEXT: vpmullw %xmm0, %xmm8, %xmm0
1555 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1556 ; GFNIAVX1-NEXT: vpackuswb %xmm10, %xmm0, %xmm0
1557 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
1558 ; GFNIAVX1-NEXT: vandps %ymm7, %ymm5, %ymm2
1559 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1560 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
1561 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4,4,5,5,6,6,7,7]
1562 ; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8
1563 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm8
1564 ; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8
1565 ; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
1566 ; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7
1567 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
1568 ; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
1569 ; GFNIAVX1-NEXT: vpackusdw %xmm8, %xmm7, %xmm7
1570 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
1571 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
1572 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
1573 ; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm10, %xmm7
1574 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
1575 ; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
1576 ; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10
1577 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10
1578 ; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10
1579 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
1580 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
1581 ; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5
1582 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
1583 ; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5
1584 ; GFNIAVX1-NEXT: vpackusdw %xmm5, %xmm10, %xmm5
1585 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
1586 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm8, %xmm5
1587 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
1588 ; GFNIAVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm5
1589 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
1590 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4,4,5,5,6,6,7,7]
1591 ; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7
1592 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
1593 ; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
1594 ; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
1595 ; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
1596 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
1597 ; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
1598 ; GFNIAVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
1599 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
1600 ; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
1601 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
1602 ; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
1603 ; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7
1604 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
1605 ; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
1606 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1607 ; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
1608 ; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2
1609 ; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
1610 ; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2
1611 ; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm7, %xmm2
1612 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1613 ; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
1614 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
1615 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1
1616 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
1617 ; GFNIAVX1-NEXT: retq
1619 ; GFNIAVX2-LABEL: var_fshl_v64i8:
1620 ; GFNIAVX2: # %bb.0:
1621 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
1622 ; GFNIAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
1623 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15]
1624 ; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1625 ; GFNIAVX2-NEXT: vpand %ymm4, %ymm9, %ymm4
1626 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31]
1627 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[12],ymm6[12],ymm10[13],ymm6[13],ymm10[14],ymm6[14],ymm10[15],ymm6[15]
1628 ; GFNIAVX2-NEXT: vpsllvd %ymm11, %ymm8, %ymm8
1629 ; GFNIAVX2-NEXT: vpsrld $16, %ymm8, %ymm8
1630 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11]
1631 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11]
1632 ; GFNIAVX2-NEXT: vpsllvd %ymm10, %ymm7, %ymm7
1633 ; GFNIAVX2-NEXT: vpsrld $16, %ymm7, %ymm7
1634 ; GFNIAVX2-NEXT: vpackusdw %ymm8, %ymm7, %ymm7
1635 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm7, %ymm7
1636 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
1637 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15]
1638 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23]
1639 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15]
1640 ; GFNIAVX2-NEXT: vpsllvd %ymm8, %ymm2, %ymm2
1641 ; GFNIAVX2-NEXT: vpsrld $16, %ymm2, %ymm2
1642 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11]
1643 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
1644 ; GFNIAVX2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
1645 ; GFNIAVX2-NEXT: vpsrld $16, %ymm0, %ymm0
1646 ; GFNIAVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
1647 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1648 ; GFNIAVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm0
1649 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
1650 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15]
1651 ; GFNIAVX2-NEXT: vpand %ymm5, %ymm9, %ymm5
1652 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31]
1653 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15]
1654 ; GFNIAVX2-NEXT: vpsllvd %ymm8, %ymm4, %ymm4
1655 ; GFNIAVX2-NEXT: vpsrld $16, %ymm4, %ymm4
1656 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
1657 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
1658 ; GFNIAVX2-NEXT: vpsllvd %ymm7, %ymm2, %ymm2
1659 ; GFNIAVX2-NEXT: vpsrld $16, %ymm2, %ymm2
1660 ; GFNIAVX2-NEXT: vpackusdw %ymm4, %ymm2, %ymm2
1661 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
1662 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
1663 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15]
1664 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23]
1665 ; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15]
1666 ; GFNIAVX2-NEXT: vpsllvd %ymm5, %ymm3, %ymm3
1667 ; GFNIAVX2-NEXT: vpsrld $16, %ymm3, %ymm3
1668 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11]
1669 ; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
1670 ; GFNIAVX2-NEXT: vpsllvd %ymm4, %ymm1, %ymm1
1671 ; GFNIAVX2-NEXT: vpsrld $16, %ymm1, %ymm1
1672 ; GFNIAVX2-NEXT: vpackusdw %ymm3, %ymm1, %ymm1
1673 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
1674 ; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
1675 ; GFNIAVX2-NEXT: retq
1677 ; GFNIAVX512VL-LABEL: var_fshl_v64i8:
1678 ; GFNIAVX512VL: # %bb.0:
1679 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
1680 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
1681 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm3
1682 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
1683 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm6
1684 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1685 ; GFNIAVX512VL-NEXT: vpandq %zmm7, %zmm2, %zmm2
1686 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm8
1687 ; GFNIAVX512VL-NEXT: vpxor %ymm7, %ymm8, %ymm9
1688 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm9, %ymm9
1689 ; GFNIAVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm3
1690 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
1691 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm3, %ymm10
1692 ; GFNIAVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
1693 ; GFNIAVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm3, %ymm3
1694 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm10
1695 ; GFNIAVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
1696 ; GFNIAVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm3, %ymm3
1697 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1
1698 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm5
1699 ; GFNIAVX512VL-NEXT: vpxor %ymm7, %ymm2, %ymm7
1700 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
1701 ; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
1702 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm1, %ymm5
1703 ; GFNIAVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm6
1704 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
1705 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm4
1706 ; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm5
1707 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
1708 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
1709 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1710 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
1711 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5
1712 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm8, %ymm6
1713 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm3, %ymm3
1714 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
1715 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm7
1716 ; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
1717 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm3, %ymm3
1718 ; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm7
1719 ; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
1720 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm3, %ymm3
1721 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm4
1722 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
1723 ; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
1724 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm4
1725 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1726 ; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
1727 ; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
1728 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1729 ; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
1730 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
1731 ; GFNIAVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
1732 ; GFNIAVX512VL-NEXT: retq
1734 ; GFNIAVX512BW-LABEL: var_fshl_v64i8:
1735 ; GFNIAVX512BW: # %bb.0:
1736 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
1737 ; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
1738 ; GFNIAVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
1739 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
1740 ; GFNIAVX512BW-NEXT: vpsllvw %zmm5, %zmm3, %zmm3
1741 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
1742 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
1743 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
1744 ; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
1745 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
1746 ; GFNIAVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
1747 ; GFNIAVX512BW-NEXT: retq
1748 %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt)
1752 define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
1753 ; GFNISSE-LABEL: var_fshr_v64i8:
1755 ; GFNISSE-NEXT: movdqa %xmm7, %xmm10
1756 ; GFNISSE-NEXT: movdqa %xmm6, %xmm7
1757 ; GFNISSE-NEXT: movdqa %xmm5, %xmm6
1758 ; GFNISSE-NEXT: movdqa %xmm4, %xmm5
1759 ; GFNISSE-NEXT: movdqa %xmm3, %xmm4
1760 ; GFNISSE-NEXT: movdqa %xmm2, %xmm3
1761 ; GFNISSE-NEXT: movdqa %xmm1, %xmm2
1762 ; GFNISSE-NEXT: movdqa %xmm0, %xmm1
1763 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
1764 ; GFNISSE-NEXT: movdqa %xmm5, %xmm12
1765 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12
1766 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1767 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1768 ; GFNISSE-NEXT: pand %xmm11, %xmm0
1769 ; GFNISSE-NEXT: psllw $5, %xmm0
1770 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm5
1771 ; GFNISSE-NEXT: movdqa %xmm5, %xmm13
1772 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm13
1773 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
1774 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm5
1775 ; GFNISSE-NEXT: movdqa %xmm5, %xmm14
1776 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14
1777 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
1778 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm5
1779 ; GFNISSE-NEXT: paddb %xmm1, %xmm1
1780 ; GFNISSE-NEXT: movdqa %xmm1, %xmm15
1781 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15
1782 ; GFNISSE-NEXT: movdqa %xmm11, %xmm12
1783 ; GFNISSE-NEXT: pandn %xmm11, %xmm9
1784 ; GFNISSE-NEXT: psllw $5, %xmm9
1785 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1786 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm15, %xmm1
1787 ; GFNISSE-NEXT: movdqa %xmm1, %xmm8
1788 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
1789 ; GFNISSE-NEXT: paddb %xmm9, %xmm9
1790 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1791 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1
1792 ; GFNISSE-NEXT: movdqa %xmm1, %xmm8
1793 ; GFNISSE-NEXT: paddb %xmm1, %xmm8
1794 ; GFNISSE-NEXT: paddb %xmm9, %xmm9
1795 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1796 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1
1797 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
1798 ; GFNISSE-NEXT: movdqa %xmm6, %xmm8
1799 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
1800 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm8
1801 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1802 ; GFNISSE-NEXT: pand %xmm12, %xmm0
1803 ; GFNISSE-NEXT: psllw $5, %xmm0
1804 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6
1805 ; GFNISSE-NEXT: movdqa %xmm6, %xmm8
1806 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
1807 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm8
1808 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
1809 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6
1810 ; GFNISSE-NEXT: movdqa %xmm6, %xmm8
1811 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm14 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
1812 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm14, %xmm8
1813 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
1814 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6
1815 ; GFNISSE-NEXT: paddb %xmm2, %xmm2
1816 ; GFNISSE-NEXT: movdqa %xmm2, %xmm8
1817 ; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm15 = [16909320,16909320]
1818 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm15, %xmm8
1819 ; GFNISSE-NEXT: pandn %xmm12, %xmm9
1820 ; GFNISSE-NEXT: psllw $5, %xmm9
1821 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1822 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2
1823 ; GFNISSE-NEXT: movdqa %xmm2, %xmm8
1824 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm0 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
1825 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm0, %xmm8
1826 ; GFNISSE-NEXT: paddb %xmm9, %xmm9
1827 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1828 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2
1829 ; GFNISSE-NEXT: movdqa %xmm2, %xmm8
1830 ; GFNISSE-NEXT: paddb %xmm2, %xmm8
1831 ; GFNISSE-NEXT: paddb %xmm9, %xmm9
1832 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1833 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2
1834 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
1835 ; GFNISSE-NEXT: movdqa %xmm7, %xmm8
1836 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm8
1837 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1838 ; GFNISSE-NEXT: pand %xmm12, %xmm0
1839 ; GFNISSE-NEXT: psllw $5, %xmm0
1840 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm7
1841 ; GFNISSE-NEXT: movdqa %xmm7, %xmm8
1842 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm8
1843 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
1844 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm7
1845 ; GFNISSE-NEXT: movdqa %xmm7, %xmm8
1846 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm14, %xmm8
1847 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
1848 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm7
1849 ; GFNISSE-NEXT: paddb %xmm3, %xmm3
1850 ; GFNISSE-NEXT: movdqa %xmm3, %xmm8
1851 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm15, %xmm8
1852 ; GFNISSE-NEXT: pandn %xmm12, %xmm9
1853 ; GFNISSE-NEXT: psllw $5, %xmm9
1854 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1855 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3
1856 ; GFNISSE-NEXT: movdqa %xmm3, %xmm8
1857 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
1858 ; GFNISSE-NEXT: paddb %xmm9, %xmm9
1859 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1860 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3
1861 ; GFNISSE-NEXT: movdqa %xmm3, %xmm8
1862 ; GFNISSE-NEXT: paddb %xmm3, %xmm8
1863 ; GFNISSE-NEXT: paddb %xmm9, %xmm9
1864 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1865 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3
1866 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
1867 ; GFNISSE-NEXT: movdqa %xmm10, %xmm8
1868 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm8
1869 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1870 ; GFNISSE-NEXT: pand %xmm12, %xmm0
1871 ; GFNISSE-NEXT: psllw $5, %xmm0
1872 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm10
1873 ; GFNISSE-NEXT: movdqa %xmm10, %xmm8
1874 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm8
1875 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
1876 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm10
1877 ; GFNISSE-NEXT: movdqa %xmm10, %xmm8
1878 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm14, %xmm8
1879 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
1880 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm10
1881 ; GFNISSE-NEXT: paddb %xmm4, %xmm4
1882 ; GFNISSE-NEXT: movdqa %xmm4, %xmm8
1883 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm15, %xmm8
1884 ; GFNISSE-NEXT: pandn %xmm12, %xmm9
1885 ; GFNISSE-NEXT: psllw $5, %xmm9
1886 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1887 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm4
1888 ; GFNISSE-NEXT: movdqa %xmm4, %xmm8
1889 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
1890 ; GFNISSE-NEXT: paddb %xmm9, %xmm9
1891 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1892 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm4
1893 ; GFNISSE-NEXT: movdqa %xmm4, %xmm8
1894 ; GFNISSE-NEXT: paddb %xmm4, %xmm8
1895 ; GFNISSE-NEXT: paddb %xmm9, %xmm9
1896 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0
1897 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm4
1898 ; GFNISSE-NEXT: por %xmm5, %xmm1
1899 ; GFNISSE-NEXT: por %xmm6, %xmm2
1900 ; GFNISSE-NEXT: por %xmm7, %xmm3
1901 ; GFNISSE-NEXT: por %xmm10, %xmm4
1902 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0
1903 ; GFNISSE-NEXT: movdqa %xmm2, %xmm1
1904 ; GFNISSE-NEXT: movdqa %xmm3, %xmm2
1905 ; GFNISSE-NEXT: movdqa %xmm4, %xmm3
1906 ; GFNISSE-NEXT: retq
1908 ; GFNIAVX1-LABEL: var_fshr_v64i8:
1909 ; GFNIAVX1: # %bb.0:
1910 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
1911 ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
1912 ; GFNIAVX1-NEXT: # xmm7 = mem[0,0]
1913 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm8, %xmm9
1914 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1915 ; GFNIAVX1-NEXT: vandps %ymm6, %ymm4, %ymm11
1916 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm11, %xmm10
1917 ; GFNIAVX1-NEXT: vpsllw $5, %xmm10, %xmm12
1918 ; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm9, %xmm8, %xmm8
1919 ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
1920 ; GFNIAVX1-NEXT: # xmm4 = mem[0,0]
1921 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm8, %xmm9
1922 ; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12
1923 ; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm9, %xmm8, %xmm9
1924 ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
1925 ; GFNIAVX1-NEXT: # xmm8 = mem[0,0]
1926 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm9, %xmm13
1927 ; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12
1928 ; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm13, %xmm9, %xmm12
1929 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
1930 ; GFNIAVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm13
1931 ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
1932 ; GFNIAVX1-NEXT: # xmm9 = mem[0,0]
1933 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm13, %xmm14
1934 ; GFNIAVX1-NEXT: vpxor %xmm6, %xmm10, %xmm10
1935 ; GFNIAVX1-NEXT: vpsllw $5, %xmm10, %xmm15
1936 ; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
1937 ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm10 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
1938 ; GFNIAVX1-NEXT: # xmm10 = mem[0,0]
1939 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm13, %xmm14
1940 ; GFNIAVX1-NEXT: vpaddb %xmm15, %xmm15, %xmm15
1941 ; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
1942 ; GFNIAVX1-NEXT: vpaddb %xmm13, %xmm13, %xmm14
1943 ; GFNIAVX1-NEXT: vpaddb %xmm15, %xmm15, %xmm15
1944 ; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
1945 ; GFNIAVX1-NEXT: vpor %xmm12, %xmm13, %xmm12
1946 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm13
1947 ; GFNIAVX1-NEXT: vpsllw $5, %xmm11, %xmm14
1948 ; GFNIAVX1-NEXT: vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
1949 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm13
1950 ; GFNIAVX1-NEXT: vpaddb %xmm14, %xmm14, %xmm14
1951 ; GFNIAVX1-NEXT: vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
1952 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm13
1953 ; GFNIAVX1-NEXT: vpaddb %xmm14, %xmm14, %xmm14
1954 ; GFNIAVX1-NEXT: vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
1955 ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1956 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm0, %xmm13
1957 ; GFNIAVX1-NEXT: vpxor %xmm6, %xmm11, %xmm11
1958 ; GFNIAVX1-NEXT: vpsllw $5, %xmm11, %xmm11
1959 ; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm13, %xmm0, %xmm0
1960 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm0, %xmm13
1961 ; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11
1962 ; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm13, %xmm0, %xmm0
1963 ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm13
1964 ; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11
1965 ; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm13, %xmm0, %xmm0
1966 ; GFNIAVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1967 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0
1968 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm11
1969 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm11, %xmm12
1970 ; GFNIAVX1-NEXT: vandps %ymm6, %ymm5, %ymm2
1971 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1972 ; GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm13
1973 ; GFNIAVX1-NEXT: vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
1974 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm11, %xmm12
1975 ; GFNIAVX1-NEXT: vpaddb %xmm13, %xmm13, %xmm13
1976 ; GFNIAVX1-NEXT: vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
1977 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm11, %xmm12
1978 ; GFNIAVX1-NEXT: vpaddb %xmm13, %xmm13, %xmm13
1979 ; GFNIAVX1-NEXT: vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
1980 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm12
1981 ; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12
1982 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm12, %xmm13
1983 ; GFNIAVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
1984 ; GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm5
1985 ; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm13, %xmm12, %xmm12
1986 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm12, %xmm13
1987 ; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
1988 ; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm13, %xmm12, %xmm12
1989 ; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm13
1990 ; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
1991 ; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm13, %xmm12, %xmm5
1992 ; GFNIAVX1-NEXT: vpor %xmm5, %xmm11, %xmm5
1993 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm3, %xmm7
1994 ; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm11
1995 ; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm7, %xmm3, %xmm3
1996 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm3, %xmm4
1997 ; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm7
1998 ; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm4, %xmm3, %xmm3
1999 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm3, %xmm4
2000 ; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
2001 ; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm4, %xmm3, %xmm3
2002 ; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
2003 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm1, %xmm4
2004 ; GFNIAVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
2005 ; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
2006 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
2007 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm1, %xmm4
2008 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
2009 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
2010 ; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm4
2011 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
2012 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
2013 ; GFNIAVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
2014 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
2015 ; GFNIAVX1-NEXT: retq
2017 ; GFNIAVX2-LABEL: var_fshr_v64i8:
2018 ; GFNIAVX2: # %bb.0:
2019 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
2020 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm7, %ymm2, %ymm8
2021 ; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
2022 ; GFNIAVX2-NEXT: vpand %ymm6, %ymm4, %ymm9
2023 ; GFNIAVX2-NEXT: vpsllw $5, %ymm9, %ymm9
2024 ; GFNIAVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm2, %ymm2
2025 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
2026 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm2, %ymm10
2027 ; GFNIAVX2-NEXT: vpaddb %ymm9, %ymm9, %ymm9
2028 ; GFNIAVX2-NEXT: vpblendvb %ymm9, %ymm10, %ymm2, %ymm2
2029 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
2030 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11
2031 ; GFNIAVX2-NEXT: vpaddb %ymm9, %ymm9, %ymm9
2032 ; GFNIAVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm2, %ymm2
2033 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
2034 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
2035 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm0, %ymm11
2036 ; GFNIAVX2-NEXT: vpandn %ymm6, %ymm4, %ymm4
2037 ; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4
2038 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm11, %ymm0, %ymm0
2039 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm11 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
2040 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm11, %ymm0, %ymm12
2041 ; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
2042 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0
2043 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm12
2044 ; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
2045 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0
2046 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
2047 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm7, %ymm3, %ymm2
2048 ; GFNIAVX2-NEXT: vpand %ymm6, %ymm5, %ymm4
2049 ; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4
2050 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
2051 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm2, %ymm3
2052 ; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
2053 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
2054 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm10, %ymm2, %ymm3
2055 ; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
2056 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
2057 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
2058 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm1, %ymm3
2059 ; GFNIAVX2-NEXT: vpandn %ymm6, %ymm5, %ymm4
2060 ; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4
2061 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
2062 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm11, %ymm1, %ymm3
2063 ; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
2064 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
2065 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm3
2066 ; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
2067 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
2068 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
2069 ; GFNIAVX2-NEXT: retq
2071 ; GFNIAVX512VL-LABEL: var_fshr_v64i8:
2072 ; GFNIAVX512VL: # %bb.0:
2073 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
2074 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
2075 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5
2076 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
2077 ; GFNIAVX512VL-NEXT: vpandq %zmm6, %zmm2, %zmm2
2078 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm7
2079 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm7, %ymm8
2080 ; GFNIAVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm3, %ymm3
2081 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
2082 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm9
2083 ; GFNIAVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
2084 ; GFNIAVX512VL-NEXT: vpblendvb %ymm8, %ymm9, %ymm3, %ymm3
2085 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
2086 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm9, %ymm3, %ymm10
2087 ; GFNIAVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
2088 ; GFNIAVX512VL-NEXT: vpblendvb %ymm8, %ymm10, %ymm3, %ymm3
2089 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm4
2090 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm8
2091 ; GFNIAVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1
2092 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm4
2093 ; GFNIAVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm5
2094 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
2095 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm9, %ymm1, %ymm4
2096 ; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
2097 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
2098 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
2099 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2100 ; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
2101 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
2102 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5
2103 ; GFNIAVX512VL-NEXT: vpxor %ymm6, %ymm7, %ymm7
2104 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
2105 ; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
2106 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
2107 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm8
2108 ; GFNIAVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
2109 ; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm8, %ymm3, %ymm3
2110 ; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8
2111 ; GFNIAVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
2112 ; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm8, %ymm3, %ymm3
2113 ; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
2114 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm4
2115 ; GFNIAVX512VL-NEXT: vpxor %ymm6, %ymm2, %ymm2
2116 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
2117 ; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
2118 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm4
2119 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
2120 ; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
2121 ; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
2122 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
2123 ; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
2124 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
2125 ; GFNIAVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
2126 ; GFNIAVX512VL-NEXT: retq
2128 ; GFNIAVX512BW-LABEL: var_fshr_v64i8:
2129 ; GFNIAVX512BW: # %bb.0:
2130 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
2131 ; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
2132 ; GFNIAVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
2133 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
2134 ; GFNIAVX512BW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3
2135 ; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2136 ; GFNIAVX512BW-NEXT: vpandq %zmm5, %zmm3, %zmm3
2137 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
2138 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
2139 ; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
2140 ; GFNIAVX512BW-NEXT: vpandq %zmm5, %zmm0, %zmm0
2141 ; GFNIAVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
2142 ; GFNIAVX512BW-NEXT: retq
2143 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt)
; splatvar_fshl_v64i8: funnel-shift-left of <64 x i8> %a/%b where every lane
; uses the same shift amount. Lane 0 of %amt is broadcast with a shufflevector
; whose mask is all zeros, and the splat feeds @llvm.fshl.v64i8.
; The second shufflevector operand is never selected by the all-zero mask, so
; it is written as poison rather than the deprecated undef placeholder.
; The CHECK lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2147 define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
2148 ; GFNISSE-LABEL: splatvar_fshl_v64i8:
2150 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
2151 ; GFNISSE-NEXT: movdqa %xmm4, %xmm9
2152 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
2153 ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
2154 ; GFNISSE-NEXT: psllw %xmm8, %xmm9
2155 ; GFNISSE-NEXT: psrlw $8, %xmm9
2156 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2157 ; GFNISSE-NEXT: psllw %xmm8, %xmm4
2158 ; GFNISSE-NEXT: psrlw $8, %xmm4
2159 ; GFNISSE-NEXT: packuswb %xmm9, %xmm4
2160 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0
2161 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2162 ; GFNISSE-NEXT: psllw %xmm8, %xmm0
2163 ; GFNISSE-NEXT: psrlw $8, %xmm0
2164 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
2165 ; GFNISSE-NEXT: psllw %xmm8, %xmm5
2166 ; GFNISSE-NEXT: psrlw $8, %xmm5
2167 ; GFNISSE-NEXT: packuswb %xmm0, %xmm5
2168 ; GFNISSE-NEXT: movdqa %xmm6, %xmm0
2169 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2170 ; GFNISSE-NEXT: psllw %xmm8, %xmm0
2171 ; GFNISSE-NEXT: psrlw $8, %xmm0
2172 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
2173 ; GFNISSE-NEXT: psllw %xmm8, %xmm6
2174 ; GFNISSE-NEXT: psrlw $8, %xmm6
2175 ; GFNISSE-NEXT: packuswb %xmm0, %xmm6
2176 ; GFNISSE-NEXT: movdqa %xmm7, %xmm0
2177 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2178 ; GFNISSE-NEXT: psllw %xmm8, %xmm0
2179 ; GFNISSE-NEXT: psrlw $8, %xmm0
2180 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
2181 ; GFNISSE-NEXT: psllw %xmm8, %xmm7
2182 ; GFNISSE-NEXT: psrlw $8, %xmm7
2183 ; GFNISSE-NEXT: packuswb %xmm0, %xmm7
2184 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0
2185 ; GFNISSE-NEXT: movdqa %xmm5, %xmm1
2186 ; GFNISSE-NEXT: movdqa %xmm6, %xmm2
2187 ; GFNISSE-NEXT: movdqa %xmm7, %xmm3
2188 ; GFNISSE-NEXT: retq
2190 ; GFNIAVX1-LABEL: splatvar_fshl_v64i8:
2191 ; GFNIAVX1: # %bb.0:
2192 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
2193 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
2194 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
2195 ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
2196 ; GFNIAVX1-NEXT: vpsllw %xmm4, %xmm7, %xmm7
2197 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
2198 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
2199 ; GFNIAVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5
2200 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
2201 ; GFNIAVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm5
2202 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2203 ; GFNIAVX1-NEXT: vpsllw %xmm4, %xmm6, %xmm6
2204 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
2205 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2206 ; GFNIAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
2207 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
2208 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
2209 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
2210 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2211 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
2212 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
2213 ; GFNIAVX1-NEXT: vpsllw %xmm4, %xmm6, %xmm6
2214 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
2215 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
2216 ; GFNIAVX1-NEXT: vpsllw %xmm4, %xmm2, %xmm2
2217 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2218 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
2219 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
2220 ; GFNIAVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5
2221 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
2222 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
2223 ; GFNIAVX1-NEXT: vpsllw %xmm4, %xmm1, %xmm1
2224 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2225 ; GFNIAVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
2226 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2227 ; GFNIAVX1-NEXT: retq
2229 ; GFNIAVX2-LABEL: splatvar_fshl_v64i8:
2230 ; GFNIAVX2: # %bb.0:
2231 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
2232 ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
2233 ; GFNIAVX2-NEXT: vpsllw %xmm4, %ymm5, %ymm5
2234 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm5, %ymm5
2235 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2236 ; GFNIAVX2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
2237 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
2238 ; GFNIAVX2-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
2239 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
2240 ; GFNIAVX2-NEXT: vpsllw %xmm4, %ymm2, %ymm2
2241 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
2242 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
2243 ; GFNIAVX2-NEXT: vpsllw %xmm4, %ymm1, %ymm1
2244 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2245 ; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
2246 ; GFNIAVX2-NEXT: retq
2248 ; GFNIAVX512VL-LABEL: splatvar_fshl_v64i8:
2249 ; GFNIAVX512VL: # %bb.0:
2250 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2251 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2252 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
2253 ; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2254 ; GFNIAVX512VL-NEXT: vpsllw %xmm2, %ymm5, %ymm5
2255 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
2256 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
2257 ; GFNIAVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3
2258 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
2259 ; GFNIAVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
2260 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
2261 ; GFNIAVX512VL-NEXT: vpsllw %xmm2, %ymm4, %ymm4
2262 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
2263 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
2264 ; GFNIAVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0
2265 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
2266 ; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
2267 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
2268 ; GFNIAVX512VL-NEXT: retq
2270 ; GFNIAVX512BW-LABEL: splatvar_fshl_v64i8:
2271 ; GFNIAVX512BW: # %bb.0:
2272 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
2273 ; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2274 ; GFNIAVX512BW-NEXT: vpsllw %xmm2, %zmm3, %zmm3
2275 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
2276 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
2277 ; GFNIAVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
2278 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
2279 ; GFNIAVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
2280 ; GFNIAVX512BW-NEXT: retq
2281 %splat = shufflevector <64 x i8> %amt, <64 x i8> poison, <64 x i32> zeroinitializer
2282 %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %splat)
; splatvar_fshr_v64i8: funnel-shift-right of two <64 x i8> vectors in which
; every lane uses the same, non-constant shift amount. Lane 0 of %amt is
; broadcast to all 64 lanes and the result is fed to @llvm.fshr.v64i8 with %a
; and %b as the first and second operands.
;
; The per-target assertion lines below (one group per RUN configuration at the
; top of the file: SSE4.2, AVX, AVX2, AVX512VL and AVX512BW+VL, all with GFNI)
; are autogenerated. Regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
2286 define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
2287 ; GFNISSE-LABEL: splatvar_fshr_v64i8:
2289 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
2290 ; GFNISSE-NEXT: movdqa %xmm4, %xmm10
2291 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
2292 ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9
2293 ; GFNISSE-NEXT: psrlw %xmm9, %xmm10
2294 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
2295 ; GFNISSE-NEXT: pand %xmm8, %xmm10
2296 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2297 ; GFNISSE-NEXT: psrlw %xmm9, %xmm4
2298 ; GFNISSE-NEXT: pand %xmm8, %xmm4
2299 ; GFNISSE-NEXT: packuswb %xmm10, %xmm4
2300 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0
2301 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2302 ; GFNISSE-NEXT: psrlw %xmm9, %xmm0
2303 ; GFNISSE-NEXT: pand %xmm8, %xmm0
2304 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
2305 ; GFNISSE-NEXT: psrlw %xmm9, %xmm5
2306 ; GFNISSE-NEXT: pand %xmm8, %xmm5
2307 ; GFNISSE-NEXT: packuswb %xmm0, %xmm5
2308 ; GFNISSE-NEXT: movdqa %xmm6, %xmm0
2309 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2310 ; GFNISSE-NEXT: psrlw %xmm9, %xmm0
2311 ; GFNISSE-NEXT: pand %xmm8, %xmm0
2312 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
2313 ; GFNISSE-NEXT: psrlw %xmm9, %xmm6
2314 ; GFNISSE-NEXT: pand %xmm8, %xmm6
2315 ; GFNISSE-NEXT: packuswb %xmm0, %xmm6
2316 ; GFNISSE-NEXT: movdqa %xmm7, %xmm0
2317 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2318 ; GFNISSE-NEXT: psrlw %xmm9, %xmm0
2319 ; GFNISSE-NEXT: pand %xmm8, %xmm0
2320 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
2321 ; GFNISSE-NEXT: psrlw %xmm9, %xmm7
2322 ; GFNISSE-NEXT: pand %xmm7, %xmm8
2323 ; GFNISSE-NEXT: packuswb %xmm0, %xmm8
2324 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0
2325 ; GFNISSE-NEXT: movdqa %xmm5, %xmm1
2326 ; GFNISSE-NEXT: movdqa %xmm6, %xmm2
2327 ; GFNISSE-NEXT: movdqa %xmm8, %xmm3
2328 ; GFNISSE-NEXT: retq
2330 ; GFNIAVX1-LABEL: splatvar_fshr_v64i8:
2331 ; GFNIAVX1: # %bb.0:
2332 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
2333 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
2334 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
2335 ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
2336 ; GFNIAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm8
2337 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
2338 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm8, %xmm8
2339 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
2340 ; GFNIAVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm6
2341 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
2342 ; GFNIAVX1-NEXT: vpackuswb %xmm8, %xmm6, %xmm6
2343 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2344 ; GFNIAVX1-NEXT: vpsrlw %xmm4, %xmm7, %xmm7
2345 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm7, %xmm7
2346 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2347 ; GFNIAVX1-NEXT: vpsrlw %xmm4, %xmm0, %xmm0
2348 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
2349 ; GFNIAVX1-NEXT: vpackuswb %xmm7, %xmm0, %xmm0
2350 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
2351 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2352 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
2353 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
2354 ; GFNIAVX1-NEXT: vpsrlw %xmm4, %xmm7, %xmm7
2355 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm7, %xmm7
2356 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
2357 ; GFNIAVX1-NEXT: vpsrlw %xmm4, %xmm2, %xmm2
2358 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
2359 ; GFNIAVX1-NEXT: vpackuswb %xmm7, %xmm2, %xmm2
2360 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
2361 ; GFNIAVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm6
2362 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
2363 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
2364 ; GFNIAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
2365 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
2366 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1
2367 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2368 ; GFNIAVX1-NEXT: retq
2370 ; GFNIAVX2-LABEL: splatvar_fshr_v64i8:
2371 ; GFNIAVX2: # %bb.0:
2372 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
2373 ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
2374 ; GFNIAVX2-NEXT: vpsrlw %xmm4, %ymm5, %ymm5
2375 ; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2376 ; GFNIAVX2-NEXT: vpand %ymm6, %ymm5, %ymm5
2377 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2378 ; GFNIAVX2-NEXT: vpsrlw %xmm4, %ymm0, %ymm0
2379 ; GFNIAVX2-NEXT: vpand %ymm6, %ymm0, %ymm0
2380 ; GFNIAVX2-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
2381 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
2382 ; GFNIAVX2-NEXT: vpsrlw %xmm4, %ymm2, %ymm2
2383 ; GFNIAVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
2384 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
2385 ; GFNIAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
2386 ; GFNIAVX2-NEXT: vpand %ymm6, %ymm1, %ymm1
2387 ; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
2388 ; GFNIAVX2-NEXT: retq
2390 ; GFNIAVX512VL-LABEL: splatvar_fshr_v64i8:
2391 ; GFNIAVX512VL: # %bb.0:
2392 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2393 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2394 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
2395 ; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2396 ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
2397 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2398 ; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
2399 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
2400 ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
2401 ; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
2402 ; GFNIAVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
2403 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
2404 ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm4, %ymm4
2405 ; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
2406 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
2407 ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
2408 ; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm0, %ymm0
2409 ; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
2410 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
2411 ; GFNIAVX512VL-NEXT: retq
2413 ; GFNIAVX512BW-LABEL: splatvar_fshr_v64i8:
2414 ; GFNIAVX512BW: # %bb.0:
2415 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
2416 ; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2417 ; GFNIAVX512BW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3
2418 ; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2419 ; GFNIAVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
2420 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
2421 ; GFNIAVX512BW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0
2422 ; GFNIAVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0
2423 ; GFNIAVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
2424 ; GFNIAVX512BW-NEXT: retq
; An all-zero shuffle mask selects lane 0 of %amt for every result lane, so the
; second shuffle operand is never read.
2425 %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
2426 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %splat)
; constant_fshl_v64i8: funnel-shift-left of two <64 x i8> vectors by a
; compile-time constant, per-lane shift amount. The amount vector passed to
; @llvm.fshl.v64i8 repeats the 16-byte pattern
;   0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1
; four times.
; NOTE(review): the amount 8 equals the element bit width; presumably it is
; reduced modulo 8 as the funnel-shift intrinsics specify, which would agree
; with the leading 1 in the [1,128,64,32,16,8,4,2] multiplier table expected
; below. Confirm against the LangRef.
;
; The per-target assertion lines below (one group per RUN configuration at the
; top of the file: SSE4.2, AVX, AVX2, AVX512VL and AVX512BW+VL, all with GFNI)
; are autogenerated. Regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
2430 define <64 x i8> @constant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
2431 ; GFNISSE-LABEL: constant_fshl_v64i8:
2433 ; GFNISSE-NEXT: movdqa %xmm4, %xmm10
2434 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
2435 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm9 = [1,128,64,32,16,8,4,2]
2436 ; GFNISSE-NEXT: pmullw %xmm9, %xmm10
2437 ; GFNISSE-NEXT: psrlw $8, %xmm10
2438 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2439 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = [1,2,4,8,16,32,64,128]
2440 ; GFNISSE-NEXT: pmullw %xmm8, %xmm4
2441 ; GFNISSE-NEXT: psrlw $8, %xmm4
2442 ; GFNISSE-NEXT: packuswb %xmm10, %xmm4
2443 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0
2444 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2445 ; GFNISSE-NEXT: pmullw %xmm9, %xmm0
2446 ; GFNISSE-NEXT: psrlw $8, %xmm0
2447 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
2448 ; GFNISSE-NEXT: pmullw %xmm8, %xmm5
2449 ; GFNISSE-NEXT: psrlw $8, %xmm5
2450 ; GFNISSE-NEXT: packuswb %xmm0, %xmm5
2451 ; GFNISSE-NEXT: movdqa %xmm6, %xmm0
2452 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2453 ; GFNISSE-NEXT: pmullw %xmm9, %xmm0
2454 ; GFNISSE-NEXT: psrlw $8, %xmm0
2455 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
2456 ; GFNISSE-NEXT: pmullw %xmm8, %xmm6
2457 ; GFNISSE-NEXT: psrlw $8, %xmm6
2458 ; GFNISSE-NEXT: packuswb %xmm0, %xmm6
2459 ; GFNISSE-NEXT: movdqa %xmm7, %xmm0
2460 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2461 ; GFNISSE-NEXT: pmullw %xmm9, %xmm0
2462 ; GFNISSE-NEXT: psrlw $8, %xmm0
2463 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
2464 ; GFNISSE-NEXT: pmullw %xmm7, %xmm8
2465 ; GFNISSE-NEXT: psrlw $8, %xmm8
2466 ; GFNISSE-NEXT: packuswb %xmm0, %xmm8
2467 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0
2468 ; GFNISSE-NEXT: movdqa %xmm5, %xmm1
2469 ; GFNISSE-NEXT: movdqa %xmm6, %xmm2
2470 ; GFNISSE-NEXT: movdqa %xmm8, %xmm3
2471 ; GFNISSE-NEXT: retq
2473 ; GFNIAVX1-LABEL: constant_fshl_v64i8:
2474 ; GFNIAVX1: # %bb.0:
2475 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
2476 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
2477 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
2478 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
2479 ; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
2480 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
2481 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2482 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
2483 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
2484 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
2485 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
2486 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2487 ; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
2488 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
2489 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2490 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
2491 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
2492 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
2493 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
2494 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2495 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
2496 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
2497 ; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
2498 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
2499 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2500 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2
2501 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2502 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
2503 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
2504 ; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
2505 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
2506 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
2507 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
2508 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2509 ; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
2510 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2511 ; GFNIAVX1-NEXT: retq
2513 ; GFNIAVX2-LABEL: constant_fshl_v64i8:
2514 ; GFNIAVX2: # %bb.0:
2515 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
2516 ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
2517 ; GFNIAVX2-NEXT: # ymm5 = mem[0,1,0,1]
2518 ; GFNIAVX2-NEXT: vpmullw %ymm5, %ymm4, %ymm4
2519 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
2520 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2521 ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2522 ; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
2523 ; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2524 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
2525 ; GFNIAVX2-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
2526 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
2527 ; GFNIAVX2-NEXT: vpmullw %ymm5, %ymm4, %ymm4
2528 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
2529 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
2530 ; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
2531 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2532 ; GFNIAVX2-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
2533 ; GFNIAVX2-NEXT: retq
2535 ; GFNIAVX512VL-LABEL: constant_fshl_v64i8:
2536 ; GFNIAVX512VL: # %bb.0:
2537 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2538 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
2539 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15],ymm3[24],ymm2[24],ymm3[25],ymm2[25],ymm3[26],ymm2[26],ymm3[27],ymm2[27],ymm3[28],ymm2[28],ymm3[29],ymm2[29],ymm3[30],ymm2[30],ymm3[31],ymm2[31]
2540 ; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
2541 ; GFNIAVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
2542 ; GFNIAVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
2543 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
2544 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23]
2545 ; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2546 ; GFNIAVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
2547 ; GFNIAVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
2548 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
2549 ; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
2550 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
2551 ; GFNIAVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
2552 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
2553 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
2554 ; GFNIAVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2555 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
2556 ; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
2557 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
2558 ; GFNIAVX512VL-NEXT: retq
2560 ; GFNIAVX512BW-LABEL: constant_fshl_v64i8:
2561 ; GFNIAVX512BW: # %bb.0:
2562 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
2563 ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
2564 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
2565 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
2566 ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2567 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
2568 ; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
2569 ; GFNIAVX512BW-NEXT: retq
; Per-lane constant shift amounts: the 16-byte pattern 0..8 then 7..1, repeated
; for each of the four 128-bit lanes.
2570 %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2574 define <64 x i8> @constant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
2575 ; GFNISSE-LABEL: constant_fshr_v64i8:
2577 ; GFNISSE-NEXT: movdqa %xmm4, %xmm10
2578 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
2579 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm9 = [1,128,64,32,16,8,4,2]
2580 ; GFNISSE-NEXT: pmullw %xmm9, %xmm10
2581 ; GFNISSE-NEXT: psrlw $8, %xmm10
2582 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2583 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = [1,2,4,8,16,32,64,128]
2584 ; GFNISSE-NEXT: pmullw %xmm8, %xmm4
2585 ; GFNISSE-NEXT: psrlw $8, %xmm4
2586 ; GFNISSE-NEXT: packuswb %xmm10, %xmm4
2587 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0
2588 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2589 ; GFNISSE-NEXT: pmullw %xmm9, %xmm0
2590 ; GFNISSE-NEXT: psrlw $8, %xmm0
2591 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
2592 ; GFNISSE-NEXT: pmullw %xmm8, %xmm5
2593 ; GFNISSE-NEXT: psrlw $8, %xmm5
2594 ; GFNISSE-NEXT: packuswb %xmm0, %xmm5
2595 ; GFNISSE-NEXT: movdqa %xmm6, %xmm0
2596 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2597 ; GFNISSE-NEXT: pmullw %xmm9, %xmm0
2598 ; GFNISSE-NEXT: psrlw $8, %xmm0
2599 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
2600 ; GFNISSE-NEXT: pmullw %xmm8, %xmm6
2601 ; GFNISSE-NEXT: psrlw $8, %xmm6
2602 ; GFNISSE-NEXT: packuswb %xmm0, %xmm6
2603 ; GFNISSE-NEXT: movdqa %xmm7, %xmm0
2604 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2605 ; GFNISSE-NEXT: pmullw %xmm9, %xmm0
2606 ; GFNISSE-NEXT: psrlw $8, %xmm0
2607 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
2608 ; GFNISSE-NEXT: pmullw %xmm7, %xmm8
2609 ; GFNISSE-NEXT: psrlw $8, %xmm8
2610 ; GFNISSE-NEXT: packuswb %xmm0, %xmm8
2611 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0
2612 ; GFNISSE-NEXT: movdqa %xmm5, %xmm1
2613 ; GFNISSE-NEXT: movdqa %xmm6, %xmm2
2614 ; GFNISSE-NEXT: movdqa %xmm8, %xmm3
2615 ; GFNISSE-NEXT: retq
2617 ; GFNIAVX1-LABEL: constant_fshr_v64i8:
2618 ; GFNIAVX1: # %bb.0:
2619 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
2620 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
2621 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
2622 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
2623 ; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
2624 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
2625 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2626 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
2627 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
2628 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
2629 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
2630 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2631 ; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
2632 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
2633 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2634 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
2635 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
2636 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
2637 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
2638 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2639 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
2640 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
2641 ; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
2642 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
2643 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2644 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2
2645 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2646 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
2647 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
2648 ; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
2649 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
2650 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
2651 ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
2652 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2653 ; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
2654 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2655 ; GFNIAVX1-NEXT: retq
2657 ; GFNIAVX2-LABEL: constant_fshr_v64i8:
2658 ; GFNIAVX2: # %bb.0:
2659 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
2660 ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
2661 ; GFNIAVX2-NEXT: # ymm5 = mem[0,1,0,1]
2662 ; GFNIAVX2-NEXT: vpmullw %ymm5, %ymm4, %ymm4
2663 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
2664 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2665 ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2666 ; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
2667 ; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2668 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
2669 ; GFNIAVX2-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
2670 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
2671 ; GFNIAVX2-NEXT: vpmullw %ymm5, %ymm4, %ymm4
2672 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
2673 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
2674 ; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
2675 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2676 ; GFNIAVX2-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
2677 ; GFNIAVX2-NEXT: retq
2679 ; GFNIAVX512VL-LABEL: constant_fshr_v64i8:
2680 ; GFNIAVX512VL: # %bb.0:
2681 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2682 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
2683 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15],ymm3[24],ymm2[24],ymm3[25],ymm2[25],ymm3[26],ymm2[26],ymm3[27],ymm2[27],ymm3[28],ymm2[28],ymm3[29],ymm2[29],ymm3[30],ymm2[30],ymm3[31],ymm2[31]
2684 ; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
2685 ; GFNIAVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
2686 ; GFNIAVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
2687 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
2688 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23]
2689 ; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2690 ; GFNIAVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
2691 ; GFNIAVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
2692 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
2693 ; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
2694 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
2695 ; GFNIAVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
2696 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
2697 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
2698 ; GFNIAVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2699 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
2700 ; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
2701 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
2702 ; GFNIAVX512VL-NEXT: retq
2704 ; GFNIAVX512BW-LABEL: constant_fshr_v64i8:
2705 ; GFNIAVX512BW: # %bb.0:
2706 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
2707 ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
2708 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
2709 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
2710 ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2711 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
2712 ; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
2713 ; GFNIAVX512BW-NEXT: retq
2714 %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2718 define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
2719 ; GFNISSE-LABEL: splatconstant_fshl_v64i8:
2721 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
2722 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm4
2723 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
2724 ; GFNISSE-NEXT: por %xmm4, %xmm0
2725 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm5
2726 ; GFNISSE-NEXT: paddb %xmm1, %xmm1
2727 ; GFNISSE-NEXT: por %xmm5, %xmm1
2728 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm6
2729 ; GFNISSE-NEXT: paddb %xmm2, %xmm2
2730 ; GFNISSE-NEXT: por %xmm6, %xmm2
2731 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm7
2732 ; GFNISSE-NEXT: paddb %xmm3, %xmm3
2733 ; GFNISSE-NEXT: por %xmm7, %xmm3
2734 ; GFNISSE-NEXT: retq
2736 ; GFNIAVX1-LABEL: splatconstant_fshl_v64i8:
2737 ; GFNIAVX1: # %bb.0:
2738 ; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
2739 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2
2740 ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm5
2741 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2742 ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2743 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
2744 ; GFNIAVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
2745 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2
2746 ; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm3
2747 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2748 ; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
2749 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
2750 ; GFNIAVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
2751 ; GFNIAVX1-NEXT: retq
2753 ; GFNIAVX2-LABEL: splatconstant_fshl_v64i8:
2754 ; GFNIAVX2: # %bb.0:
2755 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
2756 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2
2757 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
2758 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
2759 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2
2760 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
2761 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
2762 ; GFNIAVX2-NEXT: retq
2764 ; GFNIAVX512VL-LABEL: splatconstant_fshl_v64i8:
2765 ; GFNIAVX512VL: # %bb.0:
2766 ; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
2767 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2768 ; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
2769 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
2770 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
2771 ; GFNIAVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
2772 ; GFNIAVX512VL-NEXT: retq
2774 ; GFNIAVX512BW-LABEL: splatconstant_fshl_v64i8:
2775 ; GFNIAVX512BW: # %bb.0:
2776 ; GFNIAVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm2
2777 ; GFNIAVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm0
2778 ; GFNIAVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
2779 ; GFNIAVX512BW-NEXT: retq
2780 %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
; Declaration of the <64 x i8> funnel-shift-left intrinsic called by the
; 512-bit fshl tests above (two data operands plus a per-lane shift amount).
2783 declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
2785 define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
2786 ; GFNISSE-LABEL: splatconstant_fshr_v64i8:
2788 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
2789 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm4
2790 ; GFNISSE-NEXT: pmovsxwq {{.*#+}} xmm9 = [258,258]
2791 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0
2792 ; GFNISSE-NEXT: por %xmm4, %xmm0
2793 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm5
2794 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm1
2795 ; GFNISSE-NEXT: por %xmm5, %xmm1
2796 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm6
2797 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm2
2798 ; GFNISSE-NEXT: por %xmm6, %xmm2
2799 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm7
2800 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm3
2801 ; GFNISSE-NEXT: por %xmm7, %xmm3
2802 ; GFNISSE-NEXT: retq
2804 ; GFNIAVX1-LABEL: splatconstant_fshr_v64i8:
2805 ; GFNIAVX1: # %bb.0:
2806 ; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
2807 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2
2808 ; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0]
2809 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm0
2810 ; GFNIAVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
2811 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2
2812 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm1
2813 ; GFNIAVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
2814 ; GFNIAVX1-NEXT: retq
2816 ; GFNIAVX2-LABEL: splatconstant_fshr_v64i8:
2817 ; GFNIAVX2: # %bb.0:
2818 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
2819 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2
2820 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0]
2821 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm0
2822 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
2823 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2
2824 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm1
2825 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
2826 ; GFNIAVX2-NEXT: retq
2828 ; GFNIAVX512VL-LABEL: splatconstant_fshr_v64i8:
2829 ; GFNIAVX512VL: # %bb.0:
2830 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
2831 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2832 ; GFNIAVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
2833 ; GFNIAVX512VL-NEXT: retq
2835 ; GFNIAVX512BW-LABEL: splatconstant_fshr_v64i8:
2836 ; GFNIAVX512BW: # %bb.0:
2837 ; GFNIAVX512BW-NEXT: vpsllw $6, %zmm0, %zmm2
2838 ; GFNIAVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm0
2839 ; GFNIAVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
2840 ; GFNIAVX512BW-NEXT: retq
2841 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>)
; Declaration of the <64 x i8> funnel-shift-right intrinsic called by
; splatconstant_fshr_v64i8 above (two data operands plus a per-lane shift
; amount).
2844 declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)