1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK8
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK9
13 define void @vec16_v2i8_to_v1i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
14 ; SSE2-LABEL: vec16_v2i8_to_v1i16_factor2:
16 ; SSE2-NEXT: movdqa (%rdi), %xmm0
17 ; SSE2-NEXT: paddb (%rsi), %xmm0
18 ; SSE2-NEXT: pxor %xmm1, %xmm1
19 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
20 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
21 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
22 ; SSE2-NEXT: paddb (%rdx), %xmm0
23 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
26 ; SSE42-LABEL: vec16_v2i8_to_v1i16_factor2:
28 ; SSE42-NEXT: movdqa (%rdi), %xmm0
29 ; SSE42-NEXT: paddb (%rsi), %xmm0
30 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
31 ; SSE42-NEXT: paddb (%rdx), %xmm0
32 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
35 ; AVX-LABEL: vec16_v2i8_to_v1i16_factor2:
37 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
38 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
39 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
40 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
41 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
44 ; AVX2-LABEL: vec16_v2i8_to_v1i16_factor2:
46 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
47 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
48 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
49 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
50 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
51 ; AVX2-NEXT: vzeroupper
54 ; AVX512F-LABEL: vec16_v2i8_to_v1i16_factor2:
56 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
57 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
58 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
59 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
60 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
61 ; AVX512F-NEXT: vzeroupper
64 ; AVX512BW-LABEL: vec16_v2i8_to_v1i16_factor2:
66 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
67 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
68 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
69 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
70 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
71 ; AVX512BW-NEXT: vzeroupper
73 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
74 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
75 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
76 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <2 x i32> <i32 0, i32 1>
77 %zextd.vec = shufflevector <2 x i8> %in.vec.trunc, <2 x i8> zeroinitializer, <2 x i32> <i32 0, i32 3>
78 %out.bytevec.padded = shufflevector <2 x i8> %zextd.vec, <2 x i8> poison, <64 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
79 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
80 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
81 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
85 define void @vec32_v4i8_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
86 ; SSE2-LABEL: vec32_v4i8_to_v2i16_factor2:
88 ; SSE2-NEXT: movdqa (%rdi), %xmm0
89 ; SSE2-NEXT: paddb (%rsi), %xmm0
90 ; SSE2-NEXT: pxor %xmm1, %xmm1
91 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
92 ; SSE2-NEXT: paddb (%rdx), %xmm0
93 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
96 ; SSE42-LABEL: vec32_v4i8_to_v2i16_factor2:
98 ; SSE42-NEXT: movdqa (%rdi), %xmm0
99 ; SSE42-NEXT: paddb (%rsi), %xmm0
100 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
101 ; SSE42-NEXT: paddb (%rdx), %xmm0
102 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
105 ; AVX-LABEL: vec32_v4i8_to_v2i16_factor2:
107 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
108 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
109 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
110 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
111 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
114 ; AVX2-LABEL: vec32_v4i8_to_v2i16_factor2:
116 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
117 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
118 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
119 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
120 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
121 ; AVX2-NEXT: vzeroupper
124 ; AVX512F-LABEL: vec32_v4i8_to_v2i16_factor2:
126 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
127 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
128 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
129 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
130 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
131 ; AVX512F-NEXT: vzeroupper
134 ; AVX512BW-LABEL: vec32_v4i8_to_v2i16_factor2:
136 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
137 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
138 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
139 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
140 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
141 ; AVX512BW-NEXT: vzeroupper
142 ; AVX512BW-NEXT: retq
143 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
144 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
145 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
146 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
147 %zextd.vec = shufflevector <4 x i8> %in.vec.trunc, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
148 %out.bytevec.padded = shufflevector <4 x i8> %zextd.vec, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
149 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
150 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
151 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
155 define void @vec32_v4i8_to_v1i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
156 ; SSE2-LABEL: vec32_v4i8_to_v1i32_factor4:
158 ; SSE2-NEXT: movdqa (%rdi), %xmm0
159 ; SSE2-NEXT: paddb (%rsi), %xmm0
160 ; SSE2-NEXT: pxor %xmm1, %xmm1
161 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
162 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
163 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
164 ; SSE2-NEXT: paddb (%rdx), %xmm0
165 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
168 ; SSE42-LABEL: vec32_v4i8_to_v1i32_factor4:
170 ; SSE42-NEXT: movdqa (%rdi), %xmm0
171 ; SSE42-NEXT: paddb (%rsi), %xmm0
172 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
173 ; SSE42-NEXT: paddb (%rdx), %xmm0
174 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
177 ; AVX-LABEL: vec32_v4i8_to_v1i32_factor4:
179 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
180 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
181 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
182 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
183 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
186 ; AVX2-LABEL: vec32_v4i8_to_v1i32_factor4:
188 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
189 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
190 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
191 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
192 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
193 ; AVX2-NEXT: vzeroupper
196 ; AVX512F-LABEL: vec32_v4i8_to_v1i32_factor4:
198 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
199 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
200 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
201 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
202 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
203 ; AVX512F-NEXT: vzeroupper
206 ; AVX512BW-LABEL: vec32_v4i8_to_v1i32_factor4:
208 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
209 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
210 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
211 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
212 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
213 ; AVX512BW-NEXT: vzeroupper
214 ; AVX512BW-NEXT: retq
215 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
216 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
217 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
218 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
219 %zextd.vec = shufflevector <4 x i8> %in.vec.trunc, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
220 %out.bytevec.padded = shufflevector <4 x i8> %zextd.vec, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
221 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
222 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
223 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
227 define void @vec32_v2i16_to_v1i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
228 ; SSE2-LABEL: vec32_v2i16_to_v1i32_factor2:
230 ; SSE2-NEXT: movdqa (%rdi), %xmm0
231 ; SSE2-NEXT: paddb (%rsi), %xmm0
232 ; SSE2-NEXT: pxor %xmm1, %xmm1
233 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
234 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
235 ; SSE2-NEXT: paddb (%rdx), %xmm0
236 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
239 ; SSE42-LABEL: vec32_v2i16_to_v1i32_factor2:
241 ; SSE42-NEXT: movdqa (%rdi), %xmm0
242 ; SSE42-NEXT: paddb (%rsi), %xmm0
243 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
244 ; SSE42-NEXT: paddb (%rdx), %xmm0
245 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
248 ; AVX-LABEL: vec32_v2i16_to_v1i32_factor2:
250 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
251 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
252 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
253 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
254 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
257 ; AVX2-LABEL: vec32_v2i16_to_v1i32_factor2:
259 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
260 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
261 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
262 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
263 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
264 ; AVX2-NEXT: vzeroupper
267 ; AVX512F-LABEL: vec32_v2i16_to_v1i32_factor2:
269 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
270 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
271 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
272 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
273 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
274 ; AVX512F-NEXT: vzeroupper
277 ; AVX512BW-LABEL: vec32_v2i16_to_v1i32_factor2:
279 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
280 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
281 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
282 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
283 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
284 ; AVX512BW-NEXT: vzeroupper
285 ; AVX512BW-NEXT: retq
286 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
287 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
288 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
289 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
290 %in.vec.cast = bitcast <4 x i8> %in.vec.trunc to <2 x i16>
291 %zextd.vec = shufflevector <2 x i16> %in.vec.cast, <2 x i16> zeroinitializer, <2 x i32> <i32 0, i32 3>
292 %out.bytevec = bitcast <2 x i16> %zextd.vec to <4 x i8>
293 %out.bytevec.padded = shufflevector <4 x i8> %out.bytevec, <4 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
294 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
295 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
296 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
300 define void @vec64_v8i8_to_v4i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
301 ; SSE2-LABEL: vec64_v8i8_to_v4i16_factor2:
303 ; SSE2-NEXT: movdqa (%rdi), %xmm0
304 ; SSE2-NEXT: paddb (%rsi), %xmm0
305 ; SSE2-NEXT: pxor %xmm1, %xmm1
306 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
307 ; SSE2-NEXT: paddb (%rdx), %xmm0
308 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
311 ; SSE42-LABEL: vec64_v8i8_to_v4i16_factor2:
313 ; SSE42-NEXT: movdqa (%rdi), %xmm0
314 ; SSE42-NEXT: paddb (%rsi), %xmm0
315 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
316 ; SSE42-NEXT: paddb (%rdx), %xmm0
317 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
320 ; AVX-LABEL: vec64_v8i8_to_v4i16_factor2:
322 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
323 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
324 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
325 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
326 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
329 ; AVX2-LABEL: vec64_v8i8_to_v4i16_factor2:
331 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
332 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
333 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
334 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
335 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
336 ; AVX2-NEXT: vzeroupper
339 ; AVX512F-LABEL: vec64_v8i8_to_v4i16_factor2:
341 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
342 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
343 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
344 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
345 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
346 ; AVX512F-NEXT: vzeroupper
349 ; AVX512BW-LABEL: vec64_v8i8_to_v4i16_factor2:
351 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
352 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
353 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
354 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
355 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
356 ; AVX512BW-NEXT: vzeroupper
357 ; AVX512BW-NEXT: retq
358 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
359 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
360 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
361 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
362 %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
363 %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
364 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
365 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
366 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
370 define void @vec64_v8i8_to_v2i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
371 ; SSE2-LABEL: vec64_v8i8_to_v2i32_factor4:
373 ; SSE2-NEXT: movdqa (%rdi), %xmm0
374 ; SSE2-NEXT: paddb (%rsi), %xmm0
375 ; SSE2-NEXT: pxor %xmm1, %xmm1
376 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
377 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
378 ; SSE2-NEXT: paddb (%rdx), %xmm0
379 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
382 ; SSE42-LABEL: vec64_v8i8_to_v2i32_factor4:
384 ; SSE42-NEXT: movdqa (%rdi), %xmm0
385 ; SSE42-NEXT: paddb (%rsi), %xmm0
386 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
387 ; SSE42-NEXT: paddb (%rdx), %xmm0
388 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
391 ; AVX-LABEL: vec64_v8i8_to_v2i32_factor4:
393 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
394 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
395 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
396 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
397 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
400 ; AVX2-LABEL: vec64_v8i8_to_v2i32_factor4:
402 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
403 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
404 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
405 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
406 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
407 ; AVX2-NEXT: vzeroupper
410 ; AVX512F-LABEL: vec64_v8i8_to_v2i32_factor4:
412 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
413 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
414 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
415 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
416 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
417 ; AVX512F-NEXT: vzeroupper
420 ; AVX512BW-LABEL: vec64_v8i8_to_v2i32_factor4:
422 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
423 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
424 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
425 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
426 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
427 ; AVX512BW-NEXT: vzeroupper
428 ; AVX512BW-NEXT: retq
429 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
430 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
431 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
432 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
433 %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
434 %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
435 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
436 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
437 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
441 define void @vec64_v8i8_to_v1i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
442 ; SSE2-LABEL: vec64_v8i8_to_v1i64_factor8:
444 ; SSE2-NEXT: movdqa (%rdi), %xmm0
445 ; SSE2-NEXT: paddb (%rsi), %xmm0
446 ; SSE2-NEXT: pxor %xmm1, %xmm1
447 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
448 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
449 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
450 ; SSE2-NEXT: paddb (%rdx), %xmm0
451 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
454 ; SSE42-LABEL: vec64_v8i8_to_v1i64_factor8:
456 ; SSE42-NEXT: movdqa (%rdi), %xmm0
457 ; SSE42-NEXT: paddb (%rsi), %xmm0
458 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
459 ; SSE42-NEXT: paddb (%rdx), %xmm0
460 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
463 ; AVX-LABEL: vec64_v8i8_to_v1i64_factor8:
465 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
466 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
467 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
468 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
469 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
472 ; AVX2-LABEL: vec64_v8i8_to_v1i64_factor8:
474 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
475 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
476 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
477 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
478 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
479 ; AVX2-NEXT: vzeroupper
482 ; AVX512F-LABEL: vec64_v8i8_to_v1i64_factor8:
484 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
485 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
486 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
487 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
488 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
489 ; AVX512F-NEXT: vzeroupper
492 ; AVX512BW-LABEL: vec64_v8i8_to_v1i64_factor8:
494 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
495 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
496 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
497 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
498 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
499 ; AVX512BW-NEXT: vzeroupper
500 ; AVX512BW-NEXT: retq
501 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
502 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
503 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
504 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
505 %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
506 %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
507 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
508 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
509 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
513 define void @vec64_v4i16_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
514 ; SSE2-LABEL: vec64_v4i16_to_v2i32_factor2:
516 ; SSE2-NEXT: movdqa (%rdi), %xmm0
517 ; SSE2-NEXT: paddb (%rsi), %xmm0
518 ; SSE2-NEXT: pxor %xmm1, %xmm1
519 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
520 ; SSE2-NEXT: paddb (%rdx), %xmm0
521 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
524 ; SSE42-LABEL: vec64_v4i16_to_v2i32_factor2:
526 ; SSE42-NEXT: movdqa (%rdi), %xmm0
527 ; SSE42-NEXT: paddb (%rsi), %xmm0
528 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
529 ; SSE42-NEXT: paddb (%rdx), %xmm0
530 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
533 ; AVX-LABEL: vec64_v4i16_to_v2i32_factor2:
535 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
536 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
537 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
538 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
539 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
542 ; AVX2-LABEL: vec64_v4i16_to_v2i32_factor2:
544 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
545 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
546 ; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
547 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
548 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
549 ; AVX2-NEXT: vzeroupper
552 ; AVX512F-LABEL: vec64_v4i16_to_v2i32_factor2:
554 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
555 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
556 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
557 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
558 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
559 ; AVX512F-NEXT: vzeroupper
562 ; AVX512BW-LABEL: vec64_v4i16_to_v2i32_factor2:
564 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
565 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
566 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
567 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
568 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
569 ; AVX512BW-NEXT: vzeroupper
570 ; AVX512BW-NEXT: retq
571 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
572 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
573 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
574 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
575 %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <4 x i16>
576 %zextd.vec = shufflevector <4 x i16> %in.vec.cast, <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
577 %out.bytevec = bitcast <4 x i16> %zextd.vec to <8 x i8>
578 %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
579 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
580 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
581 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
585 define void @vec64_v4i16_to_v1i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
586 ; SSE2-LABEL: vec64_v4i16_to_v1i64_factor4:
588 ; SSE2-NEXT: movdqa (%rdi), %xmm0
589 ; SSE2-NEXT: paddb (%rsi), %xmm0
590 ; SSE2-NEXT: pxor %xmm1, %xmm1
591 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
592 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
593 ; SSE2-NEXT: paddb (%rdx), %xmm0
594 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
597 ; SSE42-LABEL: vec64_v4i16_to_v1i64_factor4:
599 ; SSE42-NEXT: movdqa (%rdi), %xmm0
600 ; SSE42-NEXT: paddb (%rsi), %xmm0
601 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
602 ; SSE42-NEXT: paddb (%rdx), %xmm0
603 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
606 ; AVX-LABEL: vec64_v4i16_to_v1i64_factor4:
608 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
609 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
610 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
611 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
612 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
615 ; AVX2-LABEL: vec64_v4i16_to_v1i64_factor4:
617 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
618 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
619 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
620 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
621 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
622 ; AVX2-NEXT: vzeroupper
625 ; AVX512F-LABEL: vec64_v4i16_to_v1i64_factor4:
627 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
628 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
629 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
630 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
631 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
632 ; AVX512F-NEXT: vzeroupper
635 ; AVX512BW-LABEL: vec64_v4i16_to_v1i64_factor4:
637 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
638 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
639 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
640 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
641 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
642 ; AVX512BW-NEXT: vzeroupper
643 ; AVX512BW-NEXT: retq
644 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
645 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
646 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
647 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
648 %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <4 x i16>
649 %zextd.vec = shufflevector <4 x i16> %in.vec.cast, <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
650 %out.bytevec = bitcast <4 x i16> %zextd.vec to <8 x i8>
651 %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
652 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
653 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
654 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
658 define void @vec64_v2i32_to_v1i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
659 ; SSE2-LABEL: vec64_v2i32_to_v1i64_factor2:
661 ; SSE2-NEXT: movdqa (%rdi), %xmm0
662 ; SSE2-NEXT: paddb (%rsi), %xmm0
663 ; SSE2-NEXT: pxor %xmm1, %xmm1
664 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
665 ; SSE2-NEXT: paddb (%rdx), %xmm0
666 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
669 ; SSE42-LABEL: vec64_v2i32_to_v1i64_factor2:
671 ; SSE42-NEXT: movdqa (%rdi), %xmm0
672 ; SSE42-NEXT: paddb (%rsi), %xmm0
673 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
674 ; SSE42-NEXT: paddb (%rdx), %xmm0
675 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
678 ; AVX-LABEL: vec64_v2i32_to_v1i64_factor2:
680 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
681 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
682 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
683 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
684 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
687 ; AVX2-LABEL: vec64_v2i32_to_v1i64_factor2:
689 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
690 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
691 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
692 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
693 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
694 ; AVX2-NEXT: vzeroupper
697 ; AVX512F-LABEL: vec64_v2i32_to_v1i64_factor2:
699 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
700 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
701 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
702 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
703 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
704 ; AVX512F-NEXT: vzeroupper
707 ; AVX512BW-LABEL: vec64_v2i32_to_v1i64_factor2:
709 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
710 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
711 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
712 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
713 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
714 ; AVX512BW-NEXT: vzeroupper
715 ; AVX512BW-NEXT: retq
716 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
717 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
718 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
719 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
720 %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <2 x i32>
721 %zextd.vec = shufflevector <2 x i32> %in.vec.cast, <2 x i32> zeroinitializer, <2 x i32> <i32 0, i32 3>
722 %out.bytevec = bitcast <2 x i32> %zextd.vec to <8 x i8>
723 %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
724 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
725 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
726 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
730 define void @vec128_v16i8_to_v8i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
731 ; SSE2-LABEL: vec128_v16i8_to_v8i16_factor2:
733 ; SSE2-NEXT: movdqa (%rdi), %xmm0
734 ; SSE2-NEXT: paddb (%rsi), %xmm0
735 ; SSE2-NEXT: pxor %xmm1, %xmm1
736 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
737 ; SSE2-NEXT: paddb (%rdx), %xmm0
738 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
741 ; SSE42-LABEL: vec128_v16i8_to_v8i16_factor2:
743 ; SSE42-NEXT: movdqa (%rdi), %xmm0
744 ; SSE42-NEXT: paddb (%rsi), %xmm0
745 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
746 ; SSE42-NEXT: paddb (%rdx), %xmm0
747 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
750 ; AVX-LABEL: vec128_v16i8_to_v8i16_factor2:
752 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
753 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
754 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
755 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
756 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
759 ; AVX2-LABEL: vec128_v16i8_to_v8i16_factor2:
761 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
762 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
763 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
764 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
765 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
766 ; AVX2-NEXT: vzeroupper
769 ; AVX512F-LABEL: vec128_v16i8_to_v8i16_factor2:
771 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
772 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
773 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
774 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
775 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
776 ; AVX512F-NEXT: vzeroupper
779 ; AVX512BW-LABEL: vec128_v16i8_to_v8i16_factor2:
781 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
782 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
783 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
784 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
785 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
786 ; AVX512BW-NEXT: vzeroupper
787 ; AVX512BW-NEXT: retq
788 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
789 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
790 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
791 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
792 %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
793 %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
794 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
795 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
796 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
800 define void @vec128_v16i8_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
801 ; SSE2-LABEL: vec128_v16i8_to_v4i32_factor4:
803 ; SSE2-NEXT: movdqa (%rdi), %xmm0
804 ; SSE2-NEXT: paddb (%rsi), %xmm0
805 ; SSE2-NEXT: pxor %xmm1, %xmm1
806 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
807 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
808 ; SSE2-NEXT: paddb (%rdx), %xmm0
809 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
812 ; SSE42-LABEL: vec128_v16i8_to_v4i32_factor4:
814 ; SSE42-NEXT: movdqa (%rdi), %xmm0
815 ; SSE42-NEXT: paddb (%rsi), %xmm0
816 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
817 ; SSE42-NEXT: paddb (%rdx), %xmm0
818 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
821 ; AVX-LABEL: vec128_v16i8_to_v4i32_factor4:
823 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
824 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
825 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
826 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
827 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
830 ; AVX2-LABEL: vec128_v16i8_to_v4i32_factor4:
832 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
833 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
834 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
835 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
836 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
837 ; AVX2-NEXT: vzeroupper
840 ; AVX512F-LABEL: vec128_v16i8_to_v4i32_factor4:
842 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
843 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
844 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
845 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
846 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
847 ; AVX512F-NEXT: vzeroupper
850 ; AVX512BW-LABEL: vec128_v16i8_to_v4i32_factor4:
852 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
853 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
854 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
855 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
856 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
857 ; AVX512BW-NEXT: vzeroupper
858 ; AVX512BW-NEXT: retq
859 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
860 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
861 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
862 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
863 %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
864 %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
865 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
866 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
867 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
871 define void @vec128_v16i8_to_v2i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
872 ; SSE2-LABEL: vec128_v16i8_to_v2i64_factor8:
874 ; SSE2-NEXT: movdqa (%rdi), %xmm0
875 ; SSE2-NEXT: paddb (%rsi), %xmm0
876 ; SSE2-NEXT: pxor %xmm1, %xmm1
877 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
878 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
879 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
880 ; SSE2-NEXT: paddb (%rdx), %xmm0
881 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
884 ; SSE42-LABEL: vec128_v16i8_to_v2i64_factor8:
886 ; SSE42-NEXT: movdqa (%rdi), %xmm0
887 ; SSE42-NEXT: paddb (%rsi), %xmm0
888 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
889 ; SSE42-NEXT: paddb (%rdx), %xmm0
890 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
893 ; AVX-LABEL: vec128_v16i8_to_v2i64_factor8:
895 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
896 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
897 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
898 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
899 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
902 ; AVX2-LABEL: vec128_v16i8_to_v2i64_factor8:
904 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
905 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
906 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
907 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
908 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
909 ; AVX2-NEXT: vzeroupper
912 ; AVX512F-LABEL: vec128_v16i8_to_v2i64_factor8:
914 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
915 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
916 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
917 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
918 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
919 ; AVX512F-NEXT: vzeroupper
922 ; AVX512BW-LABEL: vec128_v16i8_to_v2i64_factor8:
924 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
925 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
926 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
927 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
928 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
929 ; AVX512BW-NEXT: vzeroupper
930 ; AVX512BW-NEXT: retq
931 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
932 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
933 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
934 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
935 %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
936 %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
937 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
938 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
939 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
943 define void @vec128_v16i8_to_v1i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
944 ; SSE-LABEL: vec128_v16i8_to_v1i128_factor16:
946 ; SSE-NEXT: movdqa (%rdi), %xmm0
947 ; SSE-NEXT: paddb (%rsi), %xmm0
948 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
949 ; SSE-NEXT: paddb (%rdx), %xmm0
950 ; SSE-NEXT: movdqa %xmm0, (%rcx)
953 ; AVX-LABEL: vec128_v16i8_to_v1i128_factor16:
955 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
956 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
957 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
958 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
959 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
962 ; AVX2-LABEL: vec128_v16i8_to_v1i128_factor16:
964 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
965 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
966 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
967 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
968 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
969 ; AVX2-NEXT: vzeroupper
972 ; AVX512F-LABEL: vec128_v16i8_to_v1i128_factor16:
974 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
975 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
976 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
977 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
978 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
979 ; AVX512F-NEXT: vzeroupper
982 ; AVX512BW-LABEL: vec128_v16i8_to_v1i128_factor16:
984 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
985 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
986 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
987 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
988 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
989 ; AVX512BW-NEXT: vzeroupper
990 ; AVX512BW-NEXT: retq
991 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
992 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
993 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
994 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
995 %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
996 %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
997 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
998 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
999 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
1003 define void @vec128_v8i16_to_v4i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1004 ; SSE2-LABEL: vec128_v8i16_to_v4i32_factor2:
1006 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1007 ; SSE2-NEXT: paddb (%rsi), %xmm0
1008 ; SSE2-NEXT: pxor %xmm1, %xmm1
1009 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1010 ; SSE2-NEXT: paddb (%rdx), %xmm0
1011 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
1014 ; SSE42-LABEL: vec128_v8i16_to_v4i32_factor2:
1016 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1017 ; SSE42-NEXT: paddb (%rsi), %xmm0
1018 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1019 ; SSE42-NEXT: paddb (%rdx), %xmm0
1020 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
1023 ; AVX-LABEL: vec128_v8i16_to_v4i32_factor2:
1025 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1026 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1027 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1028 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
1029 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
1032 ; AVX2-LABEL: vec128_v8i16_to_v4i32_factor2:
1034 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1035 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1036 ; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1037 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1038 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1039 ; AVX2-NEXT: vzeroupper
1042 ; AVX512F-LABEL: vec128_v8i16_to_v4i32_factor2:
1044 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1045 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1046 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1047 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1048 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1049 ; AVX512F-NEXT: vzeroupper
1050 ; AVX512F-NEXT: retq
1052 ; AVX512BW-LABEL: vec128_v8i16_to_v4i32_factor2:
1053 ; AVX512BW: # %bb.0:
1054 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1055 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1056 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1057 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1058 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1059 ; AVX512BW-NEXT: vzeroupper
1060 ; AVX512BW-NEXT: retq
1061 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1062 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1063 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1064 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1065 %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16>
1066 %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
1067 %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8>
1068 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1069 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1070 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1071 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
1075 define void @vec128_v8i16_to_v2i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1076 ; SSE2-LABEL: vec128_v8i16_to_v2i64_factor4:
1078 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1079 ; SSE2-NEXT: paddb (%rsi), %xmm0
1080 ; SSE2-NEXT: pxor %xmm1, %xmm1
1081 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1082 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1083 ; SSE2-NEXT: paddb (%rdx), %xmm0
1084 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
1087 ; SSE42-LABEL: vec128_v8i16_to_v2i64_factor4:
1089 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1090 ; SSE42-NEXT: paddb (%rsi), %xmm0
1091 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1092 ; SSE42-NEXT: paddb (%rdx), %xmm0
1093 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
1096 ; AVX-LABEL: vec128_v8i16_to_v2i64_factor4:
1098 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1099 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1100 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1101 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
1102 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
1105 ; AVX2-LABEL: vec128_v8i16_to_v2i64_factor4:
1107 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1108 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1109 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1110 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1111 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1112 ; AVX2-NEXT: vzeroupper
1115 ; AVX512F-LABEL: vec128_v8i16_to_v2i64_factor4:
1117 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1118 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1119 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1120 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1121 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1122 ; AVX512F-NEXT: vzeroupper
1123 ; AVX512F-NEXT: retq
1125 ; AVX512BW-LABEL: vec128_v8i16_to_v2i64_factor4:
1126 ; AVX512BW: # %bb.0:
1127 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1128 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1129 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1130 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1131 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1132 ; AVX512BW-NEXT: vzeroupper
1133 ; AVX512BW-NEXT: retq
1134 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1135 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1136 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1137 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1138 %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16>
1139 %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
1140 %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8>
1141 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1142 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1143 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1144 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
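; Only word 0 of the sum survives (factor 8): SSE2 masks it with pand against a constant-pool value, while SSE4.2 and the AVX targets blend it into a zeroed register with pblendw.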
1148 define void @vec128_v8i16_to_v1i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1149 ; SSE2-LABEL: vec128_v8i16_to_v1i128_factor8:
1151 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1152 ; SSE2-NEXT: paddb (%rsi), %xmm0
1153 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1154 ; SSE2-NEXT: paddb (%rdx), %xmm0
1155 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
1158 ; SSE42-LABEL: vec128_v8i16_to_v1i128_factor8:
1160 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1161 ; SSE42-NEXT: paddb (%rsi), %xmm0
1162 ; SSE42-NEXT: pxor %xmm1, %xmm1
1163 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1164 ; SSE42-NEXT: paddb (%rdx), %xmm1
1165 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1168 ; AVX-LABEL: vec128_v8i16_to_v1i128_factor8:
1170 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1171 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1172 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
1173 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1174 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
1175 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
1178 ; AVX2-LABEL: vec128_v8i16_to_v1i128_factor8:
1180 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1181 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1182 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1183 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1184 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1185 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1186 ; AVX2-NEXT: vzeroupper
1189 ; AVX512F-LABEL: vec128_v8i16_to_v1i128_factor8:
1191 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1192 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1193 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
1194 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1195 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1196 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1197 ; AVX512F-NEXT: vzeroupper
1198 ; AVX512F-NEXT: retq
1200 ; AVX512BW-LABEL: vec128_v8i16_to_v1i128_factor8:
1201 ; AVX512BW: # %bb.0:
1202 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1203 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1204 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
1205 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1206 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1207 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1208 ; AVX512BW-NEXT: vzeroupper
1209 ; AVX512BW-NEXT: retq
1210 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1211 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1212 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1213 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1214 %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16>
1215 %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1216 %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8>
1217 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1218 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1219 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1220 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
1224 define void @vec128_v4i32_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1225 ; SSE2-LABEL: vec128_v4i32_to_v2i64_factor2:
1227 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1228 ; SSE2-NEXT: paddb (%rsi), %xmm0
1229 ; SSE2-NEXT: pxor %xmm1, %xmm1
1230 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1231 ; SSE2-NEXT: paddb (%rdx), %xmm0
1232 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
1235 ; SSE42-LABEL: vec128_v4i32_to_v2i64_factor2:
1237 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1238 ; SSE42-NEXT: paddb (%rsi), %xmm0
1239 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1240 ; SSE42-NEXT: paddb (%rdx), %xmm0
1241 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
1244 ; AVX-LABEL: vec128_v4i32_to_v2i64_factor2:
1246 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1247 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1248 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1249 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
1250 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
1253 ; AVX2-LABEL: vec128_v4i32_to_v2i64_factor2:
1255 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1256 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1257 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1258 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1259 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1260 ; AVX2-NEXT: vzeroupper
1263 ; AVX512F-LABEL: vec128_v4i32_to_v2i64_factor2:
1265 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1266 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1267 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1268 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1269 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1270 ; AVX512F-NEXT: vzeroupper
1271 ; AVX512F-NEXT: retq
1273 ; AVX512BW-LABEL: vec128_v4i32_to_v2i64_factor2:
1274 ; AVX512BW: # %bb.0:
1275 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1276 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1277 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1278 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1279 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1280 ; AVX512BW-NEXT: vzeroupper
1281 ; AVX512BW-NEXT: retq
1282 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1283 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1284 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1285 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1286 %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <4 x i32>
1287 %zextd.vec = shufflevector <4 x i32> %in.vec.cast, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1288 %out.bytevec = bitcast <4 x i32> %zextd.vec to <16 x i8>
1289 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1290 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1291 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1292 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
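; Keeping just the low dword (factor 4): SSE2 uses movss into a zeroed register, SSE4.2/AVX use pblendw, and the 256/512-bit targets then add the bias at ymm/zmm width.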
1296 define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1297 ; SSE2-LABEL: vec128_v4i32_to_v1i128_factor4:
1299 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1300 ; SSE2-NEXT: paddb (%rsi), %xmm0
1301 ; SSE2-NEXT: xorps %xmm1, %xmm1
1302 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1303 ; SSE2-NEXT: paddb (%rdx), %xmm1
1304 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1307 ; SSE42-LABEL: vec128_v4i32_to_v1i128_factor4:
1309 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1310 ; SSE42-NEXT: paddb (%rsi), %xmm0
1311 ; SSE42-NEXT: pxor %xmm1, %xmm1
1312 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1313 ; SSE42-NEXT: paddb (%rdx), %xmm1
1314 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1317 ; AVX-LABEL: vec128_v4i32_to_v1i128_factor4:
1319 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1320 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1321 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
1322 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1323 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
1324 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
1327 ; AVX2-LABEL: vec128_v4i32_to_v1i128_factor4:
1329 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1330 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1331 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1332 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1333 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1334 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1335 ; AVX2-NEXT: vzeroupper
1338 ; AVX512F-LABEL: vec128_v4i32_to_v1i128_factor4:
1340 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1341 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1342 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
1343 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1344 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1345 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1346 ; AVX512F-NEXT: vzeroupper
1347 ; AVX512F-NEXT: retq
1349 ; AVX512BW-LABEL: vec128_v4i32_to_v1i128_factor4:
1350 ; AVX512BW: # %bb.0:
1351 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1352 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1353 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
1354 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1355 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1356 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1357 ; AVX512BW-NEXT: vzeroupper
1358 ; AVX512BW-NEXT: retq
1359 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1360 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1361 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1362 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1363 %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <4 x i32>
1364 %zextd.vec = shufflevector <4 x i32> %in.vec.cast, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1365 %out.bytevec = bitcast <4 x i32> %zextd.vec to <16 x i8>
1366 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1367 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1368 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1369 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
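; For i64 -> i128 a single movq moves the low qword and zeroes the upper half, so every subtarget shares the same core sequence; only the width of the trailing bias add and store differs.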
1373 define void @vec128_v2i64_to_v1i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1374 ; SSE-LABEL: vec128_v2i64_to_v1i128_factor2:
1376 ; SSE-NEXT: movdqa (%rdi), %xmm0
1377 ; SSE-NEXT: paddb (%rsi), %xmm0
1378 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
1379 ; SSE-NEXT: paddb (%rdx), %xmm0
1380 ; SSE-NEXT: movdqa %xmm0, (%rcx)
1383 ; AVX-LABEL: vec128_v2i64_to_v1i128_factor2:
1385 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1386 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1387 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1388 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
1389 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
1392 ; AVX2-LABEL: vec128_v2i64_to_v1i128_factor2:
1394 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1395 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1396 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1397 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1398 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1399 ; AVX2-NEXT: vzeroupper
1402 ; AVX512F-LABEL: vec128_v2i64_to_v1i128_factor2:
1404 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1405 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1406 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1407 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1408 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1409 ; AVX512F-NEXT: vzeroupper
1410 ; AVX512F-NEXT: retq
1412 ; AVX512BW-LABEL: vec128_v2i64_to_v1i128_factor2:
1413 ; AVX512BW: # %bb.0:
1414 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1415 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1416 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1417 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1418 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1419 ; AVX512BW-NEXT: vzeroupper
1420 ; AVX512BW-NEXT: retq
1421 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1422 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1423 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1424 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1425 %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <2 x i64>
1426 %zextd.vec = shufflevector <2 x i64> %in.vec.cast, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
1427 %out.bytevec = bitcast <2 x i64> %zextd.vec to <16 x i8>
1428 %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1429 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1430 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1431 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
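; First 256-bit result: the low 16 bytes widen to 16 x i16, so SSE splits the work into low/high halves (punpcklbw/punpckhbw, or pmovzxbw for the low half on SSE4.2) with two 16-byte stores, while AVX2 and later cover it with one vpmovzxbw.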
1435 define void @vec256_v32i8_to_v16i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1436 ; SSE2-LABEL: vec256_v32i8_to_v16i16_factor2:
1438 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1439 ; SSE2-NEXT: paddb (%rsi), %xmm0
1440 ; SSE2-NEXT: pxor %xmm1, %xmm1
1441 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1442 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1443 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1444 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1445 ; SSE2-NEXT: paddb (%rdx), %xmm2
1446 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
1447 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1450 ; SSE42-LABEL: vec256_v32i8_to_v16i16_factor2:
1452 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1453 ; SSE42-NEXT: paddb (%rsi), %xmm0
1454 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1455 ; SSE42-NEXT: pxor %xmm2, %xmm2
1456 ; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1457 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1458 ; SSE42-NEXT: paddb (%rdx), %xmm1
1459 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1460 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1463 ; AVX-LABEL: vec256_v32i8_to_v16i16_factor2:
1465 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1466 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1467 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1468 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
1469 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1470 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1471 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1472 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1473 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1476 ; AVX2-LABEL: vec256_v32i8_to_v16i16_factor2:
1478 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1479 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1480 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1481 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1482 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1483 ; AVX2-NEXT: vzeroupper
1486 ; AVX512F-LABEL: vec256_v32i8_to_v16i16_factor2:
1488 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1489 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1490 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1491 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1492 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1493 ; AVX512F-NEXT: vzeroupper
1494 ; AVX512F-NEXT: retq
1496 ; AVX512BW-LABEL: vec256_v32i8_to_v16i16_factor2:
1497 ; AVX512BW: # %bb.0:
1498 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1499 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1500 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1501 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1502 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1503 ; AVX512BW-NEXT: vzeroupper
1504 ; AVX512BW-NEXT: retq
1505 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1506 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1507 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1508 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1509 %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 1, i32 35, i32 2, i32 37, i32 3, i32 39, i32 4, i32 41, i32 5, i32 43, i32 6, i32 45, i32 7, i32 47, i32 8, i32 49, i32 9, i32 51, i32 10, i32 53, i32 11, i32 55, i32 12, i32 57, i32 13, i32 59, i32 14, i32 61, i32 15, i32 63>
1510 %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1511 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1512 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1513 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
1517 define void @vec256_v32i8_to_v8i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1518 ; SSE2-LABEL: vec256_v32i8_to_v8i32_factor4:
1520 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1521 ; SSE2-NEXT: paddb (%rsi), %xmm0
1522 ; SSE2-NEXT: pxor %xmm1, %xmm1
1523 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1524 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1525 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1526 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1527 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1528 ; SSE2-NEXT: paddb (%rdx), %xmm2
1529 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
1530 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1533 ; SSE42-LABEL: vec256_v32i8_to_v8i32_factor4:
1535 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1536 ; SSE42-NEXT: paddb (%rsi), %xmm0
1537 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1538 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1539 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1540 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1541 ; SSE42-NEXT: paddb (%rdx), %xmm1
1542 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1543 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1546 ; AVX-LABEL: vec256_v32i8_to_v8i32_factor4:
1548 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1549 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1550 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1551 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1552 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1553 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1554 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1555 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1556 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1559 ; AVX2-LABEL: vec256_v32i8_to_v8i32_factor4:
1561 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1562 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1563 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1564 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1565 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1566 ; AVX2-NEXT: vzeroupper
1569 ; AVX512F-LABEL: vec256_v32i8_to_v8i32_factor4:
1571 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1572 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1573 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1574 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1575 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1576 ; AVX512F-NEXT: vzeroupper
1577 ; AVX512F-NEXT: retq
1579 ; AVX512BW-LABEL: vec256_v32i8_to_v8i32_factor4:
1580 ; AVX512BW: # %bb.0:
1581 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1582 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1583 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1584 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1585 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1586 ; AVX512BW-NEXT: vzeroupper
1587 ; AVX512BW-NEXT: retq
1588 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1589 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1590 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1591 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1592 %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 1, i32 37, i32 38, i32 39, i32 2, i32 41, i32 42, i32 43, i32 3, i32 45, i32 46, i32 47, i32 4, i32 49, i32 50, i32 51, i32 5, i32 53, i32 54, i32 55, i32 6, i32 57, i32 58, i32 59, i32 7, i32 61, i32 62, i32 63>
1593 %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1594 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1595 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1596 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
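; Byte -> qword (factor 8): SSE4.2 runs pmovzxbq twice, once on the original vector and once on a psrld-shifted copy for the high 16 bytes, whereas AVX2 and later produce all four i64 lanes with a single ymm vpmovzxbq.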
1600 define void @vec256_v32i8_to_v4i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1601 ; SSE2-LABEL: vec256_v32i8_to_v4i64_factor8:
1603 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1604 ; SSE2-NEXT: paddb (%rsi), %xmm0
1605 ; SSE2-NEXT: pxor %xmm1, %xmm1
1606 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1607 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1608 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1609 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1610 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1611 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1612 ; SSE2-NEXT: paddb (%rdx), %xmm2
1613 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
1614 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1617 ; SSE42-LABEL: vec256_v32i8_to_v4i64_factor8:
1619 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1620 ; SSE42-NEXT: paddb (%rsi), %xmm0
1621 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1622 ; SSE42-NEXT: psrld $16, %xmm0
1623 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1624 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1625 ; SSE42-NEXT: paddb (%rdx), %xmm1
1626 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1627 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1630 ; AVX-LABEL: vec256_v32i8_to_v4i64_factor8:
1632 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1633 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1634 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1635 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
1636 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1637 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1638 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1639 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1640 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1643 ; AVX2-LABEL: vec256_v32i8_to_v4i64_factor8:
1645 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1646 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1647 ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
1648 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1649 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1650 ; AVX2-NEXT: vzeroupper
1653 ; AVX512F-LABEL: vec256_v32i8_to_v4i64_factor8:
1655 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1656 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1657 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
1658 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1659 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1660 ; AVX512F-NEXT: vzeroupper
1661 ; AVX512F-NEXT: retq
1663 ; AVX512BW-LABEL: vec256_v32i8_to_v4i64_factor8:
1664 ; AVX512BW: # %bb.0:
1665 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1666 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1667 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
1668 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1669 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1670 ; AVX512BW-NEXT: vzeroupper
1671 ; AVX512BW-NEXT: retq
1672 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1673 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1674 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1675 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1676 %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 1, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 2, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 3, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1677 %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1678 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1679 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1680 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
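; Bytes 0 and 1 land at bit offsets 0 and 128: SSE isolates byte 0 with an AND mask and shifts byte 1 into a second register with pslldq/psrldq, while AVX2 and later use vpmovzxbq plus vpermq and a mask.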
1684 define void @vec256_v32i8_to_v2i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1685 ; SSE2-LABEL: vec256_v32i8_to_v2i128_factor16:
1687 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1688 ; SSE2-NEXT: paddb (%rsi), %xmm0
1689 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
1690 ; SSE2-NEXT: pand %xmm0, %xmm1
1691 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
1692 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1693 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1694 ; SSE2-NEXT: paddb (%rdx), %xmm1
1695 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
1696 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1699 ; SSE42-LABEL: vec256_v32i8_to_v2i128_factor16:
1701 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1702 ; SSE42-NEXT: paddb (%rsi), %xmm0
1703 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0]
1704 ; SSE42-NEXT: pand %xmm0, %xmm1
1705 ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
1706 ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1707 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1708 ; SSE42-NEXT: paddb (%rdx), %xmm1
1709 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1710 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1713 ; AVX-LABEL: vec256_v32i8_to_v2i128_factor16:
1715 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1716 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1717 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1718 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
1719 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1720 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1721 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1722 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1723 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1726 ; AVX2-LABEL: vec256_v32i8_to_v2i128_factor16:
1728 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1729 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1730 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1731 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
1732 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1733 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1734 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1735 ; AVX2-NEXT: vzeroupper
1738 ; AVX512F-LABEL: vec256_v32i8_to_v2i128_factor16:
1740 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1741 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1742 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1743 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
1744 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1745 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1746 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1747 ; AVX512F-NEXT: vzeroupper
1748 ; AVX512F-NEXT: retq
1750 ; AVX512BW-LABEL: vec256_v32i8_to_v2i128_factor16:
1751 ; AVX512BW: # %bb.0:
1752 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1753 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1754 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1755 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
1756 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1757 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1758 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1759 ; AVX512BW-NEXT: vzeroupper
1760 ; AVX512BW-NEXT: retq
1761 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1762 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1763 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1764 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1765 %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 1, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1766 %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1767 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1768 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1769 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
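; Only byte 0 survives, so the upper 16 output bytes are just the out-vector bias: the SSE/AVX1 paths copy 16(%rdx) straight to 16(%rcx), while the wider targets mask and add at full register width.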
1773 define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1774 ; SSE-LABEL: vec256_v32i8_to_v1i256_factor32:
1776 ; SSE-NEXT: movdqa (%rdi), %xmm0
1777 ; SSE-NEXT: paddb (%rsi), %xmm0
1778 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1779 ; SSE-NEXT: movaps 16(%rdx), %xmm1
1780 ; SSE-NEXT: paddb (%rdx), %xmm0
1781 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
1782 ; SSE-NEXT: movdqa %xmm0, (%rcx)
1785 ; AVX-LABEL: vec256_v32i8_to_v1i256_factor32:
1787 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1788 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1789 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1790 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
1791 ; AVX-NEXT: vmovaps 16(%rdx), %xmm1
1792 ; AVX-NEXT: vmovaps %xmm1, 16(%rcx)
1793 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
1796 ; AVX2-LABEL: vec256_v32i8_to_v1i256_factor32:
1798 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1799 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1800 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
1801 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1802 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1803 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1804 ; AVX2-NEXT: vzeroupper
1807 ; AVX512F-LABEL: vec256_v32i8_to_v1i256_factor32:
1809 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
1810 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1811 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
1812 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
1813 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1814 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1815 ; AVX512F-NEXT: vzeroupper
1816 ; AVX512F-NEXT: retq
1818 ; AVX512BW-LABEL: vec256_v32i8_to_v1i256_factor32:
1819 ; AVX512BW: # %bb.0:
1820 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
1821 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
1822 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
1823 ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
1824 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1825 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1826 ; AVX512BW-NEXT: vzeroupper
1827 ; AVX512BW-NEXT: retq
1828 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1829 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1830 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1831 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1832 %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1833 %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1834 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1835 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1836 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
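; Word -> dword across 32 result bytes: SSE pairs punpcklwd/punpckhwd against zero (pmovzxwd for the low half on SSE4.2), while AVX2 and later do the whole widening with one vpmovzxwd.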
1840 define void @vec256_v16i16_to_v8i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1841 ; SSE2-LABEL: vec256_v16i16_to_v8i32_factor2:
1843 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1844 ; SSE2-NEXT: paddb (%rsi), %xmm0
1845 ; SSE2-NEXT: pxor %xmm1, %xmm1
1846 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1847 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1848 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1849 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1850 ; SSE2-NEXT: paddb (%rdx), %xmm2
1851 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
1852 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1855 ; SSE42-LABEL: vec256_v16i16_to_v8i32_factor2:
1857 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1858 ; SSE42-NEXT: paddb (%rsi), %xmm0
1859 ; SSE42-NEXT: pxor %xmm1, %xmm1
1860 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1861 ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1862 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1863 ; SSE42-NEXT: paddb (%rdx), %xmm2
1864 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
1865 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1868 ; AVX-LABEL: vec256_v16i16_to_v8i32_factor2:
1870 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1871 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1872 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1873 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
1874 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1875 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1876 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1877 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1878 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1881 ; AVX2-LABEL: vec256_v16i16_to_v8i32_factor2:
1883 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1884 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1885 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1886 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1887 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1888 ; AVX2-NEXT: vzeroupper
1891 ; AVX512F-LABEL: vec256_v16i16_to_v8i32_factor2:
1893 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1894 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1895 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1896 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1897 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1898 ; AVX512F-NEXT: vzeroupper
1899 ; AVX512F-NEXT: retq
1901 ; AVX512BW-LABEL: vec256_v16i16_to_v8i32_factor2:
1902 ; AVX512BW: # %bb.0:
1903 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1904 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1905 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1906 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1907 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1908 ; AVX512BW-NEXT: vzeroupper
1909 ; AVX512BW-NEXT: retq
1910 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1911 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1912 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1913 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1914 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16>
1915 %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
1916 %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8>
1917 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1918 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
1919 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
1920 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
1924 define void @vec256_v16i16_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
1925 ; SSE2-LABEL: vec256_v16i16_to_v4i64_factor4:
1927 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1928 ; SSE2-NEXT: paddb (%rsi), %xmm0
1929 ; SSE2-NEXT: pxor %xmm1, %xmm1
1930 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1931 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1932 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1933 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1934 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
1935 ; SSE2-NEXT: paddb (%rdx), %xmm2
1936 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
1937 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
1940 ; SSE42-LABEL: vec256_v16i16_to_v4i64_factor4:
1942 ; SSE42-NEXT: movdqa (%rdi), %xmm0
1943 ; SSE42-NEXT: paddb (%rsi), %xmm0
1944 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1945 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1946 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1947 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
1948 ; SSE42-NEXT: paddb (%rdx), %xmm1
1949 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
1950 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
1953 ; AVX-LABEL: vec256_v16i16_to_v4i64_factor4:
1955 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1956 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1957 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1958 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1959 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1960 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
1961 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
1962 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1963 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
1966 ; AVX2-LABEL: vec256_v16i16_to_v4i64_factor4:
1968 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1969 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1970 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1971 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1972 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1973 ; AVX2-NEXT: vzeroupper
1976 ; AVX512F-LABEL: vec256_v16i16_to_v4i64_factor4:
1978 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
1979 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1980 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1981 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
1982 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
1983 ; AVX512F-NEXT: vzeroupper
1984 ; AVX512F-NEXT: retq
1986 ; AVX512BW-LABEL: vec256_v16i16_to_v4i64_factor4:
1987 ; AVX512BW: # %bb.0:
1988 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1989 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
1990 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1991 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
1992 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
1993 ; AVX512BW-NEXT: vzeroupper
1994 ; AVX512BW-NEXT: retq
1995 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
1996 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
1997 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
1998 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1999 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16>
2000 %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
2001 %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8>
2002 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2003 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2004 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2005 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
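; Words 0 and 1 land at bit offsets 0 and 128: SSE/AVX1 keep word 0 with a mask or pblendw and move word 1 via pshufd+psrldq, AVX2/AVX512F blend a vpermq'd ymm against zero, and AVX512BW selects both words directly with vpermt2w through a zero vector.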
2009 define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2010 ; SSE2-LABEL: vec256_v16i16_to_v2i128_factor8:
2012 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2013 ; SSE2-NEXT: paddb (%rsi), %xmm0
2014 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
2015 ; SSE2-NEXT: pand %xmm0, %xmm1
2016 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2017 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2018 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
2019 ; SSE2-NEXT: paddb (%rdx), %xmm1
2020 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
2021 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
2024 ; SSE42-LABEL: vec256_v16i16_to_v2i128_factor8:
2026 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2027 ; SSE42-NEXT: paddb (%rsi), %xmm0
2028 ; SSE42-NEXT: pxor %xmm1, %xmm1
2029 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
2030 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2031 ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2032 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
2033 ; SSE42-NEXT: paddb (%rdx), %xmm1
2034 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
2035 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2038 ; AVX-LABEL: vec256_v16i16_to_v2i128_factor8:
2040 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2041 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2042 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
2043 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
2044 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2045 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2046 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
2047 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2048 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2049 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
2052 ; AVX2-LABEL: vec256_v16i16_to_v2i128_factor8:
2054 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
2055 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2056 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2057 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
2058 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2059 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
2060 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2061 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2062 ; AVX2-NEXT: vzeroupper
2065 ; AVX512F-LABEL: vec256_v16i16_to_v2i128_factor8:
2067 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2068 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2069 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2070 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
2071 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
2072 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
2073 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2074 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
2075 ; AVX512F-NEXT: vzeroupper
2076 ; AVX512F-NEXT: retq
2078 ; AVX512BW-LABEL: vec256_v16i16_to_v2i128_factor8:
2079 ; AVX512BW: # %bb.0:
2080 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
2081 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2082 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15]
2083 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
2084 ; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2
2085 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
2086 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2087 ; AVX512BW-NEXT: vzeroupper
2088 ; AVX512BW-NEXT: retq
2089 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2090 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2091 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2092 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2093 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16>
2094 %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2095 %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8>
2096 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2097 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2098 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2099 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
2103 define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2104 ; SSE2-LABEL: vec256_v16i16_to_v1i256_factor16:
2106 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2107 ; SSE2-NEXT: paddb (%rsi), %xmm0
2108 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2109 ; SSE2-NEXT: movaps 16(%rdx), %xmm1
2110 ; SSE2-NEXT: paddb (%rdx), %xmm0
2111 ; SSE2-NEXT: movaps %xmm1, 16(%rcx)
2112 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
2115 ; SSE42-LABEL: vec256_v16i16_to_v1i256_factor16:
2117 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2118 ; SSE42-NEXT: paddb (%rsi), %xmm0
2119 ; SSE42-NEXT: pxor %xmm1, %xmm1
2120 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
2121 ; SSE42-NEXT: movaps 16(%rdx), %xmm0
2122 ; SSE42-NEXT: paddb (%rdx), %xmm1
2123 ; SSE42-NEXT: movaps %xmm0, 16(%rcx)
2124 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
2127 ; AVX-LABEL: vec256_v16i16_to_v1i256_factor16:
2129 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2130 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2131 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
2132 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
2133 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
2134 ; AVX-NEXT: vmovaps 16(%rdx), %xmm1
2135 ; AVX-NEXT: vmovaps %xmm1, 16(%rcx)
2136 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
2139 ; AVX2-LABEL: vec256_v16i16_to_v1i256_factor16:
2141 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2142 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2143 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
2144 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2145 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2146 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2147 ; AVX2-NEXT: vzeroupper
2150 ; AVX512F-LABEL: vec256_v16i16_to_v1i256_factor16:
2152 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
2153 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2154 ; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
2155 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
2156 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2157 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
2158 ; AVX512F-NEXT: vzeroupper
2159 ; AVX512F-NEXT: retq
2161 ; AVX512BW-LABEL: vec256_v16i16_to_v1i256_factor16:
2162 ; AVX512BW: # %bb.0:
2163 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
2164 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2165 ; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
2166 ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
2167 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2168 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2169 ; AVX512BW-NEXT: vzeroupper
2170 ; AVX512BW-NEXT: retq
2171 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2172 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2173 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2174 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2175 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16>
2176 %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2177 %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8>
2178 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2179 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2180 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2181 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
2185 define void @vec256_v8i32_to_v4i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2186 ; SSE2-LABEL: vec256_v8i32_to_v4i64_factor2:
2188 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2189 ; SSE2-NEXT: paddb (%rsi), %xmm0
2190 ; SSE2-NEXT: pxor %xmm1, %xmm1
2191 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2192 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2193 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2194 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
2195 ; SSE2-NEXT: paddb (%rdx), %xmm2
2196 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
2197 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
2200 ; SSE42-LABEL: vec256_v8i32_to_v4i64_factor2:
2202 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2203 ; SSE42-NEXT: paddb (%rsi), %xmm0
2204 ; SSE42-NEXT: pxor %xmm1, %xmm1
2205 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
2206 ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2207 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
2208 ; SSE42-NEXT: paddb (%rdx), %xmm2
2209 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
2210 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2213 ; AVX-LABEL: vec256_v8i32_to_v4i64_factor2:
2215 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2216 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2217 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
2218 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
2219 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2220 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
2221 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2222 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2223 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
2226 ; AVX2-LABEL: vec256_v8i32_to_v4i64_factor2:
2228 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
2229 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2230 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2231 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2232 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2233 ; AVX2-NEXT: vzeroupper
2236 ; AVX512F-LABEL: vec256_v8i32_to_v4i64_factor2:
2238 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2239 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2240 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2241 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2242 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
2243 ; AVX512F-NEXT: vzeroupper
2244 ; AVX512F-NEXT: retq
2246 ; AVX512BW-LABEL: vec256_v8i32_to_v4i64_factor2:
2247 ; AVX512BW: # %bb.0:
2248 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
2249 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2250 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2251 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2252 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2253 ; AVX512BW-NEXT: vzeroupper
2254 ; AVX512BW-NEXT: retq
2255 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2256 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2257 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2258 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2259 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32>
2260 %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
2261 %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8>
2262 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2263 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2264 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2265 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
2269 define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2270 ; SSE2-LABEL: vec256_v8i32_to_v2i128_factor4:
2272 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2273 ; SSE2-NEXT: paddb (%rsi), %xmm0
2274 ; SSE2-NEXT: xorps %xmm1, %xmm1
2275 ; SSE2-NEXT: xorps %xmm2, %xmm2
2276 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
2277 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
2278 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
2279 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
2280 ; SSE2-NEXT: paddb (%rdx), %xmm2
2281 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
2282 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
2285 ; SSE42-LABEL: vec256_v8i32_to_v2i128_factor4:
2287 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2288 ; SSE42-NEXT: paddb (%rsi), %xmm0
2289 ; SSE42-NEXT: pxor %xmm1, %xmm1
2290 ; SSE42-NEXT: pxor %xmm2, %xmm2
2291 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
2292 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2293 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2294 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
2295 ; SSE42-NEXT: paddb (%rdx), %xmm2
2296 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
2297 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2300 ; AVX-LABEL: vec256_v8i32_to_v2i128_factor4:
2302 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2303 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2304 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
2305 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2306 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
2307 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2308 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
2309 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
2310 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
2311 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
2312 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
2313 ; AVX-NEXT: vzeroupper
2316 ; AVX2-SLOW-LABEL: vec256_v8i32_to_v2i128_factor4:
2317 ; AVX2-SLOW: # %bb.0:
2318 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
2319 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2320 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
2321 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
2322 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2323 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2324 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2325 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
2326 ; AVX2-SLOW-NEXT: vzeroupper
2327 ; AVX2-SLOW-NEXT: retq
2329 ; AVX2-FAST-PERLANE-LABEL: vec256_v8i32_to_v2i128_factor4:
2330 ; AVX2-FAST-PERLANE: # %bb.0:
2331 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
2332 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2333 ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
2334 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
2335 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1
2336 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2337 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2338 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
2339 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2340 ; AVX2-FAST-PERLANE-NEXT: retq
2342 ; AVX2-FAST-LABEL: vec256_v8i32_to_v2i128_factor4:
2343 ; AVX2-FAST: # %bb.0:
2344 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
2345 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2346 ; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,1,0]
2347 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
2348 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
2349 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
2350 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2351 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
2352 ; AVX2-FAST-NEXT: vzeroupper
2353 ; AVX2-FAST-NEXT: retq
2355 ; AVX512F-LABEL: vec256_v8i32_to_v2i128_factor4:
2357 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
2358 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2359 ; AVX512F-NEXT: movb $17, %al
2360 ; AVX512F-NEXT: kmovw %eax, %k1
2361 ; AVX512F-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
2362 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2363 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
2364 ; AVX512F-NEXT: vzeroupper
2365 ; AVX512F-NEXT: retq
2367 ; AVX512BW-LABEL: vec256_v8i32_to_v2i128_factor4:
2368 ; AVX512BW: # %bb.0:
2369 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
2370 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2371 ; AVX512BW-NEXT: movb $17, %al
2372 ; AVX512BW-NEXT: kmovd %eax, %k1
2373 ; AVX512BW-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
2374 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2375 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2376 ; AVX512BW-NEXT: vzeroupper
2377 ; AVX512BW-NEXT: retq
2378 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2379 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2380 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2381 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2382 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32>
2383 %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
2384 %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8>
2385 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2386 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2387 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2388 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
2392 define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2393 ; SSE2-LABEL: vec256_v8i32_to_v1i256_factor8:
2395 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2396 ; SSE2-NEXT: paddb (%rsi), %xmm0
2397 ; SSE2-NEXT: xorps %xmm1, %xmm1
2398 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2399 ; SSE2-NEXT: movaps 16(%rdx), %xmm0
2400 ; SSE2-NEXT: paddb (%rdx), %xmm1
2401 ; SSE2-NEXT: movaps %xmm0, 16(%rcx)
2402 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
2405 ; SSE42-LABEL: vec256_v8i32_to_v1i256_factor8:
2407 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2408 ; SSE42-NEXT: paddb (%rsi), %xmm0
2409 ; SSE42-NEXT: pxor %xmm1, %xmm1
2410 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2411 ; SSE42-NEXT: movaps 16(%rdx), %xmm0
2412 ; SSE42-NEXT: paddb (%rdx), %xmm1
2413 ; SSE42-NEXT: movaps %xmm0, 16(%rcx)
2414 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
2417 ; AVX-LABEL: vec256_v8i32_to_v1i256_factor8:
2419 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2420 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2421 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
2422 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2423 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
2424 ; AVX-NEXT: vmovaps 16(%rdx), %xmm1
2425 ; AVX-NEXT: vmovaps %xmm1, 16(%rcx)
2426 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
2429 ; AVX2-LABEL: vec256_v8i32_to_v1i256_factor8:
2431 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2432 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2433 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2434 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2435 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2436 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2437 ; AVX2-NEXT: vzeroupper
2440 ; AVX512F-LABEL: vec256_v8i32_to_v1i256_factor8:
2442 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
2443 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2444 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
2445 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2446 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2447 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
2448 ; AVX512F-NEXT: vzeroupper
2449 ; AVX512F-NEXT: retq
2451 ; AVX512BW-LABEL: vec256_v8i32_to_v1i256_factor8:
2452 ; AVX512BW: # %bb.0:
2453 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
2454 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2455 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2456 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
2457 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2458 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2459 ; AVX512BW-NEXT: vzeroupper
2460 ; AVX512BW-NEXT: retq
2461 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2462 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2463 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2464 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2465 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32>
2466 %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2467 %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8>
2468 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2469 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2470 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2471 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
2475 define void @vec256_v4i64_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2476 ; SSE-LABEL: vec256_v4i64_to_v2i128_factor2:
2478 ; SSE-NEXT: movdqa (%rdi), %xmm0
2479 ; SSE-NEXT: paddb (%rsi), %xmm0
2480 ; SSE-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
2481 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
2482 ; SSE-NEXT: paddb 16(%rdx), %xmm0
2483 ; SSE-NEXT: paddb (%rdx), %xmm1
2484 ; SSE-NEXT: movdqa %xmm1, (%rcx)
2485 ; SSE-NEXT: movdqa %xmm0, 16(%rcx)
2488 ; AVX-LABEL: vec256_v4i64_to_v2i128_factor2:
2490 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2491 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2492 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2493 ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
2494 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
2495 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
2496 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
2497 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
2498 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
2499 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
2500 ; AVX-NEXT: vzeroupper
2503 ; AVX2-LABEL: vec256_v4i64_to_v2i128_factor2:
2505 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2506 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2507 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
2508 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2509 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
2510 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2511 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2512 ; AVX2-NEXT: vzeroupper
2515 ; AVX512F-LABEL: vec256_v4i64_to_v2i128_factor2:
2517 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
2518 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2519 ; AVX512F-NEXT: movb $5, %al
2520 ; AVX512F-NEXT: kmovw %eax, %k1
2521 ; AVX512F-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
2522 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2523 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
2524 ; AVX512F-NEXT: vzeroupper
2525 ; AVX512F-NEXT: retq
2527 ; AVX512BW-LABEL: vec256_v4i64_to_v2i128_factor2:
2528 ; AVX512BW: # %bb.0:
2529 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
2530 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2531 ; AVX512BW-NEXT: movb $5, %al
2532 ; AVX512BW-NEXT: kmovd %eax, %k1
2533 ; AVX512BW-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
2534 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2535 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2536 ; AVX512BW-NEXT: vzeroupper
2537 ; AVX512BW-NEXT: retq
2538 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2539 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2540 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2541 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2542 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <4 x i64>
2543 %zextd.vec = shufflevector <4 x i64> %in.vec.cast, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2544 %out.bytevec = bitcast <4 x i64> %zextd.vec to <32 x i8>
2545 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2546 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2547 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2548 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
2552 define void @vec256_v4i64_to_v1i256_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2553 ; SSE-LABEL: vec256_v4i64_to_v1i256_factor4:
2555 ; SSE-NEXT: movdqa (%rdi), %xmm0
2556 ; SSE-NEXT: paddb (%rsi), %xmm0
2557 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
2558 ; SSE-NEXT: movaps 16(%rdx), %xmm1
2559 ; SSE-NEXT: paddb (%rdx), %xmm0
2560 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
2561 ; SSE-NEXT: movdqa %xmm0, (%rcx)
2564 ; AVX-LABEL: vec256_v4i64_to_v1i256_factor4:
2566 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2567 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2568 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2569 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
2570 ; AVX-NEXT: vmovaps 16(%rdx), %xmm1
2571 ; AVX-NEXT: vmovaps %xmm1, 16(%rcx)
2572 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
2575 ; AVX2-LABEL: vec256_v4i64_to_v1i256_factor4:
2577 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2578 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2579 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2580 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2581 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2582 ; AVX2-NEXT: vzeroupper
2585 ; AVX512F-LABEL: vec256_v4i64_to_v1i256_factor4:
2587 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
2588 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2589 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2590 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2591 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
2592 ; AVX512F-NEXT: vzeroupper
2593 ; AVX512F-NEXT: retq
2595 ; AVX512BW-LABEL: vec256_v4i64_to_v1i256_factor4:
2596 ; AVX512BW: # %bb.0:
2597 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
2598 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2599 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2600 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2601 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2602 ; AVX512BW-NEXT: vzeroupper
2603 ; AVX512BW-NEXT: retq
2604 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2605 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2606 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2607 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2608 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <4 x i64>
2609 %zextd.vec = shufflevector <4 x i64> %in.vec.cast, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2610 %out.bytevec = bitcast <4 x i64> %zextd.vec to <32 x i8>
2611 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2612 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2613 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2614 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
2618 define void @vec256_v2i128_to_v1i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2619 ; SSE-LABEL: vec256_v2i128_to_v1i256_factor2:
2621 ; SSE-NEXT: movdqa (%rdi), %xmm0
2622 ; SSE-NEXT: paddb (%rsi), %xmm0
2623 ; SSE-NEXT: movaps 16(%rdx), %xmm1
2624 ; SSE-NEXT: paddb (%rdx), %xmm0
2625 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
2626 ; SSE-NEXT: movdqa %xmm0, (%rcx)
2629 ; AVX-LABEL: vec256_v2i128_to_v1i256_factor2:
2631 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2632 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2633 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
2634 ; AVX-NEXT: vmovaps 16(%rdx), %xmm1
2635 ; AVX-NEXT: vmovaps %xmm1, 16(%rcx)
2636 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
2639 ; AVX2-LABEL: vec256_v2i128_to_v1i256_factor2:
2641 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
2642 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2643 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2644 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
2645 ; AVX2-NEXT: vzeroupper
2648 ; AVX512F-LABEL: vec256_v2i128_to_v1i256_factor2:
2650 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2651 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2652 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
2653 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
2654 ; AVX512F-NEXT: vzeroupper
2655 ; AVX512F-NEXT: retq
2657 ; AVX512BW-LABEL: vec256_v2i128_to_v1i256_factor2:
2658 ; AVX512BW: # %bb.0:
2659 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
2660 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2661 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2662 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2663 ; AVX512BW-NEXT: vzeroupper
2664 ; AVX512BW-NEXT: retq
2665 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2666 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2667 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2668 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2669 %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <2 x i128>
2670 %zextd.vec = shufflevector <2 x i128> %in.vec.cast, <2 x i128> zeroinitializer, <2 x i32> <i32 0, i32 3>
2671 %out.bytevec = bitcast <2 x i128> %zextd.vec to <32 x i8>
2672 %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2673 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2674 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2675 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
2679 define void @vec384_v48i8_to_v24i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2680 ; SSE2-LABEL: vec384_v48i8_to_v24i16_factor2:
2682 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2683 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
2684 ; SSE2-NEXT: paddb (%rsi), %xmm0
2685 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
2686 ; SSE2-NEXT: pxor %xmm2, %xmm2
2687 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2688 ; SSE2-NEXT: movdqa %xmm0, %xmm3
2689 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2690 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2691 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
2692 ; SSE2-NEXT: paddb (%rdx), %xmm3
2693 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
2694 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
2695 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
2696 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
2699 ; SSE42-LABEL: vec384_v48i8_to_v24i16_factor2:
2701 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2702 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
2703 ; SSE42-NEXT: paddb (%rsi), %xmm0
2704 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
2705 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2706 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2707 ; SSE42-NEXT: pxor %xmm3, %xmm3
2708 ; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2709 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
2710 ; SSE42-NEXT: paddb (%rdx), %xmm2
2711 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
2712 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
2713 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
2714 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2717 ; AVX-LABEL: vec384_v48i8_to_v24i16_factor2:
2719 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2720 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
2721 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
2722 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2723 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2724 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
2725 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2726 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2727 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
2728 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
2729 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
2730 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
2731 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
2732 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
2735 ; AVX2-LABEL: vec384_v48i8_to_v24i16_factor2:
2737 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2738 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2739 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2740 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
2741 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2742 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2743 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2744 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2745 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2746 ; AVX2-NEXT: vzeroupper
2749 ; AVX512F-LABEL: vec384_v48i8_to_v24i16_factor2:
2751 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
2752 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2753 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2754 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
2755 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2756 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2757 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2758 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2759 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2760 ; AVX512F-NEXT: vzeroupper
2761 ; AVX512F-NEXT: retq
2763 ; AVX512BW-LABEL: vec384_v48i8_to_v24i16_factor2:
2764 ; AVX512BW: # %bb.0:
2765 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
2766 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2767 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
2768 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2769 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2770 ; AVX512BW-NEXT: vzeroupper
2771 ; AVX512BW-NEXT: retq
2772 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2773 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2774 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2775 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
2776 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 1, i32 51, i32 2, i32 53, i32 3, i32 55, i32 4, i32 57, i32 5, i32 59, i32 6, i32 61, i32 7, i32 63, i32 8, i32 65, i32 9, i32 67, i32 10, i32 69, i32 11, i32 71, i32 12, i32 73, i32 13, i32 75, i32 14, i32 77, i32 15, i32 79, i32 16, i32 81, i32 17, i32 83, i32 18, i32 85, i32 19, i32 87, i32 20, i32 89, i32 21, i32 91, i32 22, i32 93, i32 23, i32 95>
2777 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2778 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2779 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2780 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
2784 define void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2785 ; SSE2-LABEL: vec384_v48i8_to_v16i24_factor3:
2787 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2788 ; SSE2-NEXT: paddb (%rsi), %xmm0
2789 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
2790 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5]
2791 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2792 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
2793 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7]
2794 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
2795 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2796 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2797 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
2798 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,6]
2799 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2800 ; SSE2-NEXT: paddb (%rdx), %xmm0
2801 ; SSE2-NEXT: paddb 32(%rdx), %xmm2
2802 ; SSE2-NEXT: paddb 16(%rdx), %xmm1
2803 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
2804 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
2805 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
2808 ; SSE42-LABEL: vec384_v48i8_to_v16i24_factor3:
2810 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2811 ; SSE42-NEXT: paddb (%rsi), %xmm0
2812 ; SSE42-NEXT: movdqa %xmm0, %xmm1
2813 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[11],zero,zero,xmm1[12],zero,zero,xmm1[13],zero,zero,xmm1[14],zero,zero,xmm1[15],zero,zero
2814 ; SSE42-NEXT: movdqa %xmm0, %xmm2
2815 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero,zero,xmm2[5]
2816 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero
2817 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
2818 ; SSE42-NEXT: paddb (%rdx), %xmm2
2819 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
2820 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
2821 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
2822 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2825 ; AVX-LABEL: vec384_v48i8_to_v16i24_factor3:
2827 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2828 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2829 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero,zero,xmm0[5]
2830 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero
2831 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero
2832 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
2833 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
2834 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2835 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2836 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
2837 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
2840 ; AVX2-LABEL: vec384_v48i8_to_v16i24_factor3:
2842 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2843 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2844 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
2845 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero
2846 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero
2847 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2848 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2849 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
2850 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2851 ; AVX2-NEXT: vzeroupper
2854 ; AVX512F-LABEL: vec384_v48i8_to_v16i24_factor3:
2856 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
2857 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2858 ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
2859 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero
2860 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero
2861 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2862 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2863 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2864 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2865 ; AVX512F-NEXT: vzeroupper
2866 ; AVX512F-NEXT: retq
2868 ; AVX512BW-LABEL: vec384_v48i8_to_v16i24_factor3:
2869 ; AVX512BW: # %bb.0:
2870 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
2871 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
2872 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2,0,3,3,0,4,4,0,5]
2873 ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm1
2874 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2875 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero
2876 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
2877 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
2878 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2879 ; AVX512BW-NEXT: vzeroupper
2880 ; AVX512BW-NEXT: retq
2881 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
2882 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
2883 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
2884 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
2885 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 1, i32 52, i32 53, i32 2, i32 55, i32 56, i32 3, i32 58, i32 59, i32 4, i32 61, i32 62, i32 5, i32 64, i32 65, i32 6, i32 67, i32 68, i32 7, i32 70, i32 71, i32 8, i32 73, i32 74, i32 9, i32 76, i32 77, i32 10, i32 79, i32 80, i32 11, i32 82, i32 83, i32 12, i32 85, i32 86, i32 13, i32 88, i32 89, i32 14, i32 91, i32 92, i32 15, i32 94, i32 95>
2886 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2887 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
2888 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
2889 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
2893 define void @vec384_v48i8_to_v12i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
2894 ; SSE2-LABEL: vec384_v48i8_to_v12i32_factor4:
2896 ; SSE2-NEXT: movdqa (%rdi), %xmm0
2897 ; SSE2-NEXT: paddb (%rsi), %xmm0
2898 ; SSE2-NEXT: pxor %xmm1, %xmm1
2899 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2900 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2901 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2902 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2903 ; SSE2-NEXT: movdqa %xmm0, %xmm3
2904 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
2905 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2906 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
2907 ; SSE2-NEXT: paddb (%rdx), %xmm3
2908 ; SSE2-NEXT: paddb 32(%rdx), %xmm2
2909 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
2910 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
2911 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
2914 ; SSE42-LABEL: vec384_v48i8_to_v12i32_factor4:
2916 ; SSE42-NEXT: movdqa (%rdi), %xmm0
2917 ; SSE42-NEXT: paddb (%rsi), %xmm0
2918 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2919 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
2920 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
2921 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2922 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2923 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
2924 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
2925 ; SSE42-NEXT: paddb (%rdx), %xmm1
2926 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
2927 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
2928 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
2931 ; AVX-LABEL: vec384_v48i8_to_v12i32_factor4:
2933 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2934 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2935 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2936 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
2937 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
2938 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2939 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2940 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
2941 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
2942 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
2943 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
2944 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
2945 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
2948 ; AVX2-SLOW-LABEL: vec384_v48i8_to_v12i32_factor4:
2949 ; AVX2-SLOW: # %bb.0:
2950 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
2951 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2952 ; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2953 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2954 ; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2955 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2956 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2957 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
2958 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
2959 ; AVX2-SLOW-NEXT: vzeroupper
2960 ; AVX2-SLOW-NEXT: retq
2962 ; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v12i32_factor4:
2963 ; AVX2-FAST-PERLANE: # %bb.0:
2964 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
2965 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2966 ; AVX2-FAST-PERLANE-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2967 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
2968 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2969 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2970 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
2971 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
2972 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2973 ; AVX2-FAST-PERLANE-NEXT: retq
2975 ; AVX2-FAST-LABEL: vec384_v48i8_to_v12i32_factor4:
2976 ; AVX2-FAST: # %bb.0:
2977 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
2978 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2979 ; AVX2-FAST-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2980 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
2981 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2982 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2983 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
2984 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
2985 ; AVX2-FAST-NEXT: vzeroupper
2986 ; AVX2-FAST-NEXT: retq
2988 ; AVX512F-LABEL: vec384_v48i8_to_v12i32_factor4:
2990 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
2991 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2992 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2993 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
2994 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2995 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
2996 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
2997 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
2998 ; AVX512F-NEXT: vzeroupper
2999 ; AVX512F-NEXT: retq
3001 ; AVX512BW-LABEL: vec384_v48i8_to_v12i32_factor4:
3002 ; AVX512BW: # %bb.0:
3003 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
3004 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3005 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
3006 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3007 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3008 ; AVX512BW-NEXT: vzeroupper
3009 ; AVX512BW-NEXT: retq
3010 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3011 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3012 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3013 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3014 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 1, i32 53, i32 54, i32 55, i32 2, i32 57, i32 58, i32 59, i32 3, i32 61, i32 62, i32 63, i32 4, i32 65, i32 66, i32 67, i32 5, i32 69, i32 70, i32 71, i32 6, i32 73, i32 74, i32 75, i32 7, i32 77, i32 78, i32 79, i32 8, i32 81, i32 82, i32 83, i32 9, i32 85, i32 86, i32 87, i32 10, i32 89, i32 90, i32 91, i32 11, i32 93, i32 94, i32 95>
3015 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3016 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3017 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3018 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3019 ret void
3020 }
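; vec384_v48i8_to_v8i48_factor6: the zext shuffle keeps source byte k at byte offset 6*k of the 48-byte result and zeroes every other byte.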
3022 define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3023 ; SSE2-LABEL: vec384_v48i8_to_v8i48_factor6:
3025 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3026 ; SSE2-NEXT: paddb (%rsi), %xmm0
3027 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3028 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
3029 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
3030 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
3031 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
3032 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
3033 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3034 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3035 ; SSE2-NEXT: paddb (%rdx), %xmm2
3036 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
3037 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
3038 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
3039 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
3042 ; SSE42-LABEL: vec384_v48i8_to_v8i48_factor6:
3044 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3045 ; SSE42-NEXT: paddb (%rsi), %xmm0
3046 ; SSE42-NEXT: movdqa %xmm0, %xmm1
3047 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7],zero,zero,zero,zero,zero
3048 ; SSE42-NEXT: movdqa %xmm0, %xmm2
3049 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero
3050 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero
3051 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
3052 ; SSE42-NEXT: paddb (%rdx), %xmm2
3053 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
3054 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
3055 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
3056 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
3059 ; AVX-LABEL: vec384_v48i8_to_v8i48_factor6:
3061 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3062 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3063 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero
3064 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero
3065 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero
3066 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
3067 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
3068 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3069 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3070 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
3071 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
3074 ; AVX2-LABEL: vec384_v48i8_to_v8i48_factor6:
3076 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3077 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3078 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero
3079 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u]
3080 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
3081 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3082 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3083 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
3084 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
3085 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
3086 ; AVX2-NEXT: vzeroupper
3089 ; AVX512F-LABEL: vec384_v48i8_to_v8i48_factor6:
3091 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3092 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3093 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero
3094 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u]
3095 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
3096 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3097 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3098 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
3099 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
3100 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
3101 ; AVX512F-NEXT: vzeroupper
3102 ; AVX512F-NEXT: retq
3104 ; AVX512BW-LABEL: vec384_v48i8_to_v8i48_factor6:
3105 ; AVX512BW: # %bb.0:
3106 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
3107 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3108 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3109 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,3,0,1,4,0,2,5,0,3,0,1,4,0,2,5]
3110 ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
3111 ; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
3112 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3113 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero
3114 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3115 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3116 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3117 ; AVX512BW-NEXT: vzeroupper
3118 ; AVX512BW-NEXT: retq
3119 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3120 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3121 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3122 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3123 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 1, i32 55, i32 56, i32 57, i32 58, i32 59, i32 2, i32 61, i32 62, i32 63, i32 64, i32 65, i32 3, i32 67, i32 68, i32 69, i32 70, i32 71, i32 4, i32 73, i32 74, i32 75, i32 76, i32 77, i32 5, i32 79, i32 80, i32 81, i32 82, i32 83, i32 6, i32 85, i32 86, i32 87, i32 88, i32 89, i32 7, i32 91, i32 92, i32 93, i32 94, i32 95>
3124 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3125 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3126 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3127 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3128 ret void
3129 }
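; vec384_v48i8_to_v6i64_factor8: source byte k lands at byte offset 8*k (the low byte of each i64 lane); the remaining bytes are zero.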
3131 define void @vec384_v48i8_to_v6i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3132 ; SSE2-LABEL: vec384_v48i8_to_v6i64_factor8:
3134 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3135 ; SSE2-NEXT: paddb (%rsi), %xmm0
3136 ; SSE2-NEXT: pxor %xmm1, %xmm1
3137 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3138 ; SSE2-NEXT: movdqa %xmm0, %xmm2
3139 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3140 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3141 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3142 ; SSE2-NEXT: movdqa %xmm0, %xmm3
3143 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3144 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3145 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3146 ; SSE2-NEXT: paddb (%rdx), %xmm3
3147 ; SSE2-NEXT: paddb 32(%rdx), %xmm2
3148 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
3149 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
3150 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
3153 ; SSE42-LABEL: vec384_v48i8_to_v6i64_factor8:
3155 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3156 ; SSE42-NEXT: paddb (%rsi), %xmm0
3157 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3158 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
3159 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
3160 ; SSE42-NEXT: psrld $16, %xmm0
3161 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3162 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
3163 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
3164 ; SSE42-NEXT: paddb (%rdx), %xmm1
3165 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
3166 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
3167 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
3170 ; AVX-LABEL: vec384_v48i8_to_v6i64_factor8:
3172 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3173 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3174 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3175 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm2
3176 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
3177 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
3178 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3179 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
3180 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
3181 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3182 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3183 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
3184 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
3187 ; AVX2-SLOW-LABEL: vec384_v48i8_to_v6i64_factor8:
3188 ; AVX2-SLOW: # %bb.0:
3189 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
3190 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3191 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
3192 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
3193 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3194 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3195 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3196 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
3197 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
3198 ; AVX2-SLOW-NEXT: vzeroupper
3199 ; AVX2-SLOW-NEXT: retq
3201 ; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v6i64_factor8:
3202 ; AVX2-FAST-PERLANE: # %bb.0:
3203 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
3204 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3205 ; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
3206 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
3207 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3208 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3209 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
3210 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
3211 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3212 ; AVX2-FAST-PERLANE-NEXT: retq
3214 ; AVX2-FAST-LABEL: vec384_v48i8_to_v6i64_factor8:
3215 ; AVX2-FAST: # %bb.0:
3216 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
3217 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3218 ; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
3219 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
3220 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3221 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3222 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
3223 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
3224 ; AVX2-FAST-NEXT: vzeroupper
3225 ; AVX2-FAST-NEXT: retq
3227 ; AVX512F-LABEL: vec384_v48i8_to_v6i64_factor8:
3229 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3230 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3231 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
3232 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
3233 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3234 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3235 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3236 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3237 ; AVX512F-NEXT: vzeroupper
3238 ; AVX512F-NEXT: retq
3240 ; AVX512BW-LABEL: vec384_v48i8_to_v6i64_factor8:
3241 ; AVX512BW: # %bb.0:
3242 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
3243 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3244 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
3245 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3246 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3247 ; AVX512BW-NEXT: vzeroupper
3248 ; AVX512BW-NEXT: retq
3249 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3250 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3251 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3252 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3253 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 1, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 2, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 3, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 4, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 5, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3254 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3255 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3256 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3257 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3258 ret void
3259 }
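; vec384_v48i8_to_v4i96_factor12: source byte k lands at byte offset 12*k of the 48-byte result; all other bytes are zero.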
3261 define void @vec384_v48i8_to_v4i96_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3262 ; SSE2-LABEL: vec384_v48i8_to_v4i96_factor12:
3264 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3265 ; SSE2-NEXT: paddb (%rsi), %xmm0
3266 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
3267 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3268 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
3269 ; SSE2-NEXT: movdqa %xmm0, %xmm2
3270 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
3271 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3272 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
3273 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3274 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
3275 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3276 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3277 ; SSE2-NEXT: paddb (%rdx), %xmm0
3278 ; SSE2-NEXT: paddb 16(%rdx), %xmm2
3279 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
3280 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
3281 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
3282 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
3285 ; SSE42-LABEL: vec384_v48i8_to_v4i96_factor12:
3287 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3288 ; SSE42-NEXT: paddb (%rsi), %xmm0
3289 ; SSE42-NEXT: movdqa %xmm0, %xmm1
3290 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3291 ; SSE42-NEXT: movdqa %xmm0, %xmm2
3292 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero
3293 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero
3294 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
3295 ; SSE42-NEXT: paddb (%rdx), %xmm2
3296 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
3297 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
3298 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
3299 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
3302 ; AVX-LABEL: vec384_v48i8_to_v4i96_factor12:
3304 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3305 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3306 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero
3307 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero
3308 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3309 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
3310 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
3311 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3312 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3313 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
3314 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
3317 ; AVX2-LABEL: vec384_v48i8_to_v4i96_factor12:
3319 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3320 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3321 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3322 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
3323 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3324 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3325 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3326 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
3327 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
3328 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
3329 ; AVX2-NEXT: vzeroupper
3332 ; AVX512F-LABEL: vec384_v48i8_to_v4i96_factor12:
3334 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3335 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3336 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3337 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
3338 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3339 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3340 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3341 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
3342 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
3343 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
3344 ; AVX512F-NEXT: vzeroupper
3345 ; AVX512F-NEXT: retq
3347 ; AVX512BW-LABEL: vec384_v48i8_to_v4i96_factor12:
3348 ; AVX512BW: # %bb.0:
3349 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
3350 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3351 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
3352 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3353 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3354 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3355 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3356 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3357 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3358 ; AVX512BW-NEXT: vzeroupper
3359 ; AVX512BW-NEXT: retq
3360 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3361 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3362 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3363 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3364 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 1, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 2, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 3, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3365 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3366 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3367 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3368 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3369 ret void
3370 }
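; vec384_v48i8_to_v3i128_factor16: source byte k lands at byte offset 16*k (the low byte of each i128 lane); all other bytes are zero.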
3372 define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3373 ; SSE2-LABEL: vec384_v48i8_to_v3i128_factor16:
3375 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3376 ; SSE2-NEXT: paddb (%rsi), %xmm0
3377 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
3378 ; SSE2-NEXT: pand %xmm0, %xmm1
3379 ; SSE2-NEXT: movdqa %xmm0, %xmm2
3380 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
3381 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3382 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
3383 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3384 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3385 ; SSE2-NEXT: paddb 32(%rdx), %xmm2
3386 ; SSE2-NEXT: paddb (%rdx), %xmm1
3387 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
3388 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
3389 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
3392 ; SSE42-LABEL: vec384_v48i8_to_v3i128_factor16:
3394 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3395 ; SSE42-NEXT: paddb (%rsi), %xmm0
3396 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0]
3397 ; SSE42-NEXT: pand %xmm0, %xmm1
3398 ; SSE42-NEXT: movdqa %xmm0, %xmm2
3399 ; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
3400 ; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3401 ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
3402 ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3403 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
3404 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
3405 ; SSE42-NEXT: paddb (%rdx), %xmm1
3406 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
3407 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
3408 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
3411 ; AVX-LABEL: vec384_v48i8_to_v3i128_factor16:
3413 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3414 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3415 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
3416 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
3417 ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3418 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
3419 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3420 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
3421 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
3422 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3423 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3424 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
3425 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
3428 ; AVX2-SLOW-LABEL: vec384_v48i8_to_v3i128_factor16:
3429 ; AVX2-SLOW: # %bb.0:
3430 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
3431 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3432 ; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
3433 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3434 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3435 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
3436 ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3437 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3438 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
3439 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
3440 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
3441 ; AVX2-SLOW-NEXT: vzeroupper
3442 ; AVX2-SLOW-NEXT: retq
3444 ; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v3i128_factor16:
3445 ; AVX2-FAST-PERLANE: # %bb.0:
3446 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
3447 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3448 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3449 ; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3450 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
3451 ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3452 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3453 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
3454 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx)
3455 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
3456 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3457 ; AVX2-FAST-PERLANE-NEXT: retq
3459 ; AVX2-FAST-LABEL: vec384_v48i8_to_v3i128_factor16:
3460 ; AVX2-FAST: # %bb.0:
3461 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
3462 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3463 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3464 ; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3465 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
3466 ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3467 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3468 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
3469 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
3470 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
3471 ; AVX2-FAST-NEXT: vzeroupper
3472 ; AVX2-FAST-NEXT: retq
3474 ; AVX512F-LABEL: vec384_v48i8_to_v3i128_factor16:
3476 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3477 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3478 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3479 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3480 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
3481 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3482 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3483 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
3484 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
3485 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
3486 ; AVX512F-NEXT: vzeroupper
3487 ; AVX512F-NEXT: retq
3489 ; AVX512BW-LABEL: vec384_v48i8_to_v3i128_factor16:
3490 ; AVX512BW: # %bb.0:
3491 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
3492 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3493 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3494 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
3495 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3496 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3497 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3498 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3499 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3500 ; AVX512BW-NEXT: vzeroupper
3501 ; AVX512BW-NEXT: retq
3502 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3503 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3504 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3505 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3506 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 1, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 2, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3507 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3508 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3509 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3510 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3511 ret void
3512 }
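; vec384_v48i8_to_v2i192_factor24: source bytes 0 and 1 land at byte offsets 0 and 24; all other bytes are zero.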
3514 define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3515 ; SSE2-LABEL: vec384_v48i8_to_v2i192_factor24:
3517 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3518 ; SSE2-NEXT: paddb (%rsi), %xmm0
3519 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
3520 ; SSE2-NEXT: pand %xmm0, %xmm1
3521 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
3522 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3523 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
3524 ; SSE2-NEXT: movaps 32(%rdx), %xmm2
3525 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3526 ; SSE2-NEXT: paddb (%rdx), %xmm1
3527 ; SSE2-NEXT: movaps %xmm2, 32(%rcx)
3528 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
3529 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
3532 ; SSE42-LABEL: vec384_v48i8_to_v2i192_factor24:
3534 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3535 ; SSE42-NEXT: paddb (%rsi), %xmm0
3536 ; SSE42-NEXT: movdqa %xmm0, %xmm1
3537 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
3538 ; SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3539 ; SSE42-NEXT: movaps 32(%rdx), %xmm2
3540 ; SSE42-NEXT: paddb (%rdx), %xmm0
3541 ; SSE42-NEXT: paddb 16(%rdx), %xmm1
3542 ; SSE42-NEXT: movaps %xmm2, 32(%rcx)
3543 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
3544 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
3547 ; AVX-LABEL: vec384_v48i8_to_v2i192_factor24:
3549 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3550 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3551 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
3552 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3553 ; AVX-NEXT: vmovaps 32(%rdx), %ymm2
3554 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3555 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3556 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
3557 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3558 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
3559 ; AVX-NEXT: vzeroupper
3562 ; AVX2-LABEL: vec384_v48i8_to_v2i192_factor24:
3564 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
3565 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3566 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3567 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
3568 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3569 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
3570 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3571 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
3572 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
3573 ; AVX2-NEXT: vzeroupper
3576 ; AVX512F-LABEL: vec384_v48i8_to_v2i192_factor24:
3578 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
3579 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3580 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3581 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
3582 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3583 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3584 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
3585 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
3586 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
3587 ; AVX512F-NEXT: vzeroupper
3588 ; AVX512F-NEXT: retq
3590 ; AVX512BW-LABEL: vec384_v48i8_to_v2i192_factor24:
3591 ; AVX512BW: # %bb.0:
3592 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
3593 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3594 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
3595 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
3596 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
3597 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3598 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3599 ; AVX512BW-NEXT: vzeroupper
3600 ; AVX512BW-NEXT: retq
3601 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3602 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3603 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3604 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3605 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 1, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3606 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3607 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3608 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3609 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3610 ret void
3611 }
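; vec384_v48i8_to_v1i384_factor48: only source byte 0 survives, at byte offset 0 of the 48-byte result; everything else is zero.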
3613 define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3614 ; SSE-LABEL: vec384_v48i8_to_v1i384_factor48:
3616 ; SSE-NEXT: movdqa (%rdi), %xmm0
3617 ; SSE-NEXT: paddb (%rsi), %xmm0
3618 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3619 ; SSE-NEXT: movaps 16(%rdx), %xmm1
3620 ; SSE-NEXT: movaps 32(%rdx), %xmm2
3621 ; SSE-NEXT: paddb (%rdx), %xmm0
3622 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
3623 ; SSE-NEXT: movaps %xmm2, 32(%rcx)
3624 ; SSE-NEXT: movdqa %xmm0, (%rcx)
3627 ; AVX-LABEL: vec384_v48i8_to_v1i384_factor48:
3629 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3630 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3631 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
3632 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
3633 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
3634 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
3635 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
3636 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
3637 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
3638 ; AVX-NEXT: vzeroupper
3641 ; AVX2-LABEL: vec384_v48i8_to_v1i384_factor48:
3643 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
3644 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3645 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
3646 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
3647 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
3648 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3649 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
3650 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
3651 ; AVX2-NEXT: vzeroupper
3654 ; AVX512F-LABEL: vec384_v48i8_to_v1i384_factor48:
3656 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
3657 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3658 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
3659 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
3660 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3661 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
3662 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
3663 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
3664 ; AVX512F-NEXT: vzeroupper
3665 ; AVX512F-NEXT: retq
3667 ; AVX512BW-LABEL: vec384_v48i8_to_v1i384_factor48:
3668 ; AVX512BW: # %bb.0:
3669 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3670 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3671 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
3672 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
3673 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3674 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3675 ; AVX512BW-NEXT: vzeroupper
3676 ; AVX512BW-NEXT: retq
3677 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3678 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3679 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3680 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3681 %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> <i32 0, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
3682 %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3683 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3684 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3685 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3686 ret void
3687 }
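; vec384_v24i16_to_v12i32_factor2: the 48 input bytes are reinterpreted as 24 x i16, and word k is zero-extended into i32 lane k (byte offset 4*k).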
3689 define void @vec384_v24i16_to_v12i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3690 ; SSE2-LABEL: vec384_v24i16_to_v12i32_factor2:
3692 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3693 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
3694 ; SSE2-NEXT: paddb (%rsi), %xmm0
3695 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
3696 ; SSE2-NEXT: pxor %xmm2, %xmm2
3697 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3698 ; SSE2-NEXT: movdqa %xmm0, %xmm3
3699 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3700 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3701 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3702 ; SSE2-NEXT: paddb (%rdx), %xmm3
3703 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
3704 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
3705 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
3706 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
3709 ; SSE42-LABEL: vec384_v24i16_to_v12i32_factor2:
3711 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3712 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
3713 ; SSE42-NEXT: paddb (%rsi), %xmm0
3714 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
3715 ; SSE42-NEXT: pxor %xmm2, %xmm2
3716 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
3717 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3718 ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3719 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
3720 ; SSE42-NEXT: paddb (%rdx), %xmm3
3721 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
3722 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
3723 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
3724 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
3727 ; AVX-LABEL: vec384_v24i16_to_v12i32_factor2:
3729 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3730 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
3731 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
3732 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3733 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3734 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
3735 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
3736 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
3737 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
3738 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
3739 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
3740 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
3741 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
3742 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
3745 ; AVX2-LABEL: vec384_v24i16_to_v12i32_factor2:
3747 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
3748 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3749 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3750 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
3751 ; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3752 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3753 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3754 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
3755 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
3756 ; AVX2-NEXT: vzeroupper
3759 ; AVX512F-LABEL: vec384_v24i16_to_v12i32_factor2:
3761 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
3762 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3763 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3764 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
3765 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3766 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3767 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3768 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
3769 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
3770 ; AVX512F-NEXT: vzeroupper
3771 ; AVX512F-NEXT: retq
3773 ; AVX512BW-LABEL: vec384_v24i16_to_v12i32_factor2:
3774 ; AVX512BW: # %bb.0:
3775 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
3776 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3777 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3778 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
3779 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
3780 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
3781 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3782 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
3783 ; AVX512BW-NEXT: vzeroupper
3784 ; AVX512BW-NEXT: retq
3785 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3786 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3787 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3788 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3789 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
3790 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 1, i32 27, i32 2, i32 29, i32 3, i32 31, i32 4, i32 33, i32 5, i32 35, i32 6, i32 37, i32 7, i32 39, i32 8, i32 41, i32 9, i32 43, i32 10, i32 45, i32 11, i32 47>
3791 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
3792 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3793 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3794 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3795 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3796 ret void
3797 }
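; vec384_v24i16_to_v8i48_factor3: word k of the 24 x i16 input is kept at word offset 3*k (byte offset 6*k) and the other words are zeroed.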
3799 define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3800 ; SSE2-LABEL: vec384_v24i16_to_v8i48_factor3:
3802 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3803 ; SSE2-NEXT: paddb (%rsi), %xmm0
3804 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
3805 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
3806 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
3807 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
3808 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
3809 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3810 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3811 ; SSE2-NEXT: paddb (%rdx), %xmm2
3812 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
3813 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
3814 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
3815 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
3818 ; SSE42-LABEL: vec384_v24i16_to_v8i48_factor3:
3820 ; SSE42-NEXT: movdqa (%rdi), %xmm0
3821 ; SSE42-NEXT: paddb (%rsi), %xmm0
3822 ; SSE42-NEXT: pxor %xmm1, %xmm1
3823 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
3824 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
3825 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1]
3826 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7]
3827 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
3828 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
3829 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
3830 ; SSE42-NEXT: paddb (%rdx), %xmm3
3831 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
3832 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
3833 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
3834 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
3837 ; AVX-LABEL: vec384_v24i16_to_v8i48_factor3:
3839 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3840 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3841 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
3842 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
3843 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
3844 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
3845 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
3846 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
3847 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
3848 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
3849 ; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm2
3850 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
3851 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
3852 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
3853 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
3856 ; AVX2-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3:
3857 ; AVX2-SLOW: # %bb.0:
3858 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
3859 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3860 ; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2]
3861 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1
3862 ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3863 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
3864 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
3865 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
3866 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3867 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3868 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
3869 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
3870 ; AVX2-SLOW-NEXT: vzeroupper
3871 ; AVX2-SLOW-NEXT: retq
3873 ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v8i48_factor3:
3874 ; AVX2-FAST-PERLANE: # %bb.0:
3875 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
3876 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3877 ; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2]
3878 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm1
3879 ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3880 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero
3881 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3882 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3883 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
3884 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
3885 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3886 ; AVX2-FAST-PERLANE-NEXT: retq
3888 ; AVX2-FAST-LABEL: vec384_v24i16_to_v8i48_factor3:
3889 ; AVX2-FAST: # %bb.0:
3890 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
3891 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3892 ; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2]
3893 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
3894 ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3895 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero
3896 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3897 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3898 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
3899 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
3900 ; AVX2-FAST-NEXT: vzeroupper
3901 ; AVX2-FAST-NEXT: retq
3903 ; AVX512F-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3:
3904 ; AVX512F-SLOW: # %bb.0:
3905 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
3906 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3907 ; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2]
3908 ; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1
3909 ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3910 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
3911 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
3912 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
3913 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3914 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3915 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
3916 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
3917 ; AVX512F-SLOW-NEXT: vzeroupper
3918 ; AVX512F-SLOW-NEXT: retq
3920 ; AVX512F-FAST-LABEL: vec384_v24i16_to_v8i48_factor3:
3921 ; AVX512F-FAST: # %bb.0:
3922 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
3923 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3924 ; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2]
3925 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
3926 ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
3927 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero
3928 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3929 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3930 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx)
3931 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
3932 ; AVX512F-FAST-NEXT: vzeroupper
3933 ; AVX512F-FAST-NEXT: retq
3935 ; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3:
3936 ; AVX512BW-SLOW: # %bb.0:
3937 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
3938 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3939 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21]
3940 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
3941 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2
3942 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
3943 ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
3944 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
3945 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
3946 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3947 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
3948 ; AVX512BW-SLOW-NEXT: vzeroupper
3949 ; AVX512BW-SLOW-NEXT: retq
3951 ; AVX512BW-FAST-LABEL: vec384_v24i16_to_v8i48_factor3:
3952 ; AVX512BW-FAST: # %bb.0:
3953 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
3954 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
3955 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21]
3956 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
3957 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2
3958 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero
3959 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
3960 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
3961 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
3962 ; AVX512BW-FAST-NEXT: vzeroupper
3963 ; AVX512BW-FAST-NEXT: retq
3964 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
3965 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
3966 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
3967 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3968 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
3969 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 1, i32 28, i32 29, i32 2, i32 31, i32 32, i32 3, i32 34, i32 35, i32 4, i32 37, i32 38, i32 5, i32 40, i32 41, i32 6, i32 43, i32 44, i32 7, i32 46, i32 47>
3970 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
3971 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3972 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
3973 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
3974 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
3975 ret void
3976 }
3978 define void @vec384_v24i16_to_v6i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
3979 ; SSE2-LABEL: vec384_v24i16_to_v6i64_factor4:
3981 ; SSE2-NEXT: movdqa (%rdi), %xmm0
3982 ; SSE2-NEXT: paddb (%rsi), %xmm0
3983 ; SSE2-NEXT: pxor %xmm1, %xmm1
3984 ; SSE2-NEXT: movdqa %xmm0, %xmm2
3985 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3986 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3987 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3988 ; SSE2-NEXT: movdqa %xmm0, %xmm3
3989 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3990 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3991 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
3992 ; SSE2-NEXT: paddb (%rdx), %xmm3
3993 ; SSE2-NEXT: paddb 32(%rdx), %xmm2
3994 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
3995 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
3996 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
3999 ; SSE42-LABEL: vec384_v24i16_to_v6i64_factor4:
4001 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4002 ; SSE42-NEXT: paddb (%rsi), %xmm0
4003 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4004 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
4005 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
4006 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
4007 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4008 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
4009 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
4010 ; SSE42-NEXT: paddb (%rdx), %xmm1
4011 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4012 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
4013 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
4016 ; AVX-LABEL: vec384_v24i16_to_v6i64_factor4:
4018 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4019 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4020 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4021 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
4022 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
4023 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4024 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4025 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
4026 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
4027 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4028 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
4029 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
4030 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
4033 ; AVX2-SLOW-LABEL: vec384_v24i16_to_v6i64_factor4:
4034 ; AVX2-SLOW: # %bb.0:
4035 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
4036 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4037 ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4038 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4039 ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4040 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4041 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4042 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
4043 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
4044 ; AVX2-SLOW-NEXT: vzeroupper
4045 ; AVX2-SLOW-NEXT: retq
4047 ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v6i64_factor4:
4048 ; AVX2-FAST-PERLANE: # %bb.0:
4049 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
4050 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4051 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4052 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
4053 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4054 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4055 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
4056 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
4057 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
4058 ; AVX2-FAST-PERLANE-NEXT: retq
4060 ; AVX2-FAST-LABEL: vec384_v24i16_to_v6i64_factor4:
4061 ; AVX2-FAST: # %bb.0:
4062 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
4063 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4064 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4065 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
4066 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4067 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4068 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
4069 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
4070 ; AVX2-FAST-NEXT: vzeroupper
4071 ; AVX2-FAST-NEXT: retq
4073 ; AVX512F-LABEL: vec384_v24i16_to_v6i64_factor4:
4075 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
4076 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4077 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4078 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
4079 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4080 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4081 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
4082 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
4083 ; AVX512F-NEXT: vzeroupper
4084 ; AVX512F-NEXT: retq
4086 ; AVX512BW-LABEL: vec384_v24i16_to_v6i64_factor4:
4087 ; AVX512BW: # %bb.0:
4088 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
4089 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4090 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
4091 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
4092 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4093 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4094 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4095 ; AVX512BW-NEXT: vzeroupper
4096 ; AVX512BW-NEXT: retq
4097 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4098 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4099 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4100 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4101 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
4102 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 1, i32 29, i32 30, i32 31, i32 2, i32 33, i32 34, i32 35, i32 3, i32 37, i32 38, i32 39, i32 4, i32 41, i32 42, i32 43, i32 5, i32 45, i32 46, i32 47>
4103 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
4104 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4105 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4106 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4107 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4108 ret void
4109 }
4111 define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4112 ; SSE2-LABEL: vec384_v24i16_to_v4i96_factor6:
4114 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4115 ; SSE2-NEXT: paddb (%rsi), %xmm0
4116 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
4117 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4118 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
4119 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7]
4120 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
4121 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
4122 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4123 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4124 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4125 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
4126 ; SSE2-NEXT: paddb (%rdx), %xmm2
4127 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
4128 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
4129 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
4130 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
4133 ; SSE42-LABEL: vec384_v24i16_to_v4i96_factor6:
4135 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4136 ; SSE42-NEXT: paddb (%rsi), %xmm0
4137 ; SSE42-NEXT: pxor %xmm1, %xmm1
4138 ; SSE42-NEXT: movdqa %xmm0, %xmm2
4139 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,3],zero,zero
4140 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
4141 ; SSE42-NEXT: psrld $16, %xmm0
4142 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7]
4143 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6,7]
4144 ; SSE42-NEXT: paddb 16(%rdx), %xmm3
4145 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
4146 ; SSE42-NEXT: paddb (%rdx), %xmm2
4147 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
4148 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
4149 ; SSE42-NEXT: movdqa %xmm3, 16(%rcx)
4152 ; AVX-LABEL: vec384_v24i16_to_v4i96_factor6:
4154 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4155 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4156 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero
4157 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
4158 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
4159 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6,7]
4160 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
4161 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7]
4162 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
4163 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
4164 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4165 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
4166 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
4167 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
4170 ; AVX2-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6:
4171 ; AVX2-SLOW: # %bb.0:
4172 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
4173 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4174 ; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
4175 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4176 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7]
4177 ; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4178 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
4179 ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4180 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4181 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4182 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
4183 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
4184 ; AVX2-SLOW-NEXT: vzeroupper
4185 ; AVX2-SLOW-NEXT: retq
4187 ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v4i96_factor6:
4188 ; AVX2-FAST-PERLANE: # %bb.0:
4189 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
4190 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4191 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4192 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4193 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
4194 ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4195 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4196 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4197 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx)
4198 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
4199 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
4200 ; AVX2-FAST-PERLANE-NEXT: retq
4202 ; AVX2-FAST-LABEL: vec384_v24i16_to_v4i96_factor6:
4203 ; AVX2-FAST: # %bb.0:
4204 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
4205 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4206 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4207 ; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4208 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
4209 ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4210 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4211 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4212 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
4213 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
4214 ; AVX2-FAST-NEXT: vzeroupper
4215 ; AVX2-FAST-NEXT: retq
4217 ; AVX512F-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6:
4218 ; AVX512F-SLOW: # %bb.0:
4219 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
4220 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4221 ; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
4222 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4223 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7]
4224 ; AVX512F-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4225 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
4226 ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4227 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4228 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4229 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
4230 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
4231 ; AVX512F-SLOW-NEXT: vzeroupper
4232 ; AVX512F-SLOW-NEXT: retq
4234 ; AVX512F-FAST-LABEL: vec384_v24i16_to_v4i96_factor6:
4235 ; AVX512F-FAST: # %bb.0:
4236 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
4237 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4238 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4239 ; AVX512F-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4240 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
4241 ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4242 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4243 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4244 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
4245 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
4246 ; AVX512F-FAST-NEXT: vzeroupper
4247 ; AVX512F-FAST-NEXT: retq
4249 ; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6:
4250 ; AVX512BW-SLOW: # %bb.0:
4251 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
4252 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4253 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15]
4254 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4255 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2
4256 ; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0
4257 ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
4258 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7]
4259 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
4260 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4261 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4262 ; AVX512BW-SLOW-NEXT: vzeroupper
4263 ; AVX512BW-SLOW-NEXT: retq
4265 ; AVX512BW-FAST-LABEL: vec384_v24i16_to_v4i96_factor6:
4266 ; AVX512BW-FAST: # %bb.0:
4267 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
4268 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4269 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15]
4270 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4271 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2
4272 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4273 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
4274 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4275 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4276 ; AVX512BW-FAST-NEXT: vzeroupper
4277 ; AVX512BW-FAST-NEXT: retq
4278 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4279 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4280 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4281 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4282 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
4283 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 1, i32 31, i32 32, i32 33, i32 34, i32 35, i32 2, i32 37, i32 38, i32 39, i32 40, i32 41, i32 3, i32 43, i32 44, i32 45, i32 46, i32 47>
4284 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
4285 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4286 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4287 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4288 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4289 ret void
4290 }
4292 define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4293 ; SSE2-LABEL: vec384_v24i16_to_v3i128_factor8:
4295 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4296 ; SSE2-NEXT: paddb (%rsi), %xmm0
4297 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
4298 ; SSE2-NEXT: pand %xmm0, %xmm1
4299 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
4300 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4301 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4302 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4303 ; SSE2-NEXT: paddb 16(%rdx), %xmm2
4304 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
4305 ; SSE2-NEXT: paddb (%rdx), %xmm1
4306 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
4307 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
4308 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
4311 ; SSE42-LABEL: vec384_v24i16_to_v3i128_factor8:
4313 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4314 ; SSE42-NEXT: paddb (%rsi), %xmm0
4315 ; SSE42-NEXT: pxor %xmm1, %xmm1
4316 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4317 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
4318 ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4319 ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4320 ; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4321 ; SSE42-NEXT: paddb 16(%rdx), %xmm2
4322 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
4323 ; SSE42-NEXT: paddb (%rdx), %xmm1
4324 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4325 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
4326 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
4329 ; AVX-LABEL: vec384_v24i16_to_v3i128_factor8:
4331 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4332 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4333 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
4334 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4335 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
4336 ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4337 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4338 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4339 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
4340 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
4341 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4342 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
4343 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
4344 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
4347 ; AVX2-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8:
4348 ; AVX2-SLOW: # %bb.0:
4349 ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
4350 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
4351 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
4352 ; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
4353 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4354 ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
4355 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
4356 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
4357 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4358 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
4359 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
4360 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
4361 ; AVX2-SLOW-NEXT: vzeroupper
4362 ; AVX2-SLOW-NEXT: retq
4364 ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v3i128_factor8:
4365 ; AVX2-FAST-PERLANE: # %bb.0:
4366 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
4367 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
4368 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1
4369 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4370 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
4371 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
4372 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
4373 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4374 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
4375 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx)
4376 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
4377 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
4378 ; AVX2-FAST-PERLANE-NEXT: retq
4380 ; AVX2-FAST-LABEL: vec384_v24i16_to_v3i128_factor8:
4381 ; AVX2-FAST: # %bb.0:
4382 ; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0
4383 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
4384 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
4385 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4386 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
4387 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
4388 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
4389 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4390 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
4391 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
4392 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
4393 ; AVX2-FAST-NEXT: vzeroupper
4394 ; AVX2-FAST-NEXT: retq
4396 ; AVX512F-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8:
4397 ; AVX512F-SLOW: # %bb.0:
4398 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
4399 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4400 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
4401 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4402 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
4403 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4404 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
4405 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4406 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
4407 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4408 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4409 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
4410 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
4411 ; AVX512F-SLOW-NEXT: vzeroupper
4412 ; AVX512F-SLOW-NEXT: retq
4414 ; AVX512F-FAST-LABEL: vec384_v24i16_to_v3i128_factor8:
4415 ; AVX512F-FAST: # %bb.0:
4416 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
4417 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4418 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4419 ; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4420 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
4421 ; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4422 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
4423 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4424 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4425 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
4426 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
4427 ; AVX512F-FAST-NEXT: vzeroupper
4428 ; AVX512F-FAST-NEXT: retq
4430 ; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8:
4431 ; AVX512BW-SLOW: # %bb.0:
4432 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
4433 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4434 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15]
4435 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4436 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2
4437 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
4438 ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
4439 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4440 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
4441 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4442 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4443 ; AVX512BW-SLOW-NEXT: vzeroupper
4444 ; AVX512BW-SLOW-NEXT: retq
4446 ; AVX512BW-FAST-LABEL: vec384_v24i16_to_v3i128_factor8:
4447 ; AVX512BW-FAST: # %bb.0:
4448 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
4449 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4450 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15]
4451 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
4452 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2
4453 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4454 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
4455 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4456 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4457 ; AVX512BW-FAST-NEXT: vzeroupper
4458 ; AVX512BW-FAST-NEXT: retq
4459 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4460 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4461 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4462 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4463 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
4464 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 1, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 2, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4465 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
4466 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4467 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4468 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4469 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4470 ret void
4471 }
4473 define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4474 ; SSE2-LABEL: vec384_v24i16_to_v2i192_factor12:
4476 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4477 ; SSE2-NEXT: paddb (%rsi), %xmm0
4478 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
4479 ; SSE2-NEXT: pand %xmm0, %xmm1
4480 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4481 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4482 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4483 ; SSE2-NEXT: movaps 32(%rdx), %xmm2
4484 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
4485 ; SSE2-NEXT: paddb (%rdx), %xmm1
4486 ; SSE2-NEXT: movaps %xmm2, 32(%rcx)
4487 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
4488 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
4491 ; SSE42-LABEL: vec384_v24i16_to_v2i192_factor12:
4493 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4494 ; SSE42-NEXT: paddb (%rsi), %xmm0
4495 ; SSE42-NEXT: pxor %xmm1, %xmm1
4496 ; SSE42-NEXT: pxor %xmm2, %xmm2
4497 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
4498 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4499 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
4500 ; SSE42-NEXT: movaps 32(%rdx), %xmm1
4501 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
4502 ; SSE42-NEXT: paddb (%rdx), %xmm2
4503 ; SSE42-NEXT: movaps %xmm1, 32(%rcx)
4504 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
4505 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
4508 ; AVX-LABEL: vec384_v24i16_to_v2i192_factor12:
4510 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4511 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4512 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
4513 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4514 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4515 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
4516 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
4517 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
4518 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
4519 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
4520 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
4521 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
4522 ; AVX-NEXT: vzeroupper
4525 ; AVX2-LABEL: vec384_v24i16_to_v2i192_factor12:
4527 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
4528 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4529 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4530 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
4531 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4532 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
4533 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4534 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
4535 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
4536 ; AVX2-NEXT: vzeroupper
4539 ; AVX512F-LABEL: vec384_v24i16_to_v2i192_factor12:
4541 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
4542 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4543 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
4544 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
4545 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
4546 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4547 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
4548 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
4549 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4550 ; AVX512F-NEXT: vzeroupper
4551 ; AVX512F-NEXT: retq
4553 ; AVX512BW-LABEL: vec384_v24i16_to_v2i192_factor12:
4554 ; AVX512BW: # %bb.0:
4555 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
4556 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4557 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,17,13,14,15]
4558 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4559 ; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2
4560 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
4561 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4562 ; AVX512BW-NEXT: vzeroupper
4563 ; AVX512BW-NEXT: retq
4564 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4565 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4566 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4567 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4568 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
4569 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 1, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4570 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
4571 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4572 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4573 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4574 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4575 ret void
4576 }
4578 define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4579 ; SSE2-LABEL: vec384_v24i16_to_v1i384_factor24:
4581 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4582 ; SSE2-NEXT: paddb (%rsi), %xmm0
4583 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
4584 ; SSE2-NEXT: movaps 16(%rdx), %xmm1
4585 ; SSE2-NEXT: movaps 32(%rdx), %xmm2
4586 ; SSE2-NEXT: paddb (%rdx), %xmm0
4587 ; SSE2-NEXT: movaps %xmm1, 16(%rcx)
4588 ; SSE2-NEXT: movaps %xmm2, 32(%rcx)
4589 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
4592 ; SSE42-LABEL: vec384_v24i16_to_v1i384_factor24:
4594 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4595 ; SSE42-NEXT: paddb (%rsi), %xmm0
4596 ; SSE42-NEXT: pxor %xmm1, %xmm1
4597 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
4598 ; SSE42-NEXT: movaps 16(%rdx), %xmm0
4599 ; SSE42-NEXT: movaps 32(%rdx), %xmm2
4600 ; SSE42-NEXT: paddb (%rdx), %xmm1
4601 ; SSE42-NEXT: movaps %xmm0, 16(%rcx)
4602 ; SSE42-NEXT: movaps %xmm2, 32(%rcx)
4603 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
4606 ; AVX-LABEL: vec384_v24i16_to_v1i384_factor24:
4608 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4609 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4610 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
4611 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
4612 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
4613 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
4614 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
4615 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
4616 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
4617 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
4618 ; AVX-NEXT: vzeroupper
4621 ; AVX2-LABEL: vec384_v24i16_to_v1i384_factor24:
4623 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
4624 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4625 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
4626 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
4627 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
4628 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4629 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
4630 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
4631 ; AVX2-NEXT: vzeroupper
4634 ; AVX512F-LABEL: vec384_v24i16_to_v1i384_factor24:
4636 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4637 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4638 ; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
4639 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
4640 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4641 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
4642 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
4643 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4644 ; AVX512F-NEXT: vzeroupper
4645 ; AVX512F-NEXT: retq
4647 ; AVX512BW-LABEL: vec384_v24i16_to_v1i384_factor24:
4648 ; AVX512BW: # %bb.0:
4649 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
4650 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4651 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
4652 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4653 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4654 ; AVX512BW-NEXT: vzeroupper
4655 ; AVX512BW-NEXT: retq
4656 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4657 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4658 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4659 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4660 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16>
4661 %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> <i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4662 %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8>
4663 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4664 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4665 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4666 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4667 ret void
4668 }
4670 define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4671 ; SSE2-LABEL: vec384_v12i32_to_v6i64_factor2:
4673 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4674 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
4675 ; SSE2-NEXT: paddb (%rsi), %xmm0
4676 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
4677 ; SSE2-NEXT: pxor %xmm2, %xmm2
4678 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4679 ; SSE2-NEXT: movdqa %xmm0, %xmm3
4680 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4681 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4682 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
4683 ; SSE2-NEXT: paddb (%rdx), %xmm3
4684 ; SSE2-NEXT: paddb 32(%rdx), %xmm1
4685 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
4686 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
4687 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
4690 ; SSE42-LABEL: vec384_v12i32_to_v6i64_factor2:
4692 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4693 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
4694 ; SSE42-NEXT: paddb (%rsi), %xmm0
4695 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
4696 ; SSE42-NEXT: pxor %xmm2, %xmm2
4697 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
4698 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
4699 ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4700 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
4701 ; SSE42-NEXT: paddb (%rdx), %xmm3
4702 ; SSE42-NEXT: paddb 32(%rdx), %xmm1
4703 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
4704 ; SSE42-NEXT: movdqa %xmm3, (%rcx)
4705 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
4708 ; AVX-LABEL: vec384_v12i32_to_v6i64_factor2:
4710 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4711 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
4712 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
4713 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4714 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
4715 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
4716 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
4717 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
4718 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
4719 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
4720 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
4721 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
4722 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
4723 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
4726 ; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2:
4728 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
4729 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4730 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4731 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
4732 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4733 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4734 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4735 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
4736 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
4737 ; AVX2-NEXT: vzeroupper
4740 ; AVX512F-LABEL: vec384_v12i32_to_v6i64_factor2:
4742 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4743 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4744 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
4745 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4746 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4747 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4748 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4749 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
4750 ; AVX512F-NEXT: vzeroupper
4751 ; AVX512F-NEXT: retq
4753 ; AVX512BW-LABEL: vec384_v12i32_to_v6i64_factor2:
4754 ; AVX512BW: # %bb.0:
4755 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
4756 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4757 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4758 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
4759 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
4760 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4761 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4762 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
4763 ; AVX512BW-NEXT: vzeroupper
4764 ; AVX512BW-NEXT: retq
4765 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4766 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4767 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4768 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4769 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32>
4770 %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 1, i32 15, i32 2, i32 17, i32 3, i32 19, i32 4, i32 21, i32 5, i32 23>
4771 %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8>
4772 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4773 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4774 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
4775 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
4776 ret void
4777 }
4779 define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
4780 ; SSE2-LABEL: vec384_v12i32_to_v4i96_factor3:
4782 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4783 ; SSE2-NEXT: paddb (%rsi), %xmm0
4784 ; SSE2-NEXT: xorps %xmm1, %xmm1
4785 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,0]
4786 ; SSE2-NEXT: pand %xmm0, %xmm2
4787 ; SSE2-NEXT: movdqa %xmm0, %xmm3
4788 ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4789 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
4790 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4791 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3]
4792 ; SSE2-NEXT: paddb (%rdx), %xmm0
4793 ; SSE2-NEXT: paddb 32(%rdx), %xmm3
4794 ; SSE2-NEXT: paddb 16(%rdx), %xmm2
4795 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx)
4796 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
4797 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
4800 ; SSE42-LABEL: vec384_v12i32_to_v4i96_factor3:
4802 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4803 ; SSE42-NEXT: paddb (%rsi), %xmm0
4804 ; SSE42-NEXT: pxor %xmm1, %xmm1
4805 ; SSE42-NEXT: pxor %xmm2, %xmm2
4806 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
4807 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
4808 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
4809 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4810 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
4811 ; SSE42-NEXT: paddb (%rdx), %xmm0
4812 ; SSE42-NEXT: paddb 32(%rdx), %xmm3
4813 ; SSE42-NEXT: paddb 16(%rdx), %xmm2
4814 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx)
4815 ; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
4816 ; SSE42-NEXT: movdqa %xmm0, (%rcx)
4819 ; AVX-LABEL: vec384_v12i32_to_v4i96_factor3:
4821 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4822 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4823 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
4824 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
4825 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2
4826 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
4827 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4828 ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
4829 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
4830 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
4831 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
4832 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4833 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
4834 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
4835 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
4836 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
4837 ; AVX-NEXT: vzeroupper
4840 ; AVX2-SLOW-LABEL: vec384_v12i32_to_v4i96_factor3:
4841 ; AVX2-SLOW: # %bb.0:
4842 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
4843 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4844 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
4845 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1]
4846 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
4847 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4848 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4849 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
4850 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4851 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4852 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
4853 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
4854 ; AVX2-SLOW-NEXT: vzeroupper
4855 ; AVX2-SLOW-NEXT: retq
4857 ; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v4i96_factor3:
4858 ; AVX2-FAST-PERLANE: # %bb.0:
4859 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
4860 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4861 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1
4862 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1]
4863 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
4864 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
4865 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4866 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4867 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
4868 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
4869 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
4870 ; AVX2-FAST-PERLANE-NEXT: retq
4872 ; AVX2-FAST-LABEL: vec384_v12i32_to_v4i96_factor3:
4873 ; AVX2-FAST: # %bb.0:
4874 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
4875 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4876 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
4877 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1]
4878 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
4879 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
4880 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4881 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
4882 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
4883 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
4884 ; AVX2-FAST-NEXT: vzeroupper
4885 ; AVX2-FAST-NEXT: retq
4887 ; AVX512F-LABEL: vec384_v12i32_to_v4i96_factor3:
4889 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
4890 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4891 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,17,4,5,18,7,8,19,10,11,0,0,0,0]
4892 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
4893 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
4894 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4895 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4896 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4897 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
4898 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
4899 ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
4902 ; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v4i96_factor3:
4903 ; AVX512BW-SLOW: # %bb.0:
4904 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
4905 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4906 ; AVX512BW-SLOW-NEXT: movb $73, %al
4907 ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1
4908 ; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
4909 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4910 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
4911 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
4912 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4913 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4914 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
4915 ; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
4918 ; AVX512BW-FAST-LABEL: vec384_v12i32_to_v4i96_factor3:
4919 ; AVX512BW-FAST: # %bb.0:
4920 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
4921 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4922 ; AVX512BW-FAST-NEXT: movb $73, %al
4923 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1
4924 ; AVX512BW-FAST-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
4925 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
4926 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4927 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
4928 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
4929 ; AVX512BW-FAST-NEXT: vzeroupper
4930 ; AVX512BW-FAST-NEXT: retq
4931 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
4932 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
4933 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
4934 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
4935 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32>
4936 %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 1, i32 16, i32 17, i32 2, i32 19, i32 20, i32 3, i32 22, i32 23>
4937 %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8>
4938 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
4939 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
4940 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

4945 define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_v12i32_to_v3i128_factor4:
; SSE2: # %bb.0:
4948 ; SSE2-NEXT: movdqa (%rdi), %xmm0
4949 ; SSE2-NEXT: paddb (%rsi), %xmm0
4950 ; SSE2-NEXT: xorps %xmm1, %xmm1
4951 ; SSE2-NEXT: xorps %xmm2, %xmm2
4952 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
4953 ; SSE2-NEXT: movdqa %xmm0, %xmm3
4954 ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
4955 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
4956 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
4957 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
4958 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
4959 ; SSE2-NEXT: paddb 32(%rdx), %xmm3
4960 ; SSE2-NEXT: paddb (%rdx), %xmm2
4961 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
4962 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_v12i32_to_v3i128_factor4:
; SSE42: # %bb.0:
4968 ; SSE42-NEXT: movdqa (%rdi), %xmm0
4969 ; SSE42-NEXT: paddb (%rsi), %xmm0
4970 ; SSE42-NEXT: pxor %xmm1, %xmm1
4971 ; SSE42-NEXT: pxor %xmm2, %xmm2
4972 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
4973 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
4974 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3,4,5,6,7]
4975 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
4976 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4977 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
4978 ; SSE42-NEXT: paddb 32(%rdx), %xmm3
4979 ; SSE42-NEXT: paddb (%rdx), %xmm2
4980 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
4981 ; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_v12i32_to_v3i128_factor4:
; AVX: # %bb.0:
4987 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
4988 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
4989 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
4990 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
4991 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
4992 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
4993 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4994 ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
4995 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
4996 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
4997 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
4998 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
4999 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
5000 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
5001 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
5002 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
5006 ; AVX2-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4:
5007 ; AVX2-SLOW: # %bb.0:
5008 ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
5009 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
5010 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
5011 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
5012 ; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3
5013 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
5014 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
5015 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
5016 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
5017 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5018 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
5019 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
5020 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
5021 ; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
5024 ; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v3i128_factor4:
5025 ; AVX2-FAST-PERLANE: # %bb.0:
5026 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
5027 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
5028 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1
5029 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5030 ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
5031 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
5032 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
5033 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5034 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
5035 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx)
5036 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
5037 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
5040 ; AVX2-FAST-LABEL: vec384_v12i32_to_v3i128_factor4:
5041 ; AVX2-FAST: # %bb.0:
5042 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
5043 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5044 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
5045 ; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0]
5046 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2
5047 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
5048 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5049 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5050 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
5051 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
5052 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
5053 ; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: vec384_v12i32_to_v3i128_factor4:
; AVX512F: # %bb.0:
5058 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5059 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5060 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,18,9,10,11,0,0,0,0]
5061 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
5062 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
5063 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
5064 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5065 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
5066 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
5067 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
5068 ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
5071 ; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4:
5072 ; AVX512BW-SLOW: # %bb.0:
5073 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
5074 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5075 ; AVX512BW-SLOW-NEXT: movb $17, %al
5076 ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1
5077 ; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
5078 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
5079 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5080 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5081 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
5082 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5083 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
5084 ; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
5087 ; AVX512BW-FAST-LABEL: vec384_v12i32_to_v3i128_factor4:
5088 ; AVX512BW-FAST: # %bb.0:
5089 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
5090 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5091 ; AVX512BW-FAST-NEXT: movb $17, %al
5092 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1
5093 ; AVX512BW-FAST-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
5094 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5095 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
5096 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5097 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
5098 ; AVX512BW-FAST-NEXT: vzeroupper
5099 ; AVX512BW-FAST-NEXT: retq
5100 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5101 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5102 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5103 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5104 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32>
5105 %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 1, i32 17, i32 18, i32 19, i32 2, i32 21, i32 22, i32 23>
5106 %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8>
5107 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5108 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5109 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5114 define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_v12i32_to_v2i192_factor6:
; SSE2: # %bb.0:
5117 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5118 ; SSE2-NEXT: paddb (%rsi), %xmm0
5119 ; SSE2-NEXT: xorps %xmm1, %xmm1
5120 ; SSE2-NEXT: xorps %xmm2, %xmm2
5121 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
5122 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
5123 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
5124 ; SSE2-NEXT: movaps 32(%rdx), %xmm0
5125 ; SSE2-NEXT: paddb 16(%rdx), %xmm1
5126 ; SSE2-NEXT: paddb (%rdx), %xmm2
5127 ; SSE2-NEXT: movaps %xmm0, 32(%rcx)
5128 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_v12i32_to_v2i192_factor6:
; SSE42: # %bb.0:
5134 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5135 ; SSE42-NEXT: paddb (%rsi), %xmm0
5136 ; SSE42-NEXT: pxor %xmm1, %xmm1
5137 ; SSE42-NEXT: pxor %xmm2, %xmm2
5138 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5139 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
5140 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
5141 ; SSE42-NEXT: movaps 32(%rdx), %xmm1
5142 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
5143 ; SSE42-NEXT: paddb (%rdx), %xmm2
5144 ; SSE42-NEXT: movaps %xmm1, 32(%rcx)
5145 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_v12i32_to_v2i192_factor6:
; AVX: # %bb.0:
5151 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5152 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5153 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
5154 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
5155 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
5156 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7]
5157 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
5158 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
5159 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
5160 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5161 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
5162 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5163 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
5167 ; AVX2-SLOW-LABEL: vec384_v12i32_to_v2i192_factor6:
5168 ; AVX2-SLOW: # %bb.0:
5169 ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
5170 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
5171 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
5172 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
5173 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
5174 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
5175 ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1
5176 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5177 ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx)
5178 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
5179 ; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
5182 ; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v2i192_factor6:
5183 ; AVX2-FAST-PERLANE: # %bb.0:
5184 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
5185 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
5186 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1
5187 ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
5188 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
5189 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
5190 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1
5191 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5192 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx)
5193 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
5194 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
5197 ; AVX2-FAST-LABEL: vec384_v12i32_to_v2i192_factor6:
5198 ; AVX2-FAST: # %bb.0:
5199 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
5200 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5201 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
5202 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
5203 ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
5204 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
5205 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7]
5206 ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1
5207 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5208 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx)
5209 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
5210 ; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: vec384_v12i32_to_v2i192_factor6:
; AVX512F: # %bb.0:
5215 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5216 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5217 ; AVX512F-NEXT: movb $65, %al
5218 ; AVX512F-NEXT: kmovw %eax, %k1
5219 ; AVX512F-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
5220 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5221 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
5222 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
5223 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5224 ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
5227 ; AVX512BW-LABEL: vec384_v12i32_to_v2i192_factor6:
5228 ; AVX512BW: # %bb.0:
5229 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
5230 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5231 ; AVX512BW-NEXT: movb $65, %al
5232 ; AVX512BW-NEXT: kmovd %eax, %k1
5233 ; AVX512BW-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
5234 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5235 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5236 ; AVX512BW-NEXT: vzeroupper
5237 ; AVX512BW-NEXT: retq
5238 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5239 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5240 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5241 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5242 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32>
5243 %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 1, i32 19, i32 20, i32 21, i32 22, i32 23>
5244 %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8>
5245 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5246 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5247 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5252 define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_v12i32_to_v1i384_factor12:
; SSE2: # %bb.0:
5255 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5256 ; SSE2-NEXT: paddb (%rsi), %xmm0
5257 ; SSE2-NEXT: xorps %xmm1, %xmm1
5258 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
5259 ; SSE2-NEXT: movaps 16(%rdx), %xmm0
5260 ; SSE2-NEXT: movaps 32(%rdx), %xmm2
5261 ; SSE2-NEXT: paddb (%rdx), %xmm1
5262 ; SSE2-NEXT: movaps %xmm0, 16(%rcx)
5263 ; SSE2-NEXT: movaps %xmm2, 32(%rcx)
; SSE2-NEXT: movdqa %xmm1, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_v12i32_to_v1i384_factor12:
; SSE42: # %bb.0:
5269 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5270 ; SSE42-NEXT: paddb (%rsi), %xmm0
5271 ; SSE42-NEXT: pxor %xmm1, %xmm1
5272 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5273 ; SSE42-NEXT: movaps 16(%rdx), %xmm0
5274 ; SSE42-NEXT: movaps 32(%rdx), %xmm2
5275 ; SSE42-NEXT: paddb (%rdx), %xmm1
5276 ; SSE42-NEXT: movaps %xmm0, 16(%rcx)
5277 ; SSE42-NEXT: movaps %xmm2, 32(%rcx)
; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_v12i32_to_v1i384_factor12:
; AVX: # %bb.0:
5283 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5284 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5285 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
5286 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
5287 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
5288 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5289 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
5290 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
5291 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
5292 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_v12i32_to_v1i384_factor12:
; AVX2: # %bb.0:
5298 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5299 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5300 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
5301 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5302 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
5303 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5304 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
5305 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_v12i32_to_v1i384_factor12:
; AVX512F: # %bb.0:
5311 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5312 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5313 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
5314 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5315 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5316 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
5317 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
5318 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5319 ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
5322 ; AVX512BW-LABEL: vec384_v12i32_to_v1i384_factor12:
5323 ; AVX512BW: # %bb.0:
5324 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
5325 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5326 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
5327 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
5328 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5329 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5330 ; AVX512BW-NEXT: vzeroupper
5331 ; AVX512BW-NEXT: retq
5332 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5333 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5334 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5335 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5336 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32>
5337 %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> <i32 0, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
5338 %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8>
5339 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5340 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5341 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5346 define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec384_v6i64_to_v3i128_factor2:
; SSE: # %bb.0:
5349 ; SSE-NEXT: movdqa (%rdi), %xmm0
5350 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
5351 ; SSE-NEXT: paddb (%rsi), %xmm0
5352 ; SSE-NEXT: paddb 16(%rsi), %xmm1
5353 ; SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
5354 ; SSE-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero
5355 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
5356 ; SSE-NEXT: paddb 16(%rdx), %xmm0
5357 ; SSE-NEXT: paddb (%rdx), %xmm2
5358 ; SSE-NEXT: paddb 32(%rdx), %xmm1
5359 ; SSE-NEXT: movdqa %xmm1, 32(%rcx)
5360 ; SSE-NEXT: movdqa %xmm2, (%rcx)
; SSE-NEXT: movdqa %xmm0, 16(%rcx)
; SSE-NEXT: retq
;
; AVX-LABEL: vec384_v6i64_to_v3i128_factor2:
; AVX: # %bb.0:
5366 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5367 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
5368 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
5369 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5370 ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
5371 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5372 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3]
5373 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
5374 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
5375 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
5376 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5377 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
5378 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
5379 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5380 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_v6i64_to_v3i128_factor2:
; AVX2: # %bb.0:
5386 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5387 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5388 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
5389 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3]
5390 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
5391 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
5392 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5393 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5394 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
5395 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
5396 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_v6i64_to_v3i128_factor2:
; AVX512F: # %bb.0:
5402 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5403 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5404 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
5405 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,1,11,2,13,0,0]
5406 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
5407 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
5408 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5409 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
5410 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
5411 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
5412 ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
5415 ; AVX512BW-LABEL: vec384_v6i64_to_v3i128_factor2:
5416 ; AVX512BW: # %bb.0:
5417 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
5418 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5419 ; AVX512BW-NEXT: movb $5, %al
5420 ; AVX512BW-NEXT: kmovd %eax, %k1
5421 ; AVX512BW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
5422 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
5423 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5424 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
5425 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5426 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5427 ; AVX512BW-NEXT: vzeroupper
5428 ; AVX512BW-NEXT: retq
5429 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5430 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5431 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5432 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5433 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64>
5434 %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 1, i32 9, i32 2, i32 11>
5435 %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8>
5436 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5437 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5438 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5443 define void @vec384_v6i64_to_v2i192_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec384_v6i64_to_v2i192_factor3:
; SSE2: # %bb.0:
5446 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5447 ; SSE2-NEXT: paddb (%rsi), %xmm0
5448 ; SSE2-NEXT: pxor %xmm1, %xmm1
5449 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
5450 ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
5451 ; SSE2-NEXT: movaps 32(%rdx), %xmm2
5452 ; SSE2-NEXT: paddb (%rdx), %xmm0
5453 ; SSE2-NEXT: paddb 16(%rdx), %xmm1
5454 ; SSE2-NEXT: movaps %xmm2, 32(%rcx)
5455 ; SSE2-NEXT: movdqa %xmm1, 16(%rcx)
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec384_v6i64_to_v2i192_factor3:
; SSE42: # %bb.0:
5461 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5462 ; SSE42-NEXT: paddb (%rsi), %xmm0
5463 ; SSE42-NEXT: pxor %xmm1, %xmm1
5464 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
5465 ; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
5466 ; SSE42-NEXT: movaps 32(%rdx), %xmm2
5467 ; SSE42-NEXT: paddb (%rdx), %xmm0
5468 ; SSE42-NEXT: paddb 16(%rdx), %xmm1
5469 ; SSE42-NEXT: movaps %xmm2, 32(%rcx)
5470 ; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: movdqa %xmm0, (%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec384_v6i64_to_v2i192_factor3:
; AVX: # %bb.0:
5476 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5477 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5478 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
5479 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
5480 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
5481 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
5482 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
5483 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5484 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
5485 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
5486 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_v6i64_to_v2i192_factor3:
; AVX2: # %bb.0:
5492 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5493 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5494 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
5495 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
5496 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
5497 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
5498 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5499 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
5500 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_v6i64_to_v2i192_factor3:
; AVX512F: # %bb.0:
5506 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5507 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5508 ; AVX512F-NEXT: movb $9, %al
5509 ; AVX512F-NEXT: kmovw %eax, %k1
5510 ; AVX512F-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
5511 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5512 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
5513 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
5514 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5515 ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
5518 ; AVX512BW-LABEL: vec384_v6i64_to_v2i192_factor3:
5519 ; AVX512BW: # %bb.0:
5520 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
5521 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5522 ; AVX512BW-NEXT: movb $9, %al
5523 ; AVX512BW-NEXT: kmovd %eax, %k1
5524 ; AVX512BW-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
5525 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5526 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5527 ; AVX512BW-NEXT: vzeroupper
5528 ; AVX512BW-NEXT: retq
5529 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5530 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5531 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5532 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5533 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64>
5534 %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 8, i32 1, i32 10, i32 11>
5535 %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8>
5536 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5537 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5538 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5543 define void @vec384_v6i64_to_v1i384_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec384_v6i64_to_v1i384_factor6:
; SSE: # %bb.0:
5546 ; SSE-NEXT: movdqa (%rdi), %xmm0
5547 ; SSE-NEXT: paddb (%rsi), %xmm0
5548 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
5549 ; SSE-NEXT: movaps 16(%rdx), %xmm1
5550 ; SSE-NEXT: movaps 32(%rdx), %xmm2
5551 ; SSE-NEXT: paddb (%rdx), %xmm0
5552 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
5553 ; SSE-NEXT: movaps %xmm2, 32(%rcx)
; SSE-NEXT: movdqa %xmm0, (%rcx)
; SSE-NEXT: retq
;
; AVX-LABEL: vec384_v6i64_to_v1i384_factor6:
; AVX: # %bb.0:
5559 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5560 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5561 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
5562 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5563 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5564 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
5565 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
5566 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
5567 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_v6i64_to_v1i384_factor6:
; AVX2: # %bb.0:
5573 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5574 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5575 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5576 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
5577 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5578 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
5579 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_v6i64_to_v1i384_factor6:
; AVX512F: # %bb.0:
5585 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5586 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5587 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5588 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5589 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
5590 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
5591 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5592 ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
5595 ; AVX512BW-LABEL: vec384_v6i64_to_v1i384_factor6:
5596 ; AVX512BW: # %bb.0:
5597 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
5598 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
5599 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5600 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5601 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5602 ; AVX512BW-NEXT: vzeroupper
5603 ; AVX512BW-NEXT: retq
5604 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5605 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5606 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5607 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5608 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64>
5609 %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> <i32 0, i32 7, i32 8, i32 9, i32 10, i32 11>
5610 %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8>
5611 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5612 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5613 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5618 define void @vec384_v3i128_to_v1i384_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE-LABEL: vec384_v3i128_to_v1i384_factor3:
; SSE: # %bb.0:
5621 ; SSE-NEXT: movdqa (%rdi), %xmm0
5622 ; SSE-NEXT: paddb (%rsi), %xmm0
5623 ; SSE-NEXT: movaps 16(%rdx), %xmm1
5624 ; SSE-NEXT: movaps 32(%rdx), %xmm2
5625 ; SSE-NEXT: paddb (%rdx), %xmm0
5626 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
5627 ; SSE-NEXT: movaps %xmm2, 32(%rcx)
; SSE-NEXT: movdqa %xmm0, (%rcx)
; SSE-NEXT: retq
;
; AVX-LABEL: vec384_v3i128_to_v1i384_factor3:
; AVX: # %bb.0:
5633 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5634 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5635 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
5636 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
5637 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
5638 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
5639 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
5640 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_v3i128_to_v1i384_factor3:
; AVX2: # %bb.0:
5646 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
5647 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5648 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
5649 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5650 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
5651 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_v3i128_to_v1i384_factor3:
; AVX512F: # %bb.0:
5657 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
5658 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5659 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5660 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
5661 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
5662 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5663 ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
5666 ; AVX512BW-LABEL: vec384_v3i128_to_v1i384_factor3:
5667 ; AVX512BW: # %bb.0:
5668 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
5669 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5670 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5671 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5672 ; AVX512BW-NEXT: vzeroupper
5673 ; AVX512BW-NEXT: retq
5674 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5675 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5676 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5677 %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5678 %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <3 x i128>
5679 %zextd.vec = shufflevector <3 x i128> %in.vec.cast, <3 x i128> zeroinitializer, <3 x i32> <i32 0, i32 4, i32 5>
5680 %out.bytevec = bitcast <3 x i128> %zextd.vec to <48 x i8>
5681 %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5682 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5683 %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5688 define void @vec512_v64i8_to_v32i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_v64i8_to_v32i16_factor2:
; SSE2: # %bb.0:
5691 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5692 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
5693 ; SSE2-NEXT: paddb (%rsi), %xmm0
5694 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
5695 ; SSE2-NEXT: pxor %xmm2, %xmm2
5696 ; SSE2-NEXT: movdqa %xmm1, %xmm3
5697 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
5698 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
5699 ; SSE2-NEXT: movdqa %xmm0, %xmm4
5700 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
5701 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
5702 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
5703 ; SSE2-NEXT: paddb (%rdx), %xmm4
5704 ; SSE2-NEXT: paddb 48(%rdx), %xmm1
5705 ; SSE2-NEXT: paddb 32(%rdx), %xmm3
5706 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
5707 ; SSE2-NEXT: movdqa %xmm1, 48(%rcx)
5708 ; SSE2-NEXT: movdqa %xmm4, (%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec512_v64i8_to_v32i16_factor2:
; SSE42: # %bb.0:
5714 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5715 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
5716 ; SSE42-NEXT: paddb (%rsi), %xmm0
5717 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
5718 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
5719 ; SSE42-NEXT: pxor %xmm3, %xmm3
5720 ; SSE42-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
5721 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
5722 ; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
5723 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
5724 ; SSE42-NEXT: paddb (%rdx), %xmm4
5725 ; SSE42-NEXT: paddb 48(%rdx), %xmm1
5726 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
5727 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
5728 ; SSE42-NEXT: movdqa %xmm1, 48(%rcx)
5729 ; SSE42-NEXT: movdqa %xmm4, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec512_v64i8_to_v32i16_factor2:
; AVX: # %bb.0:
5735 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5736 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
5737 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
5738 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5739 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
5740 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
5741 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
5742 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
5743 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
5744 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
5745 ; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
5746 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
5747 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
5748 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
5749 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
5750 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v64i8_to_v32i16_factor2:
; AVX2: # %bb.0:
5756 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
5757 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5758 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
5759 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
5760 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
5761 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5762 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
5763 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
5764 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec512_v64i8_to_v32i16_factor2:
; AVX512F: # %bb.0:
5770 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
5771 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5772 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
5773 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
5774 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
5775 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5776 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
5777 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
5778 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
5779 ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
5782 ; AVX512BW-LABEL: vec512_v64i8_to_v32i16_factor2:
5783 ; AVX512BW: # %bb.0:
5784 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
5785 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
5786 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
5787 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5788 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5789 ; AVX512BW-NEXT: vzeroupper
5790 ; AVX512BW-NEXT: retq
5791 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5792 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5793 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5794 %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 1, i32 67, i32 2, i32 69, i32 3, i32 71, i32 4, i32 73, i32 5, i32 75, i32 6, i32 77, i32 7, i32 79, i32 8, i32 81, i32 9, i32 83, i32 10, i32 85, i32 11, i32 87, i32 12, i32 89, i32 13, i32 91, i32 14, i32 93, i32 15, i32 95, i32 16, i32 97, i32 17, i32 99, i32 18, i32 101, i32 19, i32 103, i32 20, i32 105, i32 21, i32 107, i32 22, i32 109, i32 23, i32 111, i32 24, i32 113, i32 25, i32 115, i32 26, i32 117, i32 27, i32 119, i32 28, i32 121, i32 29, i32 123, i32 30, i32 125, i32 31, i32 127>
5795 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5796 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
ret void
}

5801 define void @vec512_v64i8_to_v16i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
; SSE2-LABEL: vec512_v64i8_to_v16i32_factor4:
; SSE2: # %bb.0:
5804 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5805 ; SSE2-NEXT: paddb (%rsi), %xmm0
5806 ; SSE2-NEXT: pxor %xmm1, %xmm1
5807 ; SSE2-NEXT: movdqa %xmm0, %xmm2
5808 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
5809 ; SSE2-NEXT: movdqa %xmm2, %xmm3
5810 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
5811 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5812 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5813 ; SSE2-NEXT: movdqa %xmm0, %xmm4
5814 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
5815 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5816 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
5817 ; SSE2-NEXT: paddb (%rdx), %xmm4
5818 ; SSE2-NEXT: paddb 48(%rdx), %xmm2
5819 ; SSE2-NEXT: paddb 32(%rdx), %xmm3
5820 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
5821 ; SSE2-NEXT: movdqa %xmm2, 48(%rcx)
5822 ; SSE2-NEXT: movdqa %xmm4, (%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec512_v64i8_to_v16i32_factor4:
; SSE42: # %bb.0:
5828 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5829 ; SSE42-NEXT: paddb (%rsi), %xmm0
5830 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
5831 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
5832 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
5833 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
5834 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
5835 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
5836 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
5837 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
5838 ; SSE42-NEXT: paddb 48(%rdx), %xmm3
5839 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
5840 ; SSE42-NEXT: paddb (%rdx), %xmm1
5841 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
5842 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
5843 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec512_v64i8_to_v16i32_factor4:
; AVX: # %bb.0:
5849 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5850 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5851 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
5852 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
5853 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
5854 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
5855 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
5856 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
5857 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
5858 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
5859 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
5860 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
5861 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
5862 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
5863 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
5864 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v64i8_to_v16i32_factor4:
; AVX2: # %bb.0:
5870 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
5871 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5872 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
5873 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
5874 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
5875 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5876 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
5877 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
5878 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec512_v64i8_to_v16i32_factor4:
; AVX512F: # %bb.0:
5884 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
5885 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5886 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
5887 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
5888 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
5889 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
5890 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
5891 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
5892 ; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
5895 ; AVX512BW-LABEL: vec512_v64i8_to_v16i32_factor4:
5896 ; AVX512BW: # %bb.0:
5897 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
5898 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5899 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
5900 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
5901 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
5902 ; AVX512BW-NEXT: vzeroupper
5903 ; AVX512BW-NEXT: retq
5904 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
5905 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
5906 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
5907 %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 1, i32 69, i32 70, i32 71, i32 2, i32 73, i32 74, i32 75, i32 3, i32 77, i32 78, i32 79, i32 4, i32 81, i32 82, i32 83, i32 5, i32 85, i32 86, i32 87, i32 6, i32 89, i32 90, i32 91, i32 7, i32 93, i32 94, i32 95, i32 8, i32 97, i32 98, i32 99, i32 9, i32 101, i32 102, i32 103, i32 10, i32 105, i32 106, i32 107, i32 11, i32 109, i32 110, i32 111, i32 12, i32 113, i32 114, i32 115, i32 13, i32 117, i32 118, i32 119, i32 14, i32 121, i32 122, i32 123, i32 15, i32 125, i32 126, i32 127>
5908 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
5909 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
5910 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
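
; Factor-8 widening of the low 8 bytes: byte i of (in.vec.base + in.vec.bias) lands at
; byte offset 8*i of the 64-byte result (a v8i64 zero extension) before the output bias is added.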
5914 define void @vec512_v64i8_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
5915 ; SSE2-LABEL: vec512_v64i8_to_v8i64_factor8:
5917 ; SSE2-NEXT: movdqa (%rdi), %xmm0
5918 ; SSE2-NEXT: paddb (%rsi), %xmm0
5919 ; SSE2-NEXT: pxor %xmm1, %xmm1
5920 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5921 ; SSE2-NEXT: movdqa %xmm0, %xmm2
5922 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5923 ; SSE2-NEXT: movdqa %xmm2, %xmm3
5924 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5925 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5926 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5927 ; SSE2-NEXT: movdqa %xmm0, %xmm4
5928 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5929 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5930 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
5931 ; SSE2-NEXT: paddb (%rdx), %xmm4
5932 ; SSE2-NEXT: paddb 48(%rdx), %xmm2
5933 ; SSE2-NEXT: paddb 32(%rdx), %xmm3
5934 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
5935 ; SSE2-NEXT: movdqa %xmm2, 48(%rcx)
5936 ; SSE2-NEXT: movdqa %xmm4, (%rcx)
5937 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
5940 ; SSE42-LABEL: vec512_v64i8_to_v8i64_factor8:
5942 ; SSE42-NEXT: movdqa (%rdi), %xmm0
5943 ; SSE42-NEXT: paddb (%rsi), %xmm0
5944 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
5945 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
5946 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
5947 ; SSE42-NEXT: movdqa %xmm0, %xmm3
5948 ; SSE42-NEXT: psrlq $48, %xmm3
5949 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
5950 ; SSE42-NEXT: psrld $16, %xmm0
5951 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
5952 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
5953 ; SSE42-NEXT: paddb 48(%rdx), %xmm3
5954 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
5955 ; SSE42-NEXT: paddb (%rdx), %xmm1
5956 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
5957 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
5958 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
5959 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
5962 ; AVX-LABEL: vec512_v64i8_to_v8i64_factor8:
5964 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
5965 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5966 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
5967 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm2
5968 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
5969 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
5970 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
5971 ; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
5972 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
5973 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
5974 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
5975 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
5976 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
5977 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
5978 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
5979 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
5980 ; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
5983 ; AVX2-LABEL: vec512_v64i8_to_v8i64_factor8:
5985 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
5986 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
5987 ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
5988 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
5989 ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
5990 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
5991 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
5992 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
5993 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
5994 ; AVX2-NEXT: vzeroupper
5997 ; AVX512F-LABEL: vec512_v64i8_to_v8i64_factor8:
5999 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
6000 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6001 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
6002 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6003 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
6004 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6005 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
6006 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
6007 ; AVX512F-NEXT: vzeroupper
6008 ; AVX512F-NEXT: retq
6010 ; AVX512BW-LABEL: vec512_v64i8_to_v8i64_factor8:
6011 ; AVX512BW: # %bb.0:
6012 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
6013 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6014 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
6015 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6016 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6017 ; AVX512BW-NEXT: vzeroupper
6018 ; AVX512BW-NEXT: retq
6019 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6020 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6021 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6022 %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 1, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 2, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 3, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 4, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 5, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 6, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 7, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6023 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6024 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
6025 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
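
; Factor-16 widening of the low 4 bytes: byte i lands at byte offset 16*i, one byte per i128 lane.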
6029 define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6030 ; SSE2-LABEL: vec512_v64i8_to_v4i128_factor16:
6032 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6033 ; SSE2-NEXT: paddb (%rsi), %xmm0
6034 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
6035 ; SSE2-NEXT: pand %xmm0, %xmm1
6036 ; SSE2-NEXT: movdqa %xmm0, %xmm2
6037 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
6038 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6039 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
6040 ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6041 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6042 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6043 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
6044 ; SSE2-NEXT: paddb 48(%rdx), %xmm3
6045 ; SSE2-NEXT: paddb 32(%rdx), %xmm2
6046 ; SSE2-NEXT: paddb (%rdx), %xmm1
6047 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
6048 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx)
6049 ; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
6050 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
6053 ; SSE42-LABEL: vec512_v64i8_to_v4i128_factor16:
6055 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6056 ; SSE42-NEXT: paddb (%rsi), %xmm0
6057 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0]
6058 ; SSE42-NEXT: pand %xmm0, %xmm1
6059 ; SSE42-NEXT: movdqa %xmm0, %xmm2
6060 ; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
6061 ; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6062 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
6063 ; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6064 ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6065 ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6066 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
6067 ; SSE42-NEXT: paddb 48(%rdx), %xmm3
6068 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
6069 ; SSE42-NEXT: paddb (%rdx), %xmm1
6070 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
6071 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
6072 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
6073 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
6076 ; AVX-LABEL: vec512_v64i8_to_v4i128_factor16:
6078 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6079 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6080 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
6081 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6082 ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6083 ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
6084 ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6085 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6086 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6087 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
6088 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
6089 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
6090 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
6091 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
6092 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
6093 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
6094 ; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
6097 ; AVX2-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
6098 ; AVX2-SLOW: # %bb.0:
6099 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
6100 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6101 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6102 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6103 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6104 ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
6105 ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1
6106 ; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0
6107 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6108 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
6109 ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
6110 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6111 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
6112 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
6113 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
6114 ; AVX2-SLOW-NEXT: vzeroupper
6115 ; AVX2-SLOW-NEXT: retq
6117 ; AVX2-FAST-PERLANE-LABEL: vec512_v64i8_to_v4i128_factor16:
6118 ; AVX2-FAST-PERLANE: # %bb.0:
6119 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
6120 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6121 ; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6122 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6123 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6124 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1]
6125 ; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm1, %ymm1
6126 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
6127 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
6128 ; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm0, %ymm0
6129 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6130 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
6131 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
6132 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
6133 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
6134 ; AVX2-FAST-PERLANE-NEXT: retq
6136 ; AVX2-FAST-LABEL: vec512_v64i8_to_v4i128_factor16:
6137 ; AVX2-FAST: # %bb.0:
6138 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
6139 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6140 ; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6141 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6142 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6143 ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
6144 ; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1
6145 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
6146 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
6147 ; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
6148 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6149 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
6150 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
6151 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
6152 ; AVX2-FAST-NEXT: vzeroupper
6153 ; AVX2-FAST-NEXT: retq
6155 ; AVX512F-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
6156 ; AVX512F-SLOW: # %bb.0:
6157 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
6158 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6159 ; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6160 ; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0
6161 ; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6162 ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
6163 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
6164 ; AVX512F-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6165 ; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6166 ; AVX512F-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0
6167 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6168 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
6169 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6170 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
6171 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
6172 ; AVX512F-SLOW-NEXT: vzeroupper
6173 ; AVX512F-SLOW-NEXT: retq
6175 ; AVX512F-FAST-LABEL: vec512_v64i8_to_v4i128_factor16:
6176 ; AVX512F-FAST: # %bb.0:
6177 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
6178 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6179 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
6180 ; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6181 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
6182 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
6183 ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6184 ; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6185 ; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0
6186 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6187 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
6188 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6189 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
6190 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
6191 ; AVX512F-FAST-NEXT: vzeroupper
6192 ; AVX512F-FAST-NEXT: retq
6194 ; AVX512BW-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
6195 ; AVX512BW-SLOW: # %bb.0:
6196 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
6197 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6198 ; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6199 ; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0
6200 ; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6201 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
6202 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
6203 ; AVX512BW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6204 ; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6205 ; AVX512BW-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0
6206 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6207 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
6208 ; AVX512BW-SLOW-NEXT: vzeroupper
6209 ; AVX512BW-SLOW-NEXT: retq
6211 ; AVX512BW-FAST-LABEL: vec512_v64i8_to_v4i128_factor16:
6212 ; AVX512BW-FAST: # %bb.0:
6213 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
6214 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6215 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
6216 ; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
6217 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
6218 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
6219 ; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
6220 ; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
6221 ; AVX512BW-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0
6222 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6223 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
6224 ; AVX512BW-FAST-NEXT: vzeroupper
6225 ; AVX512BW-FAST-NEXT: retq
6226 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6227 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6228 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6229 %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 1, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 2, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 3, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6230 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6231 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
6232 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
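
; Factor-32 widening of the low 2 bytes: byte 0 stays at offset 0 and byte 1 moves to offset 32, one byte per i256 half.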
6236 define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6237 ; SSE2-LABEL: vec512_v64i8_to_v2i256_factor32:
6239 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6240 ; SSE2-NEXT: paddb (%rsi), %xmm0
6241 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0]
6242 ; SSE2-NEXT: pand %xmm0, %xmm1
6243 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6244 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6245 ; SSE2-NEXT: movaps 16(%rdx), %xmm2
6246 ; SSE2-NEXT: movaps 48(%rdx), %xmm3
6247 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
6248 ; SSE2-NEXT: paddb (%rdx), %xmm1
6249 ; SSE2-NEXT: movaps %xmm3, 48(%rcx)
6250 ; SSE2-NEXT: movaps %xmm2, 16(%rcx)
6251 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
6252 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
6255 ; SSE42-LABEL: vec512_v64i8_to_v2i256_factor32:
6257 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6258 ; SSE42-NEXT: paddb (%rsi), %xmm0
6259 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0]
6260 ; SSE42-NEXT: pand %xmm0, %xmm1
6261 ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6262 ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6263 ; SSE42-NEXT: movaps 16(%rdx), %xmm2
6264 ; SSE42-NEXT: movaps 48(%rdx), %xmm3
6265 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
6266 ; SSE42-NEXT: paddb (%rdx), %xmm1
6267 ; SSE42-NEXT: movaps %xmm3, 48(%rcx)
6268 ; SSE42-NEXT: movaps %xmm2, 16(%rcx)
6269 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
6270 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
6273 ; AVX-LABEL: vec512_v64i8_to_v2i256_factor32:
6275 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6276 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6277 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
6278 ; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6279 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
6280 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
6281 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6282 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
6283 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3
6284 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
6285 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
6286 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6287 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
6290 ; AVX2-LABEL: vec512_v64i8_to_v2i256_factor32:
6292 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6293 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6294 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
6295 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1
6296 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6297 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6298 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
6299 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
6300 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
6301 ; AVX2-NEXT: vzeroupper
6304 ; AVX512F-LABEL: vec512_v64i8_to_v2i256_factor32:
6306 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6307 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6308 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
6309 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1
6310 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6311 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6312 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
6313 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
6314 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
6315 ; AVX512F-NEXT: vzeroupper
6316 ; AVX512F-NEXT: retq
6318 ; AVX512BW-LABEL: vec512_v64i8_to_v2i256_factor32:
6319 ; AVX512BW: # %bb.0:
6320 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
6321 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6322 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
6323 ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm1
6324 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6325 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
6326 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6327 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6328 ; AVX512BW-NEXT: vzeroupper
6329 ; AVX512BW-NEXT: retq
6330 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6331 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6332 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6333 %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 1, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6334 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6335 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
6336 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
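
; Factor-64 widening of the low byte: only byte 0 of (in.vec.base + in.vec.bias) survives; the other 63 result bytes are zero.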
6340 define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6341 ; SSE-LABEL: vec512_v64i8_to_v1i512_factor64:
6343 ; SSE-NEXT: movdqa (%rdi), %xmm0
6344 ; SSE-NEXT: paddb (%rsi), %xmm0
6345 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6346 ; SSE-NEXT: movaps 16(%rdx), %xmm1
6347 ; SSE-NEXT: movaps 32(%rdx), %xmm2
6348 ; SSE-NEXT: movaps 48(%rdx), %xmm3
6349 ; SSE-NEXT: paddb (%rdx), %xmm0
6350 ; SSE-NEXT: movaps %xmm2, 32(%rcx)
6351 ; SSE-NEXT: movaps %xmm3, 48(%rcx)
6352 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
6353 ; SSE-NEXT: movdqa %xmm0, (%rcx)
6356 ; AVX-LABEL: vec512_v64i8_to_v1i512_factor64:
6358 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6359 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6360 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
6361 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
6362 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6363 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
6364 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
6365 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
6366 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6367 ; AVX-NEXT: vzeroupper
6370 ; AVX2-LABEL: vec512_v64i8_to_v1i512_factor64:
6372 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6373 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6374 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
6375 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
6376 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
6377 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6378 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
6379 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
6380 ; AVX2-NEXT: vzeroupper
6383 ; AVX512F-LABEL: vec512_v64i8_to_v1i512_factor64:
6385 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6386 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6387 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
6388 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
6389 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6390 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
6391 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
6392 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
6393 ; AVX512F-NEXT: vzeroupper
6394 ; AVX512F-NEXT: retq
6396 ; AVX512BW-LABEL: vec512_v64i8_to_v1i512_factor64:
6397 ; AVX512BW: # %bb.0:
6398 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
6399 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
6400 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0]
6401 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
6402 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6403 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6404 ; AVX512BW-NEXT: vzeroupper
6405 ; AVX512BW-NEXT: retq
6406 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6407 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6408 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6409 %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6410 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6411 %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias
6412 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
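
; Same pattern on <32 x i16>: the low 16 words are zero extended to i32 lanes (word i lands at word offset 2*i).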
6416 define void @vec512_v32i16_to_v16i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6417 ; SSE2-LABEL: vec512_v32i16_to_v16i32_factor2:
6419 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6420 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
6421 ; SSE2-NEXT: paddb (%rsi), %xmm0
6422 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
6423 ; SSE2-NEXT: pxor %xmm2, %xmm2
6424 ; SSE2-NEXT: movdqa %xmm1, %xmm3
6425 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6426 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
6427 ; SSE2-NEXT: movdqa %xmm0, %xmm4
6428 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
6429 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
6430 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
6431 ; SSE2-NEXT: paddb (%rdx), %xmm4
6432 ; SSE2-NEXT: paddb 48(%rdx), %xmm1
6433 ; SSE2-NEXT: paddb 32(%rdx), %xmm3
6434 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
6435 ; SSE2-NEXT: movdqa %xmm1, 48(%rcx)
6436 ; SSE2-NEXT: movdqa %xmm4, (%rcx)
6437 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
6440 ; SSE42-LABEL: vec512_v32i16_to_v16i32_factor2:
6442 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6443 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
6444 ; SSE42-NEXT: paddb (%rsi), %xmm0
6445 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
6446 ; SSE42-NEXT: pxor %xmm2, %xmm2
6447 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
6448 ; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
6449 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
6450 ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
6451 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
6452 ; SSE42-NEXT: paddb (%rdx), %xmm4
6453 ; SSE42-NEXT: paddb 48(%rdx), %xmm1
6454 ; SSE42-NEXT: paddb 32(%rdx), %xmm3
6455 ; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
6456 ; SSE42-NEXT: movdqa %xmm1, 48(%rcx)
6457 ; SSE42-NEXT: movdqa %xmm4, (%rcx)
6458 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
6461 ; AVX-LABEL: vec512_v32i16_to_v16i32_factor2:
6463 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6464 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
6465 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
6466 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6467 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
6468 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
6469 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
6470 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
6471 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
6472 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
6473 ; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
6474 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
6475 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
6476 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
6477 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
6478 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
6479 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
6482 ; AVX2-LABEL: vec512_v32i16_to_v16i32_factor2:
6484 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6485 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6486 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
6487 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
6488 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
6489 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6490 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
6491 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
6492 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
6493 ; AVX2-NEXT: vzeroupper
6496 ; AVX512F-LABEL: vec512_v32i16_to_v16i32_factor2:
6498 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6499 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6500 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
6501 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6502 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
6503 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6504 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
6505 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
6506 ; AVX512F-NEXT: vzeroupper
6507 ; AVX512F-NEXT: retq
6509 ; AVX512BW-LABEL: vec512_v32i16_to_v16i32_factor2:
6510 ; AVX512BW: # %bb.0:
6511 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
6512 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6513 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
6514 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6515 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6516 ; AVX512BW-NEXT: vzeroupper
6517 ; AVX512BW-NEXT: retq
6518 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6519 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6520 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6521 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6522 %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 1, i32 35, i32 2, i32 37, i32 3, i32 39, i32 4, i32 41, i32 5, i32 43, i32 6, i32 45, i32 7, i32 47, i32 8, i32 49, i32 9, i32 51, i32 10, i32 53, i32 11, i32 55, i32 12, i32 57, i32 13, i32 59, i32 14, i32 61, i32 15, i32 63>
6523 %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8>
6524 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6525 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6526 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
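
; The low 8 words are zero extended to i64 lanes (word i lands at word offset 4*i).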
6530 define void @vec512_v32i16_to_v8i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6531 ; SSE2-LABEL: vec512_v32i16_to_v8i64_factor4:
6533 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6534 ; SSE2-NEXT: paddb (%rsi), %xmm0
6535 ; SSE2-NEXT: pxor %xmm1, %xmm1
6536 ; SSE2-NEXT: movdqa %xmm0, %xmm2
6537 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
6538 ; SSE2-NEXT: movdqa %xmm2, %xmm3
6539 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
6540 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6541 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6542 ; SSE2-NEXT: movdqa %xmm0, %xmm4
6543 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6544 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6545 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
6546 ; SSE2-NEXT: paddb (%rdx), %xmm4
6547 ; SSE2-NEXT: paddb 48(%rdx), %xmm2
6548 ; SSE2-NEXT: paddb 32(%rdx), %xmm3
6549 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
6550 ; SSE2-NEXT: movdqa %xmm2, 48(%rcx)
6551 ; SSE2-NEXT: movdqa %xmm4, (%rcx)
6552 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
6555 ; SSE42-LABEL: vec512_v32i16_to_v8i64_factor4:
6557 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6558 ; SSE42-NEXT: paddb (%rsi), %xmm0
6559 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6560 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
6561 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
6562 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
6563 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
6564 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
6565 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6566 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
6567 ; SSE42-NEXT: paddb 48(%rdx), %xmm3
6568 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
6569 ; SSE42-NEXT: paddb (%rdx), %xmm1
6570 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
6571 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
6572 ; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
6573 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
6576 ; AVX-LABEL: vec512_v32i16_to_v8i64_factor4:
6578 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6579 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6580 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6581 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
6582 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
6583 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
6584 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
6585 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
6586 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6587 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
6588 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
6589 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
6590 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
6591 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
6592 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
6593 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
6594 ; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
6597 ; AVX2-LABEL: vec512_v32i16_to_v8i64_factor4:
6599 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6600 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6601 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
6602 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
6603 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
6604 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6605 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
6606 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
6607 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
6608 ; AVX2-NEXT: vzeroupper
6611 ; AVX512F-LABEL: vec512_v32i16_to_v8i64_factor4:
6613 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
6614 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6615 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
6616 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6617 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
6618 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6619 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
6620 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
6621 ; AVX512F-NEXT: vzeroupper
6622 ; AVX512F-NEXT: retq
6624 ; AVX512BW-LABEL: vec512_v32i16_to_v8i64_factor4:
6625 ; AVX512BW: # %bb.0:
6626 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
6627 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6628 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
6629 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6630 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6631 ; AVX512BW-NEXT: vzeroupper
6632 ; AVX512BW-NEXT: retq
6633 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6634 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6635 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6636 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6637 %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 1, i32 37, i32 38, i32 39, i32 2, i32 41, i32 42, i32 43, i32 3, i32 45, i32 46, i32 47, i32 4, i32 49, i32 50, i32 51, i32 5, i32 53, i32 54, i32 55, i32 6, i32 57, i32 58, i32 59, i32 7, i32 61, i32 62, i32 63>
6638 %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8>
6639 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6640 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6641 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
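
; The low 4 words are zero extended to i128 lanes (word i lands at word offset 8*i).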
6645 define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6646 ; SSE2-LABEL: vec512_v32i16_to_v4i128_factor8:
6648 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6649 ; SSE2-NEXT: paddb (%rsi), %xmm0
6650 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
6651 ; SSE2-NEXT: pand %xmm0, %xmm1
6652 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
6653 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
6654 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
6655 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6656 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6657 ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6658 ; SSE2-NEXT: paddb 16(%rdx), %xmm3
6659 ; SSE2-NEXT: paddb 48(%rdx), %xmm2
6660 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
6661 ; SSE2-NEXT: paddb (%rdx), %xmm1
6662 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
6663 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
6664 ; SSE2-NEXT: movdqa %xmm2, 48(%rcx)
6665 ; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
6668 ; SSE42-LABEL: vec512_v32i16_to_v4i128_factor8:
6670 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6671 ; SSE42-NEXT: paddb (%rsi), %xmm0
6672 ; SSE42-NEXT: pxor %xmm1, %xmm1
6673 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6674 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
6675 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
6676 ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
6677 ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6678 ; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6679 ; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6680 ; SSE42-NEXT: paddb 16(%rdx), %xmm3
6681 ; SSE42-NEXT: paddb 48(%rdx), %xmm2
6682 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
6683 ; SSE42-NEXT: paddb (%rdx), %xmm1
6684 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
6685 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
6686 ; SSE42-NEXT: movdqa %xmm2, 48(%rcx)
6687 ; SSE42-NEXT: movdqa %xmm3, 16(%rcx)
6690 ; AVX-LABEL: vec512_v32i16_to_v4i128_factor8:
6692 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6693 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6694 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
6695 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6696 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
6697 ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6698 ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
6699 ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6700 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
6701 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6702 ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
6703 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
6704 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
6705 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
6706 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
6707 ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
6708 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
6709 ; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
6712 ; AVX2-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8:
6713 ; AVX2-SLOW: # %bb.0:
6714 ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
6715 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
6716 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
6717 ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6718 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
6719 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
6720 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
6721 ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6722 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6723 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
6724 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6725 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1
6726 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
6727 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
6728 ; AVX2-SLOW-NEXT: vzeroupper
6729 ; AVX2-SLOW-NEXT: retq
6731 ; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v4i128_factor8:
6732 ; AVX2-FAST-PERLANE: # %bb.0:
6733 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
6734 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
6735 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1
6736 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6737 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
6738 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
6739 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
6740 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6741 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
6742 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6743 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1
6744 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
6745 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
6746 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
6747 ; AVX2-FAST-PERLANE-NEXT: retq
6749 ; AVX2-FAST-LABEL: vec512_v32i16_to_v4i128_factor8:
6750 ; AVX2-FAST: # %bb.0:
6751 ; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0
6752 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
6753 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
6754 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
6755 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
6756 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
6757 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
6758 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6759 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
6760 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6761 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1
6762 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
6763 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
6764 ; AVX2-FAST-NEXT: vzeroupper
6765 ; AVX2-FAST-NEXT: retq
6767 ; AVX512F-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8:
6768 ; AVX512F-SLOW: # %bb.0:
6769 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
6770 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6771 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6772 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6773 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
6774 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
6775 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
6776 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6777 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
6778 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
6779 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6780 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
6781 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
6782 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
6783 ; AVX512F-SLOW-NEXT: vzeroupper
6784 ; AVX512F-SLOW-NEXT: retq
6786 ; AVX512F-FAST-LABEL: vec512_v32i16_to_v4i128_factor8:
6787 ; AVX512F-FAST: # %bb.0:
6788 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
6789 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6790 ; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
6791 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
6792 ; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
6793 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
6794 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
6795 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
6796 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
6797 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6798 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
6799 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx)
6800 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
6801 ; AVX512F-FAST-NEXT: vzeroupper
6802 ; AVX512F-FAST-NEXT: retq
6804 ; AVX512BW-LABEL: vec512_v32i16_to_v4i128_factor8:
6805 ; AVX512BW: # %bb.0:
6806 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
6807 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
6808 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,17,18,19,20,21,22,23,35,25,26,27,28,29,30,31]
6809 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
6810 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2
6811 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
6812 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6813 ; AVX512BW-NEXT: vzeroupper
6814 ; AVX512BW-NEXT: retq
6815 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6816 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6817 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6818 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6819 %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 1, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 2, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 3, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
6820 %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8>
6821 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6822 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
6823 store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
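
; The low 2 words are zero extended to i256 lanes: word 0 stays put and word 1 moves to word offset 16.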
6827 define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6828 ; SSE2-LABEL: vec512_v32i16_to_v2i256_factor16:
6830 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6831 ; SSE2-NEXT: paddb (%rsi), %xmm0
6832 ; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0]
6833 ; SSE2-NEXT: pand %xmm0, %xmm1
6834 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6835 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6836 ; SSE2-NEXT: movaps 16(%rdx), %xmm2
6837 ; SSE2-NEXT: movaps 48(%rdx), %xmm3
6838 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
6839 ; SSE2-NEXT: paddb (%rdx), %xmm1
6840 ; SSE2-NEXT: movaps %xmm3, 48(%rcx)
6841 ; SSE2-NEXT: movaps %xmm2, 16(%rcx)
6842 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
6843 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
6846 ; SSE42-LABEL: vec512_v32i16_to_v2i256_factor16:
6848 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6849 ; SSE42-NEXT: paddb (%rsi), %xmm0
6850 ; SSE42-NEXT: pxor %xmm1, %xmm1
6851 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6852 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6853 ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6854 ; SSE42-NEXT: movaps 16(%rdx), %xmm2
6855 ; SSE42-NEXT: movaps 48(%rdx), %xmm3
6856 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
6857 ; SSE42-NEXT: paddb (%rdx), %xmm1
6858 ; SSE42-NEXT: movaps %xmm3, 48(%rcx)
6859 ; SSE42-NEXT: movaps %xmm2, 16(%rcx)
6860 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
6861 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
6864 ; AVX-LABEL: vec512_v32i16_to_v2i256_factor16:
6866 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6867 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6868 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
6869 ; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6870 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
6871 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
6872 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
6873 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6874 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
6875 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3
6876 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
6877 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
6878 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6879 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
6882 ; AVX2-LABEL: vec512_v32i16_to_v2i256_factor16:
6884 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6885 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6886 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
6887 ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1
6888 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6889 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6890 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
6891 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
6892 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
6893 ; AVX2-NEXT: vzeroupper
6896 ; AVX512F-LABEL: vec512_v32i16_to_v2i256_factor16:
6898 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6899 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6900 ; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
6901 ; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm1
6902 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6903 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
6904 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
6905 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
6906 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
6907 ; AVX512F-NEXT: vzeroupper
6908 ; AVX512F-NEXT: retq
6910 ; AVX512BW-LABEL: vec512_v32i16_to_v2i256_factor16:
6911 ; AVX512BW: # %bb.0:
6912 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
6913 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6914 ; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
6915 ; AVX512BW-NEXT: vpand %ymm0, %ymm1, %ymm1
6916 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6917 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
6918 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
6919 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
6920 ; AVX512BW-NEXT: vzeroupper
6921 ; AVX512BW-NEXT: retq
6922 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
6923 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
6924 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
6925 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
6926 %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 1, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
6927 %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8>
6928 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
6929 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
6934 define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
6935 ; SSE2-LABEL: vec512_v32i16_to_v1i512_factor32:
6937 ; SSE2-NEXT: movdqa (%rdi), %xmm0
6938 ; SSE2-NEXT: paddb (%rsi), %xmm0
6939 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6940 ; SSE2-NEXT: movaps 16(%rdx), %xmm1
6941 ; SSE2-NEXT: movaps 32(%rdx), %xmm2
6942 ; SSE2-NEXT: movaps 48(%rdx), %xmm3
6943 ; SSE2-NEXT: paddb (%rdx), %xmm0
6944 ; SSE2-NEXT: movaps %xmm2, 32(%rcx)
6945 ; SSE2-NEXT: movaps %xmm3, 48(%rcx)
6946 ; SSE2-NEXT: movaps %xmm1, 16(%rcx)
6947 ; SSE2-NEXT: movdqa %xmm0, (%rcx)
6950 ; SSE42-LABEL: vec512_v32i16_to_v1i512_factor32:
6952 ; SSE42-NEXT: movdqa (%rdi), %xmm0
6953 ; SSE42-NEXT: paddb (%rsi), %xmm0
6954 ; SSE42-NEXT: pxor %xmm1, %xmm1
6955 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
6956 ; SSE42-NEXT: movaps 16(%rdx), %xmm0
6957 ; SSE42-NEXT: movaps 32(%rdx), %xmm2
6958 ; SSE42-NEXT: movaps 48(%rdx), %xmm3
6959 ; SSE42-NEXT: paddb (%rdx), %xmm1
6960 ; SSE42-NEXT: movaps %xmm2, 32(%rcx)
6961 ; SSE42-NEXT: movaps %xmm3, 48(%rcx)
6962 ; SSE42-NEXT: movaps %xmm0, 16(%rcx)
6963 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
6966 ; AVX-LABEL: vec512_v32i16_to_v1i512_factor32:
6968 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
6969 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
6970 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
6971 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
6972 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
6973 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
6974 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
6975 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
6976 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
6977 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
6978 ; AVX-NEXT: vzeroupper
6981 ; AVX2-LABEL: vec512_v32i16_to_v1i512_factor32:
6983 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
6984 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6985 ; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
6986 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
6987 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
6988 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
6989 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
6990 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
6991 ; AVX2-NEXT: vzeroupper
6994 ; AVX512F-LABEL: vec512_v32i16_to_v1i512_factor32:
6996 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
6997 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
6998 ; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
6999 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
7000 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7001 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
7002 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
7003 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
7004 ; AVX512F-NEXT: vzeroupper
7005 ; AVX512F-NEXT: retq
7007 ; AVX512BW-LABEL: vec512_v32i16_to_v1i512_factor32:
7008 ; AVX512BW: # %bb.0:
7009 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
7010 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
7011 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
7012 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7013 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
7014 ; AVX512BW-NEXT: vzeroupper
7015 ; AVX512BW-NEXT: retq
7016 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7017 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7018 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7019 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
7020 %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
7021 %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8>
7022 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7023 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
7028 define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7029 ; SSE2-LABEL: vec512_v16i32_to_v8i64_factor2:
7031 ; SSE2-NEXT: movdqa (%rdi), %xmm0
7032 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
7033 ; SSE2-NEXT: paddb (%rsi), %xmm0
7034 ; SSE2-NEXT: paddb 16(%rsi), %xmm1
7035 ; SSE2-NEXT: pxor %xmm2, %xmm2
7036 ; SSE2-NEXT: movdqa %xmm1, %xmm3
7037 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
7038 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
7039 ; SSE2-NEXT: movdqa %xmm0, %xmm4
7040 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
7041 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7042 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
7043 ; SSE2-NEXT: paddb (%rdx), %xmm4
7044 ; SSE2-NEXT: paddb 48(%rdx), %xmm1
7045 ; SSE2-NEXT: paddb 32(%rdx), %xmm3
7046 ; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
7047 ; SSE2-NEXT: movdqa %xmm1, 48(%rcx)
7048 ; SSE2-NEXT: movdqa %xmm4, (%rcx)
7049 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
7052 ; SSE42-LABEL: vec512_v16i32_to_v8i64_factor2:
7054 ; SSE42-NEXT: movdqa (%rdi), %xmm0
7055 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
7056 ; SSE42-NEXT: paddb (%rsi), %xmm0
7057 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
7058 ; SSE42-NEXT: pxor %xmm2, %xmm2
7059 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
7060 ; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
7061 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
7062 ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7063 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
7064 ; SSE42-NEXT: paddb (%rdx), %xmm4
7065 ; SSE42-NEXT: paddb 48(%rdx), %xmm1
7066 ; SSE42-NEXT: paddb 32(%rdx), %xmm3
7067 ; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
7068 ; SSE42-NEXT: movdqa %xmm1, 48(%rcx)
7069 ; SSE42-NEXT: movdqa %xmm4, (%rcx)
7070 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
7073 ; AVX-LABEL: vec512_v16i32_to_v8i64_factor2:
7075 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
7076 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
7077 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
7078 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7079 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
7080 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
7081 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
7082 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
7083 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
7084 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
7085 ; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
7086 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
7087 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
7088 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
7089 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
7090 ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
7091 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
7094 ; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2:
7096 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
7097 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7098 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
7099 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
7100 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
7101 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7102 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
7103 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
7104 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
7105 ; AVX2-NEXT: vzeroupper
7108 ; AVX512F-LABEL: vec512_v16i32_to_v8i64_factor2:
7110 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
7111 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7112 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
7113 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7114 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
7115 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7116 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
7117 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
7118 ; AVX512F-NEXT: vzeroupper
7119 ; AVX512F-NEXT: retq
7121 ; AVX512BW-LABEL: vec512_v16i32_to_v8i64_factor2:
7122 ; AVX512BW: # %bb.0:
7123 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
7124 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7125 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
7126 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7127 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
7128 ; AVX512BW-NEXT: vzeroupper
7129 ; AVX512BW-NEXT: retq
7130 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7131 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7132 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7133 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
7134 %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
7135 %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8>
7136 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7137 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
7142 define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7143 ; SSE2-LABEL: vec512_v16i32_to_v4i128_factor4:
7145 ; SSE2-NEXT: movdqa (%rdi), %xmm0
7146 ; SSE2-NEXT: paddb (%rsi), %xmm0
7147 ; SSE2-NEXT: xorps %xmm1, %xmm1
7148 ; SSE2-NEXT: movdqa %xmm0, %xmm2
7149 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7150 ; SSE2-NEXT: xorps %xmm3, %xmm3
7151 ; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
7152 ; SSE2-NEXT: movdqa %xmm0, %xmm4
7153 ; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
7154 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3]
7155 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
7156 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
7157 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
7158 ; SSE2-NEXT: paddb 32(%rdx), %xmm4
7159 ; SSE2-NEXT: paddb (%rdx), %xmm3
7160 ; SSE2-NEXT: paddb 48(%rdx), %xmm2
7161 ; SSE2-NEXT: movdqa %xmm2, 48(%rcx)
7162 ; SSE2-NEXT: movdqa %xmm3, (%rcx)
7163 ; SSE2-NEXT: movdqa %xmm4, 32(%rcx)
7164 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
7167 ; SSE42-LABEL: vec512_v16i32_to_v4i128_factor4:
7169 ; SSE42-NEXT: movdqa (%rdi), %xmm0
7170 ; SSE42-NEXT: paddb (%rsi), %xmm0
7171 ; SSE42-NEXT: pxor %xmm1, %xmm1
7172 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7173 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
7174 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
7175 ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7176 ; SSE42-NEXT: pxor %xmm4, %xmm4
7177 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7]
7178 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
7179 ; SSE42-NEXT: paddb 16(%rdx), %xmm3
7180 ; SSE42-NEXT: paddb 32(%rdx), %xmm2
7181 ; SSE42-NEXT: paddb (%rdx), %xmm1
7182 ; SSE42-NEXT: paddb 48(%rdx), %xmm0
7183 ; SSE42-NEXT: movdqa %xmm0, 48(%rcx)
7184 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
7185 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
7186 ; SSE42-NEXT: movdqa %xmm3, 16(%rcx)
7189 ; AVX-LABEL: vec512_v16i32_to_v4i128_factor4:
7191 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
7192 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7193 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
7194 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
7195 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
7196 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
7197 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
7198 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
7199 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
7200 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
7201 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
7202 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
7203 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
7204 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
7205 ; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
7206 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
7207 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
7208 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
7209 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
7210 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
7211 ; AVX-NEXT: vzeroupper
7214 ; AVX2-SLOW-LABEL: vec512_v16i32_to_v4i128_factor4:
7215 ; AVX2-SLOW: # %bb.0:
7216 ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
7217 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
7218 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
7219 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
7220 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
7221 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
7222 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
7223 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
7224 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
7225 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7226 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1
7227 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
7228 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
7229 ; AVX2-SLOW-NEXT: vzeroupper
7230 ; AVX2-SLOW-NEXT: retq
7232 ; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v4i128_factor4:
7233 ; AVX2-FAST-PERLANE: # %bb.0:
7234 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
7235 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
7236 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1
7237 ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
7238 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
7239 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
7240 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
7241 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
7242 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
7243 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7244 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1
7245 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
7246 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
7247 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
7248 ; AVX2-FAST-PERLANE-NEXT: retq
7250 ; AVX2-FAST-LABEL: vec512_v16i32_to_v4i128_factor4:
7251 ; AVX2-FAST: # %bb.0:
7252 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
7253 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7254 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
7255 ; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0]
7256 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2
7257 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
7258 ; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,3,0]
7259 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
7260 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
7261 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7262 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1
7263 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
7264 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
7265 ; AVX2-FAST-NEXT: vzeroupper
7266 ; AVX2-FAST-NEXT: retq
7268 ; AVX512F-LABEL: vec512_v16i32_to_v4i128_factor4:
7270 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
7271 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7272 ; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111
7273 ; AVX512F-NEXT: kmovw %eax, %k1
7274 ; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
7275 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7276 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
7277 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7278 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
7279 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
7280 ; AVX512F-NEXT: vzeroupper
7281 ; AVX512F-NEXT: retq
7283 ; AVX512BW-LABEL: vec512_v16i32_to_v4i128_factor4:
7284 ; AVX512BW: # %bb.0:
7285 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
7286 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7287 ; AVX512BW-NEXT: movb $17, %al
7288 ; AVX512BW-NEXT: kmovd %eax, %k1
7289 ; AVX512BW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
7290 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
7291 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15]
7292 ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
7293 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0
7294 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7295 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
7296 ; AVX512BW-NEXT: vzeroupper
7297 ; AVX512BW-NEXT: retq
7298 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7299 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7300 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7301 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
7302 %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
7303 %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8>
7304 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7305 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
7310 define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7311 ; SSE2-LABEL: vec512_v16i32_to_v2i256_factor8:
7313 ; SSE2-NEXT: movdqa (%rdi), %xmm0
7314 ; SSE2-NEXT: paddb (%rsi), %xmm0
7315 ; SSE2-NEXT: xorps %xmm1, %xmm1
7316 ; SSE2-NEXT: xorps %xmm2, %xmm2
7317 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
7318 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
7319 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
7320 ; SSE2-NEXT: movaps 16(%rdx), %xmm1
7321 ; SSE2-NEXT: movaps 48(%rdx), %xmm3
7322 ; SSE2-NEXT: paddb 32(%rdx), %xmm0
7323 ; SSE2-NEXT: paddb (%rdx), %xmm2
7324 ; SSE2-NEXT: movaps %xmm3, 48(%rcx)
7325 ; SSE2-NEXT: movaps %xmm1, 16(%rcx)
7326 ; SSE2-NEXT: movdqa %xmm2, (%rcx)
7327 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
7330 ; SSE42-LABEL: vec512_v16i32_to_v2i256_factor8:
7332 ; SSE42-NEXT: movdqa (%rdi), %xmm0
7333 ; SSE42-NEXT: paddb (%rsi), %xmm0
7334 ; SSE42-NEXT: pxor %xmm1, %xmm1
7335 ; SSE42-NEXT: pxor %xmm2, %xmm2
7336 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
7337 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
7338 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7339 ; SSE42-NEXT: movaps 16(%rdx), %xmm1
7340 ; SSE42-NEXT: movaps 48(%rdx), %xmm3
7341 ; SSE42-NEXT: paddb 32(%rdx), %xmm0
7342 ; SSE42-NEXT: paddb (%rdx), %xmm2
7343 ; SSE42-NEXT: movaps %xmm3, 48(%rcx)
7344 ; SSE42-NEXT: movaps %xmm1, 16(%rcx)
7345 ; SSE42-NEXT: movdqa %xmm2, (%rcx)
7346 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
7349 ; AVX-LABEL: vec512_v16i32_to_v2i256_factor8:
7351 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
7352 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7353 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
7354 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
7355 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
7356 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
7357 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
7358 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
7359 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
7360 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3
7361 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
7362 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
7363 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
7364 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
7367 ; AVX2-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8:
7368 ; AVX2-SLOW: # %bb.0:
7369 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
7370 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7371 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
7372 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7373 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
7374 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7375 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7376 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1
7377 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
7378 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
7379 ; AVX2-SLOW-NEXT: vzeroupper
7380 ; AVX2-SLOW-NEXT: retq
7382 ; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v2i256_factor8:
7383 ; AVX2-FAST-PERLANE: # %bb.0:
7384 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
7385 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7386 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1
7387 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7388 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7389 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7390 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
7391 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
7392 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
7393 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
7394 ; AVX2-FAST-PERLANE-NEXT: retq
7396 ; AVX2-FAST-LABEL: vec512_v16i32_to_v2i256_factor8:
7397 ; AVX2-FAST: # %bb.0:
7398 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
7399 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7400 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
7401 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7402 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7403 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7404 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
7405 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
7406 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
7407 ; AVX2-FAST-NEXT: vzeroupper
7408 ; AVX2-FAST-NEXT: retq
7410 ; AVX512F-LABEL: vec512_v16i32_to_v2i256_factor8:
7412 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
7413 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7414 ; AVX512F-NEXT: movw $257, %ax # imm = 0x101
7415 ; AVX512F-NEXT: kmovw %eax, %k1
7416 ; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
7417 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7418 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
7419 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7420 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
7421 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
7422 ; AVX512F-NEXT: vzeroupper
7423 ; AVX512F-NEXT: retq
7425 ; AVX512BW-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8:
7426 ; AVX512BW-SLOW: # %bb.0:
7427 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
7428 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7429 ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
7430 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7431 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
7432 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7433 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
7434 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7435 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
7436 ; AVX512BW-SLOW-NEXT: vzeroupper
7437 ; AVX512BW-SLOW-NEXT: retq
7439 ; AVX512BW-FAST-LABEL: vec512_v16i32_to_v2i256_factor8:
7440 ; AVX512BW-FAST: # %bb.0:
7441 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
7442 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7443 ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
7444 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7445 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7446 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7447 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7448 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
7449 ; AVX512BW-FAST-NEXT: vzeroupper
7450 ; AVX512BW-FAST-NEXT: retq
7451 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7452 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7453 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7454 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
7455 %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
7456 %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8>
7457 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7458 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
7463 define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7464 ; SSE2-LABEL: vec512_v16i32_to_v1i512_factor16:
7466 ; SSE2-NEXT: movdqa (%rdi), %xmm0
7467 ; SSE2-NEXT: paddb (%rsi), %xmm0
7468 ; SSE2-NEXT: xorps %xmm1, %xmm1
7469 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
7470 ; SSE2-NEXT: movaps 16(%rdx), %xmm0
7471 ; SSE2-NEXT: movaps 32(%rdx), %xmm2
7472 ; SSE2-NEXT: movaps 48(%rdx), %xmm3
7473 ; SSE2-NEXT: paddb (%rdx), %xmm1
7474 ; SSE2-NEXT: movaps %xmm2, 32(%rcx)
7475 ; SSE2-NEXT: movaps %xmm3, 48(%rcx)
7476 ; SSE2-NEXT: movaps %xmm0, 16(%rcx)
7477 ; SSE2-NEXT: movdqa %xmm1, (%rcx)
7480 ; SSE42-LABEL: vec512_v16i32_to_v1i512_factor16:
7482 ; SSE42-NEXT: movdqa (%rdi), %xmm0
7483 ; SSE42-NEXT: paddb (%rsi), %xmm0
7484 ; SSE42-NEXT: pxor %xmm1, %xmm1
7485 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7486 ; SSE42-NEXT: movaps 16(%rdx), %xmm0
7487 ; SSE42-NEXT: movaps 32(%rdx), %xmm2
7488 ; SSE42-NEXT: movaps 48(%rdx), %xmm3
7489 ; SSE42-NEXT: paddb (%rdx), %xmm1
7490 ; SSE42-NEXT: movaps %xmm2, 32(%rcx)
7491 ; SSE42-NEXT: movaps %xmm3, 48(%rcx)
7492 ; SSE42-NEXT: movaps %xmm0, 16(%rcx)
7493 ; SSE42-NEXT: movdqa %xmm1, (%rcx)
7496 ; AVX-LABEL: vec512_v16i32_to_v1i512_factor16:
7498 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
7499 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7500 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
7501 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
7502 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
7503 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
7504 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
7505 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
7506 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
7507 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
7508 ; AVX-NEXT: vzeroupper
7511 ; AVX2-LABEL: vec512_v16i32_to_v1i512_factor16:
7513 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
7514 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7515 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
7516 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7517 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
7518 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7519 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
7520 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
7521 ; AVX2-NEXT: vzeroupper
7524 ; AVX512F-LABEL: vec512_v16i32_to_v1i512_factor16:
7526 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
7527 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7528 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
7529 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7530 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7531 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
7532 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
7533 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
7534 ; AVX512F-NEXT: vzeroupper
7535 ; AVX512F-NEXT: retq
7537 ; AVX512BW-LABEL: vec512_v16i32_to_v1i512_factor16:
7538 ; AVX512BW: # %bb.0:
7539 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
7540 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
7541 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
7542 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
7543 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7544 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
7545 ; AVX512BW-NEXT: vzeroupper
7546 ; AVX512BW-NEXT: retq
7547 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7548 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7549 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7550 %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32>
7551 %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
7552 %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8>
7553 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7554 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
7559 define void @vec512_v8i64_to_v4i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7560 ; SSE-LABEL: vec512_v8i64_to_v4i128_factor2:
7562 ; SSE-NEXT: movdqa (%rdi), %xmm0
7563 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
7564 ; SSE-NEXT: paddb (%rsi), %xmm0
7565 ; SSE-NEXT: paddb 16(%rsi), %xmm1
7566 ; SSE-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero
7567 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
7568 ; SSE-NEXT: movq {{.*#+}} xmm3 = xmm0[0],zero
7569 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
7570 ; SSE-NEXT: paddb 16(%rdx), %xmm0
7571 ; SSE-NEXT: paddb (%rdx), %xmm3
7572 ; SSE-NEXT: paddb 48(%rdx), %xmm1
7573 ; SSE-NEXT: paddb 32(%rdx), %xmm2
7574 ; SSE-NEXT: movdqa %xmm2, 32(%rcx)
7575 ; SSE-NEXT: movdqa %xmm1, 48(%rcx)
7576 ; SSE-NEXT: movdqa %xmm3, (%rcx)
7577 ; SSE-NEXT: movdqa %xmm0, 16(%rcx)
7580 ; AVX-LABEL: vec512_v8i64_to_v4i128_factor2:
7582 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
7583 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
7584 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
7585 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7586 ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
7587 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
7588 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3]
7589 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
7590 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3]
7591 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
7592 ; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2
7593 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
7594 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
7595 ; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
7596 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
7597 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
7598 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
7599 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
7600 ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
7601 ; AVX-NEXT: vzeroupper
7604 ; AVX2-LABEL: vec512_v8i64_to_v4i128_factor2:
7606 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
7607 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7608 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
7609 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3]
7610 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
7611 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
7612 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
7613 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7614 ; AVX2-NEXT: vpaddb (%rdx), %ymm2, %ymm1
7615 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
7616 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
7617 ; AVX2-NEXT: vzeroupper
7620 ; AVX512F-LABEL: vec512_v8i64_to_v4i128_factor2:
7622 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
7623 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7624 ; AVX512F-NEXT: movb $85, %al
7625 ; AVX512F-NEXT: kmovw %eax, %k1
7626 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
7627 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7628 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
7629 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7630 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
7631 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
7632 ; AVX512F-NEXT: vzeroupper
7633 ; AVX512F-NEXT: retq
7635 ; AVX512BW-SLOW-LABEL: vec512_v8i64_to_v4i128_factor2:
7636 ; AVX512BW-SLOW: # %bb.0:
7637 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
7638 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7639 ; AVX512BW-SLOW-NEXT: movb $5, %al
7640 ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1
7641 ; AVX512BW-SLOW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
7642 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
7643 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
7644 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
7645 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7646 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7647 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
7648 ; AVX512BW-SLOW-NEXT: vzeroupper
7649 ; AVX512BW-SLOW-NEXT: retq
7651 ; AVX512BW-FAST-LABEL: vec512_v8i64_to_v4i128_factor2:
7652 ; AVX512BW-FAST: # %bb.0:
7653 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
7654 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7655 ; AVX512BW-FAST-NEXT: movb $5, %al
7656 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1
7657 ; AVX512BW-FAST-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
7658 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
7659 ; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,5,3,7]
7660 ; AVX512BW-FAST-NEXT: vpermi2q %ymm2, %ymm0, %ymm3
7661 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0
7662 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7663 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
7664 ; AVX512BW-FAST-NEXT: vzeroupper
7665 ; AVX512BW-FAST-NEXT: retq
7666 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7667 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7668 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7669 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
7670 %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
7671 %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8>
7672 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7673 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
7678 define void @vec512_v8i64_to_v2i256_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7679 ; SSE-LABEL: vec512_v8i64_to_v2i256_factor4:
7681 ; SSE-NEXT: movdqa (%rdi), %xmm0
7682 ; SSE-NEXT: paddb (%rsi), %xmm0
7683 ; SSE-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
7684 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
7685 ; SSE-NEXT: movaps 16(%rdx), %xmm2
7686 ; SSE-NEXT: movaps 48(%rdx), %xmm3
7687 ; SSE-NEXT: paddb (%rdx), %xmm1
7688 ; SSE-NEXT: paddb 32(%rdx), %xmm0
7689 ; SSE-NEXT: movaps %xmm3, 48(%rcx)
7690 ; SSE-NEXT: movaps %xmm2, 16(%rcx)
7691 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
7692 ; SSE-NEXT: movdqa %xmm1, (%rcx)
7695 ; AVX-LABEL: vec512_v8i64_to_v2i256_factor4:
7697 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
7698 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7699 ; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
7700 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
7701 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
7702 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
7703 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
7704 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3
7705 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
7706 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
7707 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
7708 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
7711 ; AVX2-LABEL: vec512_v8i64_to_v2i256_factor4:
7713 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
7714 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7715 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero
7716 ; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
7717 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
7718 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
7719 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
7720 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
7721 ; AVX2-NEXT: vzeroupper
7724 ; AVX512F-LABEL: vec512_v8i64_to_v2i256_factor4:
7726 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
7727 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7728 ; AVX512F-NEXT: movb $17, %al
7729 ; AVX512F-NEXT: kmovw %eax, %k1
7730 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
7731 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7732 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
7733 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7734 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
7735 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
7736 ; AVX512F-NEXT: vzeroupper
7737 ; AVX512F-NEXT: retq
7739 ; AVX512BW-LABEL: vec512_v8i64_to_v2i256_factor4:
7740 ; AVX512BW: # %bb.0:
7741 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
7742 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7743 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero
7744 ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
7745 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7746 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7747 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
7748 ; AVX512BW-NEXT: vzeroupper
7749 ; AVX512BW-NEXT: retq
7750 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7751 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7752 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7753 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
7754 %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
7755 %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8>
7756 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7757 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
7762 define void @vec512_v8i64_to_v1i512_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7763 ; SSE-LABEL: vec512_v8i64_to_v1i512_factor8:
7765 ; SSE-NEXT: movdqa (%rdi), %xmm0
7766 ; SSE-NEXT: paddb (%rsi), %xmm0
7767 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
7768 ; SSE-NEXT: movaps 16(%rdx), %xmm1
7769 ; SSE-NEXT: movaps 32(%rdx), %xmm2
7770 ; SSE-NEXT: movaps 48(%rdx), %xmm3
7771 ; SSE-NEXT: paddb (%rdx), %xmm0
7772 ; SSE-NEXT: movaps %xmm2, 32(%rcx)
7773 ; SSE-NEXT: movaps %xmm3, 48(%rcx)
7774 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
7775 ; SSE-NEXT: movdqa %xmm0, (%rcx)
7778 ; AVX-LABEL: vec512_v8i64_to_v1i512_factor8:
7780 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
7781 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7782 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
7783 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
7784 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
7785 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
7786 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
7787 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
7788 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
7789 ; AVX-NEXT: vzeroupper
7792 ; AVX2-LABEL: vec512_v8i64_to_v1i512_factor8:
7794 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
7795 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7796 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
7797 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
7798 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7799 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
7800 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
7801 ; AVX2-NEXT: vzeroupper
7804 ; AVX512F-LABEL: vec512_v8i64_to_v1i512_factor8:
7806 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
7807 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7808 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
7809 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7810 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
7811 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
7812 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
7813 ; AVX512F-NEXT: vzeroupper
7814 ; AVX512F-NEXT: retq
7816 ; AVX512BW-LABEL: vec512_v8i64_to_v1i512_factor8:
7817 ; AVX512BW: # %bb.0:
7818 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
7819 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
7820 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
7821 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7822 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
7823 ; AVX512BW-NEXT: vzeroupper
7824 ; AVX512BW-NEXT: retq
7825 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7826 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7827 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7828 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64>
7829 %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
7830 %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8>
7831 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7832 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
7837 define void @vec512_v4i128_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7838 ; SSE-LABEL: vec512_v4i128_to_v2i256_factor2:
7840 ; SSE-NEXT: movdqa (%rdi), %xmm0
7841 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
7842 ; SSE-NEXT: paddb 16(%rsi), %xmm1
7843 ; SSE-NEXT: paddb (%rsi), %xmm0
7844 ; SSE-NEXT: movaps 16(%rdx), %xmm2
7845 ; SSE-NEXT: movaps 48(%rdx), %xmm3
7846 ; SSE-NEXT: paddb (%rdx), %xmm0
7847 ; SSE-NEXT: paddb 32(%rdx), %xmm1
7848 ; SSE-NEXT: movaps %xmm3, 48(%rcx)
7849 ; SSE-NEXT: movaps %xmm2, 16(%rcx)
7850 ; SSE-NEXT: movdqa %xmm1, 32(%rcx)
7851 ; SSE-NEXT: movdqa %xmm0, (%rcx)
7854 ; AVX-LABEL: vec512_v4i128_to_v2i256_factor2:
7856 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
7857 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
7858 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7859 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
7860 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
7861 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
7862 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
7863 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3
7864 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx)
7865 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
7866 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
7867 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
7870 ; AVX2-LABEL: vec512_v4i128_to_v2i256_factor2:
7872 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
7873 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7874 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
7875 ; AVX2-NEXT: vmovdqa %xmm0, %xmm0
7876 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7877 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
7878 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
7879 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
7880 ; AVX2-NEXT: vzeroupper
7883 ; AVX512F-LABEL: vec512_v4i128_to_v2i256_factor2:
7885 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
7886 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7887 ; AVX512F-NEXT: movb $51, %al
7888 ; AVX512F-NEXT: kmovw %eax, %k1
7889 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
7890 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7891 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
7892 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7893 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
7894 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
7895 ; AVX512F-NEXT: vzeroupper
7896 ; AVX512F-NEXT: retq
7898 ; AVX512BW-LABEL: vec512_v4i128_to_v2i256_factor2:
7899 ; AVX512BW: # %bb.0:
7900 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
7901 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
7902 ; AVX512BW-NEXT: vmovdqa %xmm0, %xmm1
7903 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
7904 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7905 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7906 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
7907 ; AVX512BW-NEXT: vzeroupper
7908 ; AVX512BW-NEXT: retq
7909 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7910 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7911 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7912 %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128>
7913 %zextd.vec = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
7914 %out.bytevec = bitcast <4 x i128> %zextd.vec to <64 x i8>
7915 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7916 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
7921 define void @vec512_v4i128_to_v1i512_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7922 ; SSE-LABEL: vec512_v4i128_to_v1i512_factor4:
7924 ; SSE-NEXT: movdqa (%rdi), %xmm0
7925 ; SSE-NEXT: paddb (%rsi), %xmm0
7926 ; SSE-NEXT: movaps 16(%rdx), %xmm1
7927 ; SSE-NEXT: movaps 32(%rdx), %xmm2
7928 ; SSE-NEXT: movaps 48(%rdx), %xmm3
7929 ; SSE-NEXT: paddb (%rdx), %xmm0
7930 ; SSE-NEXT: movaps %xmm2, 32(%rcx)
7931 ; SSE-NEXT: movaps %xmm3, 48(%rcx)
7932 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
7933 ; SSE-NEXT: movdqa %xmm0, (%rcx)
7936 ; AVX-LABEL: vec512_v4i128_to_v1i512_factor4:
7938 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
7939 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7940 ; AVX-NEXT: vmovaps 32(%rdx), %ymm1
7941 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
7942 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2
7943 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx)
7944 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
7945 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
7946 ; AVX-NEXT: vzeroupper
7949 ; AVX2-LABEL: vec512_v4i128_to_v1i512_factor4:
7951 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
7952 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7953 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
7954 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7955 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
7956 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
7957 ; AVX2-NEXT: vzeroupper
7960 ; AVX512F-LABEL: vec512_v4i128_to_v1i512_factor4:
7962 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
7963 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7964 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
7965 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
7966 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
7967 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
7968 ; AVX512F-NEXT: vzeroupper
7969 ; AVX512F-NEXT: retq
7971 ; AVX512BW-LABEL: vec512_v4i128_to_v1i512_factor4:
7972 ; AVX512BW: # %bb.0:
7973 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
7974 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
7975 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
7976 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
7977 ; AVX512BW-NEXT: vzeroupper
7978 ; AVX512BW-NEXT: retq
7979 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
7980 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
7981 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
7982 %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128>
7983 %zextd.vec = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
7984 %out.bytevec = bitcast <4 x i128> %zextd.vec to <64 x i8>
7985 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
7986 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
7991 define void @vec512_v2i256_to_v1i512_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
7992 ; SSE-LABEL: vec512_v2i256_to_v1i512_factor2:
7994 ; SSE-NEXT: movdqa (%rdi), %xmm0
7995 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
7996 ; SSE-NEXT: paddb (%rsi), %xmm0
7997 ; SSE-NEXT: paddb 16(%rsi), %xmm1
7998 ; SSE-NEXT: movaps 32(%rdx), %xmm2
7999 ; SSE-NEXT: movaps 48(%rdx), %xmm3
8000 ; SSE-NEXT: paddb 16(%rdx), %xmm1
8001 ; SSE-NEXT: paddb (%rdx), %xmm0
8002 ; SSE-NEXT: movaps %xmm2, 32(%rcx)
8003 ; SSE-NEXT: movaps %xmm3, 48(%rcx)
8004 ; SSE-NEXT: movdqa %xmm0, (%rcx)
8005 ; SSE-NEXT: movdqa %xmm1, 16(%rcx)
8008 ; AVX-LABEL: vec512_v2i256_to_v1i512_factor2:
8010 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
8011 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
8012 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
8013 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
8014 ; AVX-NEXT: vmovaps 32(%rdx), %ymm2
8015 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
8016 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
8017 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
8018 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
8019 ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
8020 ; AVX-NEXT: vzeroupper
8023 ; AVX2-LABEL: vec512_v2i256_to_v1i512_factor2:
8025 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
8026 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
8027 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
8028 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
8029 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
8030 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
8031 ; AVX2-NEXT: vzeroupper
8034 ; AVX512F-LABEL: vec512_v2i256_to_v1i512_factor2:
8036 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
8037 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
8038 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
8039 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
8040 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
8041 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
8042 ; AVX512F-NEXT: vzeroupper
8043 ; AVX512F-NEXT: retq
8045 ; AVX512BW-LABEL: vec512_v2i256_to_v1i512_factor2:
8046 ; AVX512BW: # %bb.0:
8047 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
8048 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
8049 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
8050 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
8051 ; AVX512BW-NEXT: vzeroupper
8052 ; AVX512BW-NEXT: retq
8053 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
8054 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
8055 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
8056 %in.vec.cast = bitcast <64 x i8> %in.vec to <2 x i256>
8057 %zextd.vec = shufflevector <2 x i256> %in.vec.cast, <2 x i256> zeroinitializer, <2 x i32> <i32 0, i32 3>
8058 %out.bytevec = bitcast <2 x i256> %zextd.vec to <64 x i8>
8059 %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64
8060 %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias
  store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64
  ret void
}
8064 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: